From b1e37eeb864537ffef00e218dcc2bde431acfa46 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 26 Mar 2026 22:31:46 +0100 Subject: [PATCH 1/2] Implement tail loading support in backends - Support negative offsets in BackendBase.load to load from the end of a file. - Implement negative offset support in PosixFS, S3, Rclone, REST, and Sftp backends. - Create borgstore.backends._utils for shared HTTP Range header logic. - Update REST server to handle suffix-byte-range requests. - Add tests for tail loading and range header utilities. --- src/borgstore/backends/_base.py | 6 +++- src/borgstore/backends/_utils.py | 59 +++++++++++++++++++++++++++++++ src/borgstore/backends/posixfs.py | 4 +-- src/borgstore/backends/rclone.py | 13 ++++--- src/borgstore/backends/rest.py | 12 +++++-- src/borgstore/backends/s3.py | 24 ++++++------- src/borgstore/backends/sftp.py | 2 +- src/borgstore/server/rest.py | 14 ++------ tests/test_backends.py | 2 ++ tests/test_backends_utils.py | 30 ++++++++++++++++ 10 files changed, 129 insertions(+), 37 deletions(-) create mode 100644 src/borgstore/backends/_utils.py create mode 100644 tests/test_backends_utils.py diff --git a/src/borgstore/backends/_base.py b/src/borgstore/backends/_base.py index de4ee68..e122586 100644 --- a/src/borgstore/backends/_base.py +++ b/src/borgstore/backends/_base.py @@ -96,7 +96,11 @@ def info(self, name) -> ItemInfo: @abstractmethod def load(self, name: str, *, size=None, offset=0) -> bytes: - """load value from """ + """load value from + + If offset is negative, it is counted from the end of the file. + If size is None, the whole object starting at offset is loaded. + """ @abstractmethod def store(self, name: str, value: bytes) -> None: diff --git a/src/borgstore/backends/_utils.py b/src/borgstore/backends/_utils.py new file mode 100644 index 0000000..5badd55 --- /dev/null +++ b/src/borgstore/backends/_utils.py @@ -0,0 +1,59 @@ +""" +Utilities for backend implementations. +""" + +from typing import Tuple, Optional + + +def make_range_header(offset: int, size: Optional[int] = None, total_size: Optional[int] = None) -> Optional[str]: + """ + Generate a standards compliant HTTP Range header. + + :param offset: offset in bytes. If negative, it is counted from the end of the file. + :param size: number of bytes to load. If None, load until the end of the file. + :param total_size: total size of the file. Required if offset < 0 and size is not None. + :return: Range header value (e.g., "bytes=0-99") or None if no Range header is needed. + """ + if offset < 0: + if size is None: + return f"bytes={offset}" + else: + if total_size is None: + raise ValueError("total_size is required for negative offset with a specific size") + start = total_size + offset + return f"bytes={start}-{start + size - 1}" + else: + if size is None: + return f"bytes={offset}-" if offset > 0 else None + else: + return f"bytes={offset}-{offset + size - 1}" + + +def parse_range_header(range_header: str) -> Tuple[int, Optional[int]]: + """ + Parse a standards compliant HTTP Range header. + Only supports "bytes" unit and single range specs. + + :param range_header: Range header value (e.g., "bytes=0-99", "bytes=100-", "bytes=-500"). + :return: A tuple (offset, size). offset is negative for suffix ranges. + """ + if not range_header or not range_header.startswith("bytes="): + return 0, None + + try: + range_val = range_header.split("=")[1] + if range_val.startswith("-"): + # bytes=-SUFFIX + return int(range_val), None + elif "-" in range_val: + # bytes=OFFSET- or bytes=OFFSET-END + start_str, end_str = range_val.split("-") + offset = int(start_str) + size = None + if end_str: + size = int(end_str) - offset + 1 + return offset, size + except (ValueError, IndexError): + pass + + return 0, None diff --git a/src/borgstore/backends/posixfs.py b/src/borgstore/backends/posixfs.py index eba050f..274bfbb 100644 --- a/src/borgstore/backends/posixfs.py +++ b/src/borgstore/backends/posixfs.py @@ -193,8 +193,8 @@ def load(self, name, *, size=None, offset=0): self._check_permission(name, "r") try: with path.open("rb") as f: - if offset > 0: - f.seek(offset) + if offset != 0: + f.seek(offset, os.SEEK_SET if offset >= 0 else os.SEEK_END) return f.read(-1 if size is None else size) except FileNotFoundError: raise ObjectNotFound(name) from None diff --git a/src/borgstore/backends/rclone.py b/src/borgstore/backends/rclone.py index 52f5aab..1c42ac4 100644 --- a/src/borgstore/backends/rclone.py +++ b/src/borgstore/backends/rclone.py @@ -17,6 +17,7 @@ requests = None from ._base import BackendBase, ItemInfo, validate_name +from ._utils import make_range_header from .errors import ( BackendError, BackendDoesNotExist, @@ -258,11 +259,13 @@ def load(self, name: str, *, size=None, offset=0) -> bytes: """Load value from .""" validate_name(name) headers = {} - if size is not None or offset > 0: - if size is not None: - headers["Range"] = f"bytes={offset}-{offset+size-1}" - else: - headers["Range"] = f"bytes={offset}-" + if offset < 0 and size is not None: + info = self.info(name) + range_header = make_range_header(offset, size, info.size) + else: + range_header = make_range_header(offset, size) + if range_header: + headers["Range"] = range_header r = self._requests(requests.get, f"{self.url}[{self.fs}]/{name}", tries=self.TRIES, headers=headers) return r.content diff --git a/src/borgstore/backends/rest.py b/src/borgstore/backends/rest.py index 9a73fd0..741089e 100644 --- a/src/borgstore/backends/rest.py +++ b/src/borgstore/backends/rest.py @@ -15,6 +15,7 @@ requests = HTTPBasicAuth = None from ._base import BackendBase, ItemInfo, validate_name +from ._utils import make_range_header from .errors import ( ObjectNotFound, BackendAlreadyExists, @@ -169,10 +170,15 @@ def load(self, name: str, *, size=None, offset=0) -> bytes: self._assert_open() validate_name(name) - r_hdr = (None if not offset else f"bytes={offset}-") if size is None else f"bytes={offset}-{offset + size - 1}" + if offset < 0 and size is not None: + info = self.info(name) + range_header = make_range_header(offset, size, info.size) + else: + range_header = make_range_header(offset, size) + headers = self.headers.copy() - if r_hdr: - headers["Range"] = r_hdr + if range_header: + headers["Range"] = range_header response = self._request("get", self._url(name), headers=headers) self._handle_response(response, name) diff --git a/src/borgstore/backends/s3.py b/src/borgstore/backends/s3.py index df98dff..5ffdb3f 100644 --- a/src/borgstore/backends/s3.py +++ b/src/borgstore/backends/s3.py @@ -13,6 +13,7 @@ import urllib.parse from ._base import BackendBase, ItemInfo, validate_name +from ._utils import make_range_header from .errors import BackendError, BackendMustBeOpen, BackendMustNotBeOpen, BackendDoesNotExist, BackendAlreadyExists from .errors import ObjectNotFound @@ -187,20 +188,17 @@ def load(self, name, *, size=None, offset=0): validate_name(name) key = self.base_path + name try: - if size is None and offset == 0: + if offset < 0 and size is not None: + info = self.info(name) + range_header = make_range_header(offset, size, info.size) + else: + range_header = make_range_header(offset, size) + + if range_header: + obj = self.s3.get_object(Bucket=self.bucket, Key=key, Range=range_header) + else: obj = self.s3.get_object(Bucket=self.bucket, Key=key) - return obj["Body"].read() - elif size is not None and offset == 0: - obj = self.s3.get_object(Bucket=self.bucket, Key=key, Range=f"bytes=0-{size - 1}") - return obj["Body"].read() - elif size is None and offset != 0: - head = self.s3.head_object(Bucket=self.bucket, Key=key) - length = head["ContentLength"] - obj = self.s3.get_object(Bucket=self.bucket, Key=key, Range=f"bytes={offset}-{length - 1}") - return obj["Body"].read() - elif size is not None and offset != 0: - obj = self.s3.get_object(Bucket=self.bucket, Key=key, Range=f"bytes={offset}-{offset + size - 1}") - return obj["Body"].read() + return obj["Body"].read() except self.s3.exceptions.NoSuchKey: raise ObjectNotFound(name) diff --git a/src/borgstore/backends/sftp.py b/src/borgstore/backends/sftp.py index aadbf42..3add575 100644 --- a/src/borgstore/backends/sftp.py +++ b/src/borgstore/backends/sftp.py @@ -238,7 +238,7 @@ def load(self, name, *, size=None, offset=0): validate_name(name) try: with self.client.open(name) as f: - f.seek(offset) + f.seek(offset, 0 if offset >= 0 else 2) f.prefetch(size) # speeds up the following read() significantly! return f.read(size) except FileNotFoundError: diff --git a/src/borgstore/server/rest.py b/src/borgstore/server/rest.py index 3eb8ff5..fa14a62 100644 --- a/src/borgstore/server/rest.py +++ b/src/borgstore/server/rest.py @@ -15,6 +15,7 @@ BackendMustBeOpen, BackendMustNotBeOpen, ) +from ..backends._utils import parse_range_header from ..store import get_backend logger = logging.getLogger(__name__) @@ -251,18 +252,7 @@ def do_GET(self): try: range_header = self.headers.get("Range") - offset = 0 - size = None - if range_header and range_header.startswith("bytes="): - # Simple Range: bytes=OFFSET- or bytes=OFFSET-END - try: - range_val = range_header.split("=")[1] - start_str, end_str = range_val.split("-") - offset = int(start_str) - if end_str: - size = int(end_str) - offset + 1 - except ValueError: - pass + offset, size = parse_range_header(range_header) if range_header else (0, None) with self.server.backend: data = self.server.backend.load(self.name, offset=offset, size=size) diff --git a/tests/test_backends.py b/tests/test_backends.py index 221601e..2216be7 100644 --- a/tests/test_backends.py +++ b/tests/test_backends.py @@ -459,6 +459,8 @@ def test_load_partial(tested_backends, request): assert backend.load("key", size=3) == b"012" assert backend.load("key", offset=5) == b"56789" assert backend.load("key", offset=4, size=4) == b"4567" + assert backend.load("key", offset=-3) == b"789" + assert backend.load("key", offset=-4, size=2) == b"67" def test_already_exists(tested_backends, request): diff --git a/tests/test_backends_utils.py b/tests/test_backends_utils.py new file mode 100644 index 0000000..b989961 --- /dev/null +++ b/tests/test_backends_utils.py @@ -0,0 +1,30 @@ +from borgstore.backends._utils import make_range_header, parse_range_header +import pytest + + +def test_make_range_header(): + # From start + assert make_range_header(0) is None + assert make_range_header(100) == "bytes=100-" + assert make_range_header(100, 50) == "bytes=100-149" + assert make_range_header(0, 50) == "bytes=0-49" + + # From end + assert make_range_header(-100) == "bytes=-100" + assert make_range_header(-100, 50, 1000) == "bytes=900-949" + + with pytest.raises(ValueError): + make_range_header(-100, 50) + + +def test_parse_range_header(): + assert parse_range_header(None) == (0, None) + assert parse_range_header("") == (0, None) + assert parse_range_header("invalid") == (0, None) + assert parse_range_header("bytes=invalid") == (0, None) + + assert parse_range_header("bytes=100-") == (100, None) + assert parse_range_header("bytes=100-149") == (100, 50) + assert parse_range_header("bytes=0-49") == (0, 50) + + assert parse_range_header("bytes=-100") == (-100, None) From cf83d5ac1e9c260a8a11932f11233533724cd8c2 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 26 Mar 2026 23:34:36 +0100 Subject: [PATCH 2/2] Optimize tail loading for offset < 0 and given size http range request headers have a bad limitation: When loading bytes from the end of a file, we can not have a range header expressing "give us bytes from offset -200 to offset -100", we would need the overall file length for that (e.g. 1000) and say "give us bytes 800-899". If we do not know the overall file length, that would be an additional http request and response, just to find that out. As there is usually quite some overhead for that (latency, maybe also API call costs), we should avoid that and just request all the tail bytes starting from the desired offset IF the additional bytes aren't many (e.g. less than 1024). Use case: If store objects are produced by streaming smaller chunks, appending a table of contents (with chunk lengths) and finally the length of the TOC and maybe some MAGIC value, these additional bytes are usually only a few (e.g. 4bytes for TOC length, 4bytes MAGIC), so the transfer time overhead is negligible, but the latency without this optimization could be bad. --- src/borgstore/backends/rclone.py | 14 +++++++++++--- src/borgstore/backends/rest.py | 14 +++++++++++--- src/borgstore/backends/s3.py | 14 +++++++++++--- tests/test_backends.py | 12 ++++++++++++ 4 files changed, 45 insertions(+), 9 deletions(-) diff --git a/src/borgstore/backends/rclone.py b/src/borgstore/backends/rclone.py index 1c42ac4..d87b5fa 100644 --- a/src/borgstore/backends/rclone.py +++ b/src/borgstore/backends/rclone.py @@ -260,14 +260,22 @@ def load(self, name: str, *, size=None, offset=0) -> bytes: validate_name(name) headers = {} if offset < 0 and size is not None: - info = self.info(name) - range_header = make_range_header(offset, size, info.size) + if -offset - size <= 1024: + # Optimization: if the part of the tail we don't need is small, + # we just request the last N bytes and truncate locally. + range_header = make_range_header(offset, size=None) + else: + info = self.info(name) + range_header = make_range_header(offset, size, info.size) else: range_header = make_range_header(offset, size) if range_header: headers["Range"] = range_header r = self._requests(requests.get, f"{self.url}[{self.fs}]/{name}", tries=self.TRIES, headers=headers) - return r.content + content = r.content + if offset < 0 and size is not None and size < len(content): + content = content[:size] + return content def store(self, name: str, value: bytes) -> None: """Store into .""" diff --git a/src/borgstore/backends/rest.py b/src/borgstore/backends/rest.py index 741089e..6f5ce80 100644 --- a/src/borgstore/backends/rest.py +++ b/src/borgstore/backends/rest.py @@ -171,8 +171,13 @@ def load(self, name: str, *, size=None, offset=0) -> bytes: validate_name(name) if offset < 0 and size is not None: - info = self.info(name) - range_header = make_range_header(offset, size, info.size) + if -offset - size <= 1024: + # Optimization: if the part of the tail we don't need is small, + # we just request the last N bytes and truncate locally. + range_header = make_range_header(offset, size=None) + else: + info = self.info(name) + range_header = make_range_header(offset, size, info.size) else: range_header = make_range_header(offset, size) @@ -182,7 +187,10 @@ def load(self, name: str, *, size=None, offset=0) -> bytes: response = self._request("get", self._url(name), headers=headers) self._handle_response(response, name) - return response.content + content = response.content + if offset < 0 and size is not None and size < len(content): + content = content[:size] + return content def store(self, name: str, value: bytes) -> None: self._assert_open() diff --git a/src/borgstore/backends/s3.py b/src/borgstore/backends/s3.py index 5ffdb3f..e026bfd 100644 --- a/src/borgstore/backends/s3.py +++ b/src/borgstore/backends/s3.py @@ -189,8 +189,13 @@ def load(self, name, *, size=None, offset=0): key = self.base_path + name try: if offset < 0 and size is not None: - info = self.info(name) - range_header = make_range_header(offset, size, info.size) + if -offset - size <= 1024: + # Optimization: if the part of the tail we don't need is small, + # we just request the last N bytes and truncate locally. + range_header = make_range_header(offset, size=None) + else: + info = self.info(name) + range_header = make_range_header(offset, size, info.size) else: range_header = make_range_header(offset, size) @@ -198,7 +203,10 @@ def load(self, name, *, size=None, offset=0): obj = self.s3.get_object(Bucket=self.bucket, Key=key, Range=range_header) else: obj = self.s3.get_object(Bucket=self.bucket, Key=key) - return obj["Body"].read() + content = obj["Body"].read() + if offset < 0 and size is not None and size < len(content): + content = content[:size] + return content except self.s3.exceptions.NoSuchKey: raise ObjectNotFound(name) diff --git a/tests/test_backends.py b/tests/test_backends.py index 2216be7..93c5625 100644 --- a/tests/test_backends.py +++ b/tests/test_backends.py @@ -462,6 +462,18 @@ def test_load_partial(tested_backends, request): assert backend.load("key", offset=-3) == b"789" assert backend.load("key", offset=-4, size=2) == b"67" + # test tail loading optimization (-offset - size <= 1024) + # offset=-10, size=9 -> -offset - size = 1 <= 1024 -> optimized + assert backend.load("key", offset=-10, size=9) == b"012345678" + # offset=-10, size=1 -> -offset - size = 9 <= 1024 -> optimized + assert backend.load("key", offset=-10, size=1) == b"0" + # offset=-2000, size=1000 -> -offset - size = 1000 <= 1024 -> optimized + backend.store("large", b"x" * 2000) + assert backend.load("large", offset=-2000, size=1000) == b"x" * 1000 + # offset=-3000, size=1000 -> -offset - size = 2000 > 1024 -> NOT optimized + backend.store("huge", b"y" * 3000) + assert backend.load("huge", offset=-3000, size=1000) == b"y" * 1000 + def test_already_exists(tested_backends, request): backend = get_backend_from_fixture(tested_backends, request)