IndicoDataSolutions · mawelborn · Oct 20, 2025 · Oct 10, 2025 · Oct 10, 2025 · Oct 10, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,34 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and versions match the minimum IPA version required to use functionality.
 
 
+## [v7.2.2] - 2025-10-14
+
+### Added
+
+- Parse table spans from ETL Output as `Table.spans`.
+- `NULL_CELL`, `NULL_RANGE`, `NULL_TABLE`, and `NULL_TOKEN` constants.
+- Document Extraction attributes for assigning tokens, tables, and cells from OCR:
+    - `DocumentExtraction.tokens`, `DocumentExtraction.tables`, `DocumentExtraction.cells`
+- Document Extraction convenience properties for singular token, table, and cell access:
+    - `DocumentExtraction.token`, `DocumentExtraction.table`, `DocumentExtraction.cell`
+- `PredictionList.assign_ocr(etl_outputs, tokens=True, tables=True)` method.
+- Custom `__hash__` methods for tables and cells to speed up `.groupby(...)`.
+- Prediction `.copy()` methods that only copy mutable state.
+
+### Changed
+
+- Move `Box` and `Span` from results to etloutput to avoid circular imports.
+  (Both can still be imported from either module.)
+- Return `NULL_TOKEN` instead of raising an exception from `EtlOutput.token_for(span)`.
+- Rewrite table cell lookup `EtlOutput.table_cells_for(span)` using a fast, span-based,
+  binary search algorithm that can return multiple overlapped table cells.
+
+### Removed
+
+- Custom `results` and `etloutput` error classes that are almost never used.
+  (Replaced with idiomatic Python error classes.)
+
+
 ## [v7.2.1] - 2025-09-09
 
 ### Fixed
@@ -265,6 +293,7 @@ This is the first major version release tested to work on Indico 6.X.
 - Row Association now also sorting on 'bbtop'.
 
 
+[v7.2.2]: https://github.com/IndicoDataSolutions/indico-toolkit-python/compare/v7.2.1...v7.2.2
 [v7.2.1]: https://github.com/IndicoDataSolutions/indico-toolkit-python/compare/v7.2.0...v7.2.1
 [v7.2.0]: https://github.com/IndicoDataSolutions/indico-toolkit-python/compare/v6.14.2...v7.2.0
 [v6.14.2]: https://github.com/IndicoDataSolutions/indico-toolkit-python/compare/v6.14.1...v6.14.2

diff --git a/indico_toolkit/__init__.py b/indico_toolkit/__init__.py
@@ -21,4 +21,4 @@
     "ToolkitStaggeredLoopError",
     "ToolkitStatusError",
 )
-__version__ = "7.2.1"
+__version__ = "7.2.2"
diff --git a/indico_toolkit/etloutput/__init__.py b/indico_toolkit/etloutput/__init__.py
@@ -1,13 +1,13 @@
 from typing import TYPE_CHECKING, TypeAlias, TypeVar
 
-from ..results import NULL_BOX, NULL_SPAN, Box, Span
-from ..results.utils import get, has, json_loaded, str_decoded
-from .cell import Cell, CellType
-from .errors import EtlOutputError, TableCellNotFoundError, TokenNotFoundError
+from .box import NULL_BOX, Box
+from .cell import NULL_CELL, Cell, CellType
 from .etloutput import EtlOutput
-from .range import Range
-from .table import Table
-from .token import Token
+from .range import NULL_RANGE, Range
+from .span import NULL_SPAN, Span
+from .table import NULL_TABLE, Table
+from .token import NULL_TOKEN, Token
+from .utils import get, has, json_loaded, str_decoded
 
 if TYPE_CHECKING:
     from collections.abc import Awaitable, Callable
@@ -17,17 +17,18 @@
     "Cell",
     "CellType",
     "EtlOutput",
-    "EtlOutputError",
     "load",
     "load_async",
     "NULL_BOX",
+    "NULL_CELL",
+    "NULL_RANGE",
     "NULL_SPAN",
+    "NULL_TABLE",
+    "NULL_TOKEN",
     "Range",
     "Span",
     "Table",
-    "TableCellNotFoundError",
     "Token",
-    "TokenNotFoundError",
 )
 
 Loadable: TypeAlias = "dict[str, object] | list[object] | str | bytes"

diff --git a/indico_toolkit/results/predictions/box.py → indico_toolkit/etloutput/box.py b/indico_toolkit/results/predictions/box.py → indico_toolkit/etloutput/box.py
@@ -1,10 +1,7 @@
 from dataclasses import dataclass
-from typing import TYPE_CHECKING
+from typing import Final
 
-from ..utils import get
-
-if TYPE_CHECKING:
-    from typing import Final
+from .utils import get
 
 
 @dataclass(frozen=True)
@@ -18,6 +15,30 @@ class Box:
     def __bool__(self) -> bool:
         return self != NULL_BOX
 
+    def __and__(self, other: "Box") -> "Box":
+        """
+        Return a new `Box` for the overlap between `self` and `other`
+        or `NULL_BOX` if they don't overlap.
+
+        Supports set-like `extraction.box & cell.box` syntax.
+        """
+        if (
+            self.page != other.page
+            or self.bottom <= other.top  # `self` is above `other`
+            or self.top >= other.bottom  # `self` is below `other`
+            or self.right <= other.left  # `self` is to the left of `other`
+            or self.left >= other.right  # `self` is to the right of `other`
+        ):
+            return NULL_BOX
+        else:
+            return Box(
+                page=self.page,
+                top=max(self.top, other.top),
+                left=max(self.left, other.left),
+                right=min(self.right, other.right),
+                bottom=min(self.bottom, other.bottom),
+            )
+
     def __lt__(self, other: "Box") -> bool:
         """
         Bounding boxes are sorted with vertical hysteresis. Those on the same line are
@@ -58,4 +79,4 @@ def from_dict(box: object) -> "Box":
 # object rather than using `None` or raising an error. This lets you e.g. sort by the
 # `box` attribute without having to constantly check for `None`, while still allowing
 # you do a "None check" with `bool(extraction.box)` or `extraction.box == NULL_BOX`.
-NULL_BOX: "Final" = Box(page=0, top=0, left=0, right=0, bottom=0)
+NULL_BOX: Final = Box(page=0, top=0, left=0, right=0, bottom=0)
diff --git a/indico_toolkit/etloutput/cell.py b/indico_toolkit/etloutput/cell.py
@@ -1,9 +1,11 @@
 from dataclasses import dataclass
 from enum import Enum
+from typing import Final
 
-from ..results import NULL_SPAN, Box, Span
-from ..results.utils import get
-from .range import Range
+from .box import NULL_BOX, Box
+from .range import NULL_RANGE, Range
+from .span import NULL_SPAN, Span
+from .utils import get
 
 
 class CellType(Enum):
@@ -19,6 +21,18 @@ class Cell:
     range: Range
     spans: "tuple[Span, ...]"
 
+    def __bool__(self) -> bool:
+        return self != NULL_CELL
+
+    def __hash__(self) -> int:
+        """
+        Uniquely identify cells by hashing their bounding box and spans.
+
+        This is a small speedup for `.groupby(attrgetter("cell"))` compared to
+        the dataclass-generated `__hash__` implementation.
+        """
+        return hash((self.box, self.spans))
+
     @property
     def span(self) -> Span:
         """
@@ -45,3 +59,16 @@ def from_dict(cell: object, page: int) -> "Cell":
             range=Range.from_dict(cell),
             spans=tuple(map(Span.from_dict, get(cell, list, "doc_offsets"))),
         )
+
+
+# It's more ergonomic to represent the lack of cells with a special null cell object
+# rather than using `None` or raising an error. This lets you e.g. sort by the `cell`
+# attribute without having to constantly check for `None`, while still allowing you to
+# do a "None check" with `bool(extraction.cell)` or `extraction.cell == NULL_CELL`.
+NULL_CELL: Final = Cell(
+    type=CellType.CONTENT,
+    text="",
+    box=NULL_BOX,
+    range=NULL_RANGE,
+    spans=tuple(),
+)
diff --git a/indico_toolkit/etloutput/errors.py b/indico_toolkit/etloutput/errors.py
diff --git a/indico_toolkit/etloutput/etloutput.py b/indico_toolkit/etloutput/etloutput.py
@@ -1,18 +1,20 @@
 import itertools
 from bisect import bisect_left, bisect_right
+from collections import namedtuple
 from dataclasses import dataclass
+from functools import cached_property
 from operator import attrgetter
 from typing import TYPE_CHECKING
 
-from ..results import Box, Span
-from .errors import TableCellNotFoundError, TokenNotFoundError
+from .box import Box
 from .table import Table
-from .token import Token
+from .token import NULL_TOKEN, Token
 
 if TYPE_CHECKING:
-    from collections.abc import Iterable
+    from collections.abc import Iterable, Iterator
 
     from .cell import Cell
+    from .span import Span
 
 
 @dataclass(frozen=True)
@@ -54,18 +56,19 @@ def from_pages(
             tables_on_page=table_pages,
         )
 
-    def token_for(self, span: Span) -> Token:
+    def token_for(self, span: "Span") -> Token:
         """
-        Return a `Token` that contains every character from `span`.
-        Raise `TokenNotFoundError` if one can't be produced.
+        Return a `Token` that contains every character from `span`
+        or `NULL_TOKEN` if one doesn't exist.
         """
         try:
             tokens = self.tokens_on_page[span.page]
             first = bisect_right(tokens, span.start, key=attrgetter("span.end"))
             last = bisect_left(tokens, span.end, lo=first, key=attrgetter("span.start"))
             tokens = tokens[first:last]
-        except (IndexError, ValueError) as error:
-            raise TokenNotFoundError(f"no token contains {span!r}") from error
+            assert tokens
+        except (AssertionError, IndexError, ValueError):
+            return NULL_TOKEN
 
         return Token(
             text=self.text[span.slice],
@@ -79,28 +82,53 @@ def token_for(self, span: Span) -> Token:
             span=span,
         )
 
-    def table_cell_for(self, token: Token) -> "tuple[Table, Cell]":
+    _TableCellSpan = namedtuple("_TableCellSpan", ["table", "cell", "span"])
+
+    @cached_property
+    def _table_cell_spans_on_page(self) -> "tuple[tuple[_TableCellSpan, ...], ...]":
+        """
+        Order table cells on each page by their spans such that they can be bisected.
+        """
+        return tuple(
+            tuple(
+                sorted(
+                    (
+                        self._TableCellSpan(table, cell, span)
+                        for table in page_tables
+                        for cell in table.cells
+                        for span in cell.spans
+                        if span
+                    ),
+                    key=attrgetter("span"),
+                )
+            )
+            for page_tables in self.tables_on_page
+        )
+
+    def table_cells_for(self, span: "Span") -> "Iterator[tuple[Table, Cell]]":
         """
-        Return the `Table` and `Cell` that contain the midpoint of `token`.
-        Raise `TableCellNotFoundError` if it's not inside a table cell.
+        Yield the table cells that overlap with `span`.
+
+        Note: a single span may overlap the same cell multiple times causing it to be
+        yielded multiple times. Deduplication in `DocumentExtraction.table_cells`
+        accounts for this when OCR is assigned with `PredictionList.assign_ocr()`.
         """
-        token_vmid = (token.box.top + token.box.bottom) // 2
-        token_hmid = (token.box.left + token.box.right) // 2
-
-        for table in self.tables_on_page[token.box.page]:
-            if (
-                (table.box.top  <= token_vmid <= table.box.bottom) and
-                (table.box.left <= token_hmid <= table.box.right)
-            ):  # fmt: skip
-                break
-        else:
-            raise TableCellNotFoundError(f"no table contains {token!r}")
-
-        for cell in table.cells:
-            if (
-                (cell.box.top  <= token_vmid <= cell.box.bottom) and
-                (cell.box.left <= token_hmid <= cell.box.right)
-            ):  # fmt: skip
-                return table, cell
-        else:
-            raise TableCellNotFoundError(f"no cell contains {token!r}")
+        try:
+            page_table_cell_spans = self._table_cell_spans_on_page[span.page]
+            first = bisect_right(
+                page_table_cell_spans,
+                span.start,
+                key=attrgetter("span.end"),
+            )
+            last = bisect_left(
+                page_table_cell_spans,
+                span.end,
+                lo=first,
+                key=attrgetter("span.start"),
+            )
+            table_cell_spans = page_table_cell_spans[first:last]
+        except (IndexError, ValueError):
+            table_cell_spans = tuple()
+
+        for table, cell, span in table_cell_spans:
+            yield table, cell
diff --git a/indico_toolkit/etloutput/range.py b/indico_toolkit/etloutput/range.py
@@ -1,6 +1,7 @@
 from dataclasses import dataclass
+from typing import Final
 
-from ..results.utils import get
+from .utils import get
 
 
 @dataclass(order=True, frozen=True)
@@ -12,6 +13,9 @@ class Range:
     rows: "tuple[int, ...]"
     columns: "tuple[int, ...]"
 
+    def __bool__(self) -> bool:
+        return self != NULL_RANGE
+
     @staticmethod
     def from_dict(cell: object) -> "Range":
         """
@@ -28,3 +32,17 @@ def from_dict(cell: object) -> "Range":
             rows=tuple(rows),
             columns=tuple(columns),
         )
+
+
+# It's more ergonomic to represent the lack of ranges with a special null range object
+# rather than using `None` or raising an error. This lets you e.g. sort by the `range`
+# attribute without having to constantly check for `None`, while still allowing you to
+# do a "None check" with `bool(cell.range)` or `cell.range == NULL_RANGE`.
+NULL_RANGE: Final = Range(
+    row=0,
+    column=0,
+    rowspan=0,
+    columnspan=0,
+    rows=tuple(),
+    columns=tuple(),
+)