Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
2ecd797
Move `Box`, `Span`, and utils to `etloutput` to avoid circular imports
mawelborn Oct 10, 2025
d9a5363
Ensure null spans raise `TokenNotFoundError` instead of `ValueError`
mawelborn Oct 10, 2025
f586553
Parse table spans
mawelborn Oct 10, 2025
c207996
Add `NULL_CELL`, `NULL_RANGE`, `NULL_TABLE`, and `NULL_TOKEN`
mawelborn Oct 10, 2025
8fd5845
Add `DocumentExtraction` properties for OCR tokens, tables, and cells
mawelborn Oct 10, 2025
f44b8f1
Add set-like overlap syntax for `Box` and `Span`
mawelborn Oct 10, 2025
95301f5
Rewrite table cell lookup to support multiple cells using a naive spa…
mawelborn Oct 10, 2025
85552ea
Return `NULL_TOKEN` rather than raising an error for failed token loo…
mawelborn Oct 10, 2025
b9712ef
Replace custom `ResultError` with idiomatic `ValueError`
mawelborn Oct 10, 2025
a436ebf
Remove unused custom ETL Output and Result error classes
mawelborn Oct 10, 2025
6ebe77e
Clean up `TYPE_CHECKING` imports
mawelborn Oct 10, 2025
dc32773
Clean up some comments and formatting
mawelborn Oct 10, 2025
6e1ad81
Add `PredictionList.assign_ocr()` method
mawelborn Oct 10, 2025
7fdf57c
Rewrite `EtlOutput.table_cells_for()` using a bisection algorithm
mawelborn Oct 10, 2025
9ba9fea
Optimize table cell lookup by bisecting a single page instead of the …
mawelborn Oct 10, 2025
fe419ff
Bump version and update changelog
mawelborn Oct 14, 2025
477367e
Speed up `.groupby("table")` and `.groupby("cell")` with custom __hash__
mawelborn Oct 20, 2025
fb797b6
Add prediction `.copy()` methods that only copy mutable state
mawelborn Oct 20, 2025
68c407d
Update changelog
mawelborn Oct 20, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,34 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and versions match the minimum IPA version required to use functionality.


## [v7.2.2] - 2025-10-14

### Added

- Parse table spans from ETL Output as `Table.spans`.
- `NULL_CELL`, `NULL_RANGE`, `NULL_TABLE`, and `NULL_TOKEN` constants.
- Document Extraction attributes for assigning tokens, tables, and cells from OCR:
  - `DocumentExtraction.tokens`, `DocumentExtraction.tables`, `DocumentExtraction.cells`
- Document Extraction convenience properties for singular token, table, and cell access:
  - `DocumentExtraction.token`, `DocumentExtraction.table`, `DocumentExtraction.cell`
- `PredictionList.assign_ocr(etl_outputs, tokens=True, tables=True)` method.
- Custom `__hash__` methods for tables and cells to speed up `.groupby(...)`.
- Prediction `.copy()` methods that only copy mutable state.

### Changed

- Move `Box` and `Span` from results to etloutput to avoid circular imports.
(Both can still be imported from either module.)
- Return `NULL_TOKEN` instead of raising an exception from `EtlOutput.token_for(span)`.
- Rewrite table cell lookup `EtlOutput.table_cells_for(span)` using a fast, span-based,
binary search algorithm that can return multiple overlapped table cells.

### Removed

- Custom `results` and `etloutput` error classes that are nearly never used.
(Replaced with idiomatic Python error classes.)


## [v7.2.1] - 2025-09-09

### Fixed
Expand Down Expand Up @@ -265,6 +293,7 @@ This is the first major version release tested to work on Indico 6.X.
- Row Association now also sorting on 'bbtop'.


[v7.2.2]: https://github.com/IndicoDataSolutions/indico-toolkit-python/compare/v7.2.1...v7.2.2
[v7.2.1]: https://github.com/IndicoDataSolutions/indico-toolkit-python/compare/v7.2.0...v7.2.1
[v7.2.0]: https://github.com/IndicoDataSolutions/indico-toolkit-python/compare/v6.14.2...v7.2.0
[v6.14.2]: https://github.com/IndicoDataSolutions/indico-toolkit-python/compare/v6.14.1...v6.14.2
Expand Down
2 changes: 1 addition & 1 deletion indico_toolkit/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,4 @@
"ToolkitStaggeredLoopError",
"ToolkitStatusError",
)
__version__ = "7.2.1"
__version__ = "7.2.2"
21 changes: 11 additions & 10 deletions indico_toolkit/etloutput/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
from typing import TYPE_CHECKING, TypeAlias, TypeVar

from ..results import NULL_BOX, NULL_SPAN, Box, Span
from ..results.utils import get, has, json_loaded, str_decoded
from .cell import Cell, CellType
from .errors import EtlOutputError, TableCellNotFoundError, TokenNotFoundError
from .box import NULL_BOX, Box
from .cell import NULL_CELL, Cell, CellType
from .etloutput import EtlOutput
from .range import Range
from .table import Table
from .token import Token
from .range import NULL_RANGE, Range
from .span import NULL_SPAN, Span
from .table import NULL_TABLE, Table
from .token import NULL_TOKEN, Token
from .utils import get, has, json_loaded, str_decoded

if TYPE_CHECKING:
from collections.abc import Awaitable, Callable
Expand All @@ -17,17 +17,18 @@
"Cell",
"CellType",
"EtlOutput",
"EtlOutputError",
"load",
"load_async",
"NULL_BOX",
"NULL_CELL",
"NULL_RANGE",
"NULL_SPAN",
"NULL_TABLE",
"NULL_TOKEN",
"Range",
"Span",
"Table",
"TableCellNotFoundError",
"Token",
"TokenNotFoundError",
)

Loadable: TypeAlias = "dict[str, object] | list[object] | str | bytes"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
from dataclasses import dataclass
from typing import TYPE_CHECKING
from typing import Final

from ..utils import get

if TYPE_CHECKING:
from typing import Final
from .utils import get


@dataclass(frozen=True)
Expand All @@ -18,6 +15,30 @@ class Box:
def __bool__(self) -> bool:
return self != NULL_BOX

def __and__(self, other: "Box") -> "Box":
"""
Return a new `Box` for the overlap between `self` and `other`
or `NULL_BOX` if they don't overlap.

Supports set-like `extraction.box & cell.box` syntax.
"""
if (
self.page != other.page
or self.bottom <= other.top # `self` is above `other`
or self.top >= other.bottom # `self` is below `other`
or self.right <= other.left # `self` is to the left of `other`
or self.left >= other.right # `self` is to the right of `other`
):
return NULL_BOX
else:
return Box(
page=self.page,
top=max(self.top, other.top),
left=max(self.left, other.left),
right=min(self.right, other.right),
bottom=min(self.bottom, other.bottom),
)

def __lt__(self, other: "Box") -> bool:
"""
Bounding boxes are sorted with vertical hysteresis. Those on the same line are
Expand Down Expand Up @@ -58,4 +79,4 @@ def from_dict(box: object) -> "Box":
# object rather than using `None` or raising an error. This lets you e.g. sort by the
# `box` attribute without having to constantly check for `None`, while still allowing
# you to do a "None check" with `bool(extraction.box)` or `extraction.box == NULL_BOX`.
NULL_BOX: "Final" = Box(page=0, top=0, left=0, right=0, bottom=0)
NULL_BOX: Final = Box(page=0, top=0, left=0, right=0, bottom=0)
33 changes: 30 additions & 3 deletions indico_toolkit/etloutput/cell.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
from dataclasses import dataclass
from enum import Enum
from typing import Final

from ..results import NULL_SPAN, Box, Span
from ..results.utils import get
from .range import Range
from .box import NULL_BOX, Box
from .range import NULL_RANGE, Range
from .span import NULL_SPAN, Span
from .utils import get


class CellType(Enum):
Expand All @@ -19,6 +21,18 @@ class Cell:
range: Range
spans: "tuple[Span, ...]"

def __bool__(self) -> bool:
return self != NULL_CELL

def __hash__(self) -> int:
"""
Uniquely identify cells by hashing their bounding box and spans.

This is a small speedup for `.groupby(attrgetter("cell"))` compared to the
default dataclass `__hash__` implementation.
"""
return hash((self.box, self.spans))

@property
def span(self) -> Span:
"""
Expand All @@ -45,3 +59,16 @@ def from_dict(cell: object, page: int) -> "Cell":
range=Range.from_dict(cell),
spans=tuple(map(Span.from_dict, get(cell, list, "doc_offsets"))),
)


# It's more ergonomic to represent the lack of cells with a special null cell object
# rather than using `None` or raising an error. This lets you e.g. sort by the `cell`
# attribute without having to constantly check for `None`, while still allowing you to
# do a "None check" with `bool(extraction.cell)` or `extraction.cell == NULL_CELL`.
NULL_CELL: Final = Cell(
type=CellType.CONTENT,
text="",
box=NULL_BOX,
range=NULL_RANGE,
spans=tuple(),
)
16 changes: 0 additions & 16 deletions indico_toolkit/etloutput/errors.py

This file was deleted.

92 changes: 60 additions & 32 deletions indico_toolkit/etloutput/etloutput.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,20 @@
import itertools
from bisect import bisect_left, bisect_right
from collections import namedtuple
from dataclasses import dataclass
from functools import cached_property
from operator import attrgetter
from typing import TYPE_CHECKING

from ..results import Box, Span
from .errors import TableCellNotFoundError, TokenNotFoundError
from .box import Box
from .table import Table
from .token import Token
from .token import NULL_TOKEN, Token

if TYPE_CHECKING:
from collections.abc import Iterable
from collections.abc import Iterable, Iterator

from .cell import Cell
from .span import Span


@dataclass(frozen=True)
Expand Down Expand Up @@ -54,18 +56,19 @@ def from_pages(
tables_on_page=table_pages,
)

def token_for(self, span: Span) -> Token:
def token_for(self, span: "Span") -> Token:
"""
Return a `Token` that contains every character from `span`.
Raise `TokenNotFoundError` if one can't be produced.
Return a `Token` that contains every character from `span`
or `NULL_TOKEN` if one doesn't exist.
"""
try:
tokens = self.tokens_on_page[span.page]
first = bisect_right(tokens, span.start, key=attrgetter("span.end"))
last = bisect_left(tokens, span.end, lo=first, key=attrgetter("span.start"))
tokens = tokens[first:last]
except (IndexError, ValueError) as error:
raise TokenNotFoundError(f"no token contains {span!r}") from error
assert tokens
except (AssertionError, IndexError, ValueError):
return NULL_TOKEN

return Token(
text=self.text[span.slice],
Expand All @@ -79,28 +82,53 @@ def token_for(self, span: Span) -> Token:
span=span,
)

def table_cell_for(self, token: Token) -> "tuple[Table, Cell]":
_TableCellSpan = namedtuple("_TableCellSpan", ["table", "cell", "span"])

@cached_property
def _table_cell_spans_on_page(self) -> "tuple[tuple[_TableCellSpan, ...], ...]":
"""
Order table cells on each page by their spans such that they can be bisected.
"""
return tuple(
tuple(
sorted(
(
self._TableCellSpan(table, cell, span)
for table in page_tables
for cell in table.cells
for span in cell.spans
if span
),
key=attrgetter("span"),
)
)
for page_tables in self.tables_on_page
)

def table_cells_for(self, span: "Span") -> "Iterator[tuple[Table, Cell]]":
"""
Return the `Table` and `Cell` that contain the midpoint of `token`.
Raise `TableCellNotFoundError` if it's not inside a table cell.
Yield the table cells that overlap with `span`.

Note: a single span may overlap the same cell multiple times causing it to be
yielded multiple times. Deduplication in `DocumentExtraction.table_cells`
accounts for this when OCR is assigned with `PredictionList.assign_ocr()`.
"""
token_vmid = (token.box.top + token.box.bottom) // 2
token_hmid = (token.box.left + token.box.right) // 2

for table in self.tables_on_page[token.box.page]:
if (
(table.box.top <= token_vmid <= table.box.bottom) and
(table.box.left <= token_hmid <= table.box.right)
): # fmt: skip
break
else:
raise TableCellNotFoundError(f"no table contains {token!r}")

for cell in table.cells:
if (
(cell.box.top <= token_vmid <= cell.box.bottom) and
(cell.box.left <= token_hmid <= cell.box.right)
): # fmt: skip
return table, cell
else:
raise TableCellNotFoundError(f"no cell contains {token!r}")
try:
page_table_cell_spans = self._table_cell_spans_on_page[span.page]
first = bisect_right(
page_table_cell_spans,
span.start,
key=attrgetter("span.end"),
)
last = bisect_left(
page_table_cell_spans,
span.end,
lo=first,
key=attrgetter("span.start"),
)
table_cell_spans = page_table_cell_spans[first:last]
except (IndexError, ValueError):
table_cell_spans = tuple()

for table, cell, span in table_cell_spans:
yield table, cell
20 changes: 19 additions & 1 deletion indico_toolkit/etloutput/range.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from dataclasses import dataclass
from typing import Final

from ..results.utils import get
from .utils import get


@dataclass(order=True, frozen=True)
Expand All @@ -12,6 +13,9 @@ class Range:
rows: "tuple[int, ...]"
columns: "tuple[int, ...]"

def __bool__(self) -> bool:
return self != NULL_RANGE

@staticmethod
def from_dict(cell: object) -> "Range":
"""
Expand All @@ -28,3 +32,17 @@ def from_dict(cell: object) -> "Range":
rows=tuple(rows),
columns=tuple(columns),
)


# It's more ergonomic to represent the lack of ranges with a special null range object
# rather than using `None` or raising an error. This lets you e.g. sort by the `range`
# attribute without having to constantly check for `None`, while still allowing you to
# do a "None check" with `bool(cell.range)` or `cell.range == NULL_RANGE`.
NULL_RANGE: Final = Range(
row=0,
column=0,
rowspan=0,
columnspan=0,
rows=tuple(),
columns=tuple(),
)
Loading