From 2ecd7978eea94e33660575fc91c06a2f8c7cb2cf Mon Sep 17 00:00:00 2001 From: Michael Welborn Date: Thu, 9 Oct 2025 21:49:54 -0500 Subject: [PATCH 01/19] Move `Box`, `Span`, and utils to `etloutput` to avoid circular imports --- indico_toolkit/etloutput/__init__.py | 5 +- .../{results/predictions => etloutput}/box.py | 2 +- indico_toolkit/etloutput/cell.py | 5 +- indico_toolkit/etloutput/etloutput.py | 3 +- indico_toolkit/etloutput/range.py | 2 +- .../predictions => etloutput}/span.py | 2 +- indico_toolkit/etloutput/table.py | 4 +- indico_toolkit/etloutput/token.py | 5 +- indico_toolkit/etloutput/utils.py | 72 +++++++++++++++++ indico_toolkit/results/__init__.py | 5 +- .../results/predictions/__init__.py | 6 -- .../results/predictions/citation.py | 2 +- .../results/predictions/documentextraction.py | 2 +- .../results/predictions/formextraction.py | 2 +- .../results/predictions/summarization.py | 2 +- .../results/predictions/unbundling.py | 2 +- indico_toolkit/results/utils.py | 81 +++---------------- tests/etloutput/test_rowspan_colspan.py | 3 +- tests/etloutput/test_utils.py | 54 +++++++++++++ 19 files changed, 161 insertions(+), 98 deletions(-) rename indico_toolkit/{results/predictions => etloutput}/box.py (98%) rename indico_toolkit/{results/predictions => etloutput}/span.py (97%) create mode 100644 indico_toolkit/etloutput/utils.py create mode 100644 tests/etloutput/test_utils.py diff --git a/indico_toolkit/etloutput/__init__.py b/indico_toolkit/etloutput/__init__.py index 9c7f3fe..0fc3f31 100644 --- a/indico_toolkit/etloutput/__init__.py +++ b/indico_toolkit/etloutput/__init__.py @@ -1,13 +1,14 @@ from typing import TYPE_CHECKING, TypeAlias, TypeVar -from ..results import NULL_BOX, NULL_SPAN, Box, Span -from ..results.utils import get, has, json_loaded, str_decoded +from .box import NULL_BOX, Box from .cell import Cell, CellType from .errors import EtlOutputError, TableCellNotFoundError, TokenNotFoundError from .etloutput import EtlOutput from .range import 
Range +from .span import NULL_SPAN, Span from .table import Table from .token import Token +from .utils import get, has, json_loaded, str_decoded if TYPE_CHECKING: from collections.abc import Awaitable, Callable diff --git a/indico_toolkit/results/predictions/box.py b/indico_toolkit/etloutput/box.py similarity index 98% rename from indico_toolkit/results/predictions/box.py rename to indico_toolkit/etloutput/box.py index fb78147..1269213 100644 --- a/indico_toolkit/results/predictions/box.py +++ b/indico_toolkit/etloutput/box.py @@ -1,7 +1,7 @@ from dataclasses import dataclass from typing import TYPE_CHECKING -from ..utils import get +from .utils import get if TYPE_CHECKING: from typing import Final diff --git a/indico_toolkit/etloutput/cell.py b/indico_toolkit/etloutput/cell.py index 292a78e..d60ec1c 100644 --- a/indico_toolkit/etloutput/cell.py +++ b/indico_toolkit/etloutput/cell.py @@ -1,9 +1,10 @@ from dataclasses import dataclass from enum import Enum -from ..results import NULL_SPAN, Box, Span -from ..results.utils import get +from .box import Box from .range import Range +from .span import NULL_SPAN, Span +from .utils import get class CellType(Enum): diff --git a/indico_toolkit/etloutput/etloutput.py b/indico_toolkit/etloutput/etloutput.py index 899dade..66be61b 100644 --- a/indico_toolkit/etloutput/etloutput.py +++ b/indico_toolkit/etloutput/etloutput.py @@ -4,8 +4,9 @@ from operator import attrgetter from typing import TYPE_CHECKING -from ..results import Box, Span +from .box import Box from .errors import TableCellNotFoundError, TokenNotFoundError +from .span import Span from .table import Table from .token import Token diff --git a/indico_toolkit/etloutput/range.py b/indico_toolkit/etloutput/range.py index 7d26431..a0db7e8 100644 --- a/indico_toolkit/etloutput/range.py +++ b/indico_toolkit/etloutput/range.py @@ -1,6 +1,6 @@ from dataclasses import dataclass -from ..results.utils import get +from .utils import get @dataclass(order=True, frozen=True) diff 
--git a/indico_toolkit/results/predictions/span.py b/indico_toolkit/etloutput/span.py similarity index 97% rename from indico_toolkit/results/predictions/span.py rename to indico_toolkit/etloutput/span.py index 8ca2c31..bd33d31 100644 --- a/indico_toolkit/results/predictions/span.py +++ b/indico_toolkit/etloutput/span.py @@ -1,7 +1,7 @@ from dataclasses import dataclass from typing import TYPE_CHECKING -from ..utils import get +from .utils import get if TYPE_CHECKING: from typing import Any, Final diff --git a/indico_toolkit/etloutput/table.py b/indico_toolkit/etloutput/table.py index 807f7ea..2c3dff9 100644 --- a/indico_toolkit/etloutput/table.py +++ b/indico_toolkit/etloutput/table.py @@ -1,9 +1,9 @@ from dataclasses import dataclass from operator import attrgetter -from ..results import Box -from ..results.utils import get +from .box import Box from .cell import Cell +from .utils import get @dataclass(frozen=True) diff --git a/indico_toolkit/etloutput/token.py b/indico_toolkit/etloutput/token.py index 9478d39..f08d4a8 100644 --- a/indico_toolkit/etloutput/token.py +++ b/indico_toolkit/etloutput/token.py @@ -1,7 +1,8 @@ from dataclasses import dataclass -from ..results import Box, Span -from ..results.utils import get +from .box import Box +from .span import Span +from .utils import get @dataclass(frozen=True) diff --git a/indico_toolkit/etloutput/utils.py b/indico_toolkit/etloutput/utils.py new file mode 100644 index 0000000..5a91a2a --- /dev/null +++ b/indico_toolkit/etloutput/utils.py @@ -0,0 +1,72 @@ +import json +from typing import TYPE_CHECKING, TypeVar + +if TYPE_CHECKING: + from typing import Any + +Value = TypeVar("Value") + + +def get(value: object, value_type: "type[Value]", *keys: "str | int") -> Value: + """ + Return the value of type `value_type` obtained by traversing `value` using `keys`. + Raise an error if a key doesn't exist or the value has the wrong type. 
+ """ + for key in keys: + if isinstance(value, dict): + if key in value: + value = value[key] + else: + raise KeyError(f"{key!r} not in {value.keys()!r}") + elif isinstance(value, list): + if isinstance(key, int): + if 0 <= key < len(value): + value = value[key] + else: + raise IndexError(f"{key} out of range [0,{len(value)})") + else: + raise TypeError(f"list can't be indexed with {key!r}") + else: + raise TypeError(f"{type(value)} can't be traversed") + + if isinstance(value, value_type): + return value + else: + raise TypeError(f"value `{value!r}` doesn't have type {value_type}") + + +def has(value: object, value_type: "type[Value]", *keys: "str | int") -> bool: + """ + Check if `value` can be traversed using `keys` to a value of type `value_type`. + """ + for key in keys: + if isinstance(value, dict) and key in value: + value = value[key] + elif isinstance(value, list) and isinstance(key, int) and 0 <= key < len(value): # fmt: skip # noqa: E501 + value = value[key] + else: + return False + + return isinstance(value, value_type) + + +def json_loaded(value: "Any") -> "Any": + """ + Ensure `value` has been loaded as JSON. + """ + value = str_decoded(value) + + if isinstance(value, str): + value = json.loads(value) + + return value + + +def str_decoded(value: str | bytes) -> str: + """ + Ensure `value` has been decoded to a string. 
+ """ + if isinstance(value, bytes): + value = value.decode() + + return value diff --git a/indico_toolkit/results/__init__.py b/indico_toolkit/results/__init__.py index ea9a4e8..660d687 100644 --- a/indico_toolkit/results/__init__.py +++ b/indico_toolkit/results/__init__.py @@ -1,13 +1,11 @@ from typing import TYPE_CHECKING, TypeAlias, TypeVar, overload +from ..etloutput import NULL_BOX, NULL_SPAN, Box, Span from .document import Document from .errors import ResultError from .predictionlist import PredictionList from .predictions import ( - NULL_BOX, NULL_CITATION, - NULL_SPAN, - Box, Classification, DocumentExtraction, Extraction, @@ -15,7 +13,6 @@ FormExtractionType, Group, Prediction, - Span, Summarization, Unbundling, ) diff --git a/indico_toolkit/results/predictions/__init__.py b/indico_toolkit/results/predictions/__init__.py index 5a29151..eda6709 100644 --- a/indico_toolkit/results/predictions/__init__.py +++ b/indico_toolkit/results/predictions/__init__.py @@ -3,7 +3,6 @@ from ..errors import ResultError from ..normalization import normalize_prediction_dict from ..task import TaskType -from .box import NULL_BOX, Box from .citation import NULL_CITATION, Citation from .classification import Classification from .documentextraction import DocumentExtraction @@ -11,7 +10,6 @@ from .formextraction import FormExtraction, FormExtractionType from .group import Group from .prediction import Prediction -from .span import NULL_SPAN, Span from .summarization import Summarization from .unbundling import Unbundling @@ -21,7 +19,6 @@ from ..task import Task __all__ = ( - "Box", "Citation", "Classification", "DocumentExtraction", @@ -29,11 +26,8 @@ "FormExtraction", "FormExtractionType", "Group", - "NULL_BOX", "NULL_CITATION", - "NULL_SPAN", "Prediction", - "Span", "Summarization", "Unbundling", ) diff --git a/indico_toolkit/results/predictions/citation.py b/indico_toolkit/results/predictions/citation.py index 177cf60..9cd42b0 100644 --- 
a/indico_toolkit/results/predictions/citation.py +++ b/indico_toolkit/results/predictions/citation.py @@ -1,8 +1,8 @@ from dataclasses import dataclass from typing import TYPE_CHECKING +from ...etloutput import NULL_SPAN, Span from ..utils import get -from .span import NULL_SPAN, Span if TYPE_CHECKING: from typing import Any, Final diff --git a/indico_toolkit/results/predictions/documentextraction.py b/indico_toolkit/results/predictions/documentextraction.py index 5a112d8..0b4d611 100644 --- a/indico_toolkit/results/predictions/documentextraction.py +++ b/indico_toolkit/results/predictions/documentextraction.py @@ -1,11 +1,11 @@ from dataclasses import dataclass from typing import TYPE_CHECKING +from ...etloutput import NULL_SPAN, Span from ..review import Review from ..utils import get, has, omit from .extraction import Extraction from .group import Group -from .span import NULL_SPAN, Span if TYPE_CHECKING: from typing import Any diff --git a/indico_toolkit/results/predictions/formextraction.py b/indico_toolkit/results/predictions/formextraction.py index 15a01a9..7e90535 100644 --- a/indico_toolkit/results/predictions/formextraction.py +++ b/indico_toolkit/results/predictions/formextraction.py @@ -2,9 +2,9 @@ from enum import Enum from typing import TYPE_CHECKING +from ...etloutput import Box from ..review import Review from ..utils import get, has, omit -from .box import Box from .extraction import Extraction if TYPE_CHECKING: diff --git a/indico_toolkit/results/predictions/summarization.py b/indico_toolkit/results/predictions/summarization.py index 57fe56a..bed82b4 100644 --- a/indico_toolkit/results/predictions/summarization.py +++ b/indico_toolkit/results/predictions/summarization.py @@ -9,9 +9,9 @@ if TYPE_CHECKING: from typing import Any + from ...etloutput import Span from ..document import Document from ..task import Task - from .span import Span @dataclass diff --git a/indico_toolkit/results/predictions/unbundling.py 
b/indico_toolkit/results/predictions/unbundling.py index 4e15042..cdca292 100644 --- a/indico_toolkit/results/predictions/unbundling.py +++ b/indico_toolkit/results/predictions/unbundling.py @@ -1,10 +1,10 @@ from dataclasses import dataclass from typing import TYPE_CHECKING +from ...etloutput import Span from ..review import Review from ..utils import get, omit from .prediction import Prediction -from .span import Span if TYPE_CHECKING: from typing import Any diff --git a/indico_toolkit/results/utils.py b/indico_toolkit/results/utils.py index 9c8e5f0..ae6b41c 100644 --- a/indico_toolkit/results/utils.py +++ b/indico_toolkit/results/utils.py @@ -1,66 +1,19 @@ -import json from collections.abc import Iterable, Iterator -from typing import TYPE_CHECKING, TypeVar +from typing import TYPE_CHECKING -if TYPE_CHECKING: - from typing import Any, Callable - -Value = TypeVar("Value") - - -def get(result: object, value_type: "type[Value]", *keys: "str | int") -> Value: - """ - Return the value of type `value_type` obtained by traversing `result` using `keys`. - Raise an error if a key doesn't exist or the value has the wrong type. - """ - for key in keys: - if isinstance(result, dict): - if key in result: - result = result[key] - else: - raise KeyError(f"{key!r} not in {result.keys()!r}") - elif isinstance(result, list): - if isinstance(key, int): - if 0 <= key < len(result): - result = result[key] - else: - raise IndexError(f"{key} out of range [0,{len(result)})") - else: - raise TypeError(f"list can't be indexed with {key!r}") - else: - raise TypeError(f"{type(result)} can't be traversed") - - if isinstance(result, value_type): - return result - else: - raise TypeError(f"value `{result!r}` doesn't have type {value_type}") - - -def has(result: object, value_type: "type[Value]", *keys: "str | int") -> bool: - """ - Check if `result` can be traversed using `keys` to a value of type `value_type`. 
- """ - for key in keys: - if isinstance(result, dict) and key in result: - result = result[key] - elif isinstance(result, list) and isinstance(key, int) and 0 <= key < len(result): # fmt: skip # noqa: E501 - result = result[key] - else: - return False +from ..etloutput.utils import Value, get, has, json_loaded, str_decoded - return isinstance(result, value_type) - - -def json_loaded(value: "Any") -> "Any": - """ - Ensure `value` has been loaded as JSON. - """ - value = str_decoded(value) - - if isinstance(value, str): - value = json.loads(value) +if TYPE_CHECKING: + from typing import Callable - return value +__all__ = ( + "get", + "has", + "json_loaded", + "nfilter", + "omit", + "str_decoded", +) def nfilter( @@ -89,13 +42,3 @@ def omit(dictionary: object, *keys: str) -> "dict[str, Value]": for key, value in dictionary.items() if key not in keys } # fmt: skip - - -def str_decoded(value: str | bytes) -> str: - """ - Ensure `value` has been decoded to a string. - """ - if isinstance(value, bytes): - value = value.decode() - - return value diff --git a/tests/etloutput/test_rowspan_colspan.py b/tests/etloutput/test_rowspan_colspan.py index b21409c..2b1715e 100644 --- a/tests/etloutput/test_rowspan_colspan.py +++ b/tests/etloutput/test_rowspan_colspan.py @@ -3,8 +3,7 @@ import pytest from indico_toolkit import etloutput -from indico_toolkit.etloutput import EtlOutput, Table -from indico_toolkit.results import Span +from indico_toolkit.etloutput import EtlOutput, Span, Table data_folder = Path(__file__).parent.parent / "data" / "etloutput" etl_output_file = data_folder / "4725" / "112731" / "112257" / "etl_output_rs_cs.json" diff --git a/tests/etloutput/test_utils.py b/tests/etloutput/test_utils.py new file mode 100644 index 0000000..4da4d0d --- /dev/null +++ b/tests/etloutput/test_utils.py @@ -0,0 +1,54 @@ +import pytest + +from indico_toolkit.etloutput.utils import get, has + + +@pytest.fixture +def cell() -> "dict[str, object]": + return { + "cell_type": "header", + 
"columns": [0], + "rows": [0], + "doc_offsets": [{"start": 285, "end": 289}], + "position": {"bottom": 1209, "left": 150, "right": 848, "top": 1107}, + "text": "Item", + } + + +def test_get_has(cell: "dict[str, object]") -> None: + assert has(cell, str, "text") + assert get(cell, str, "text") == "Item" + + assert has(cell, dict, "position") + assert has(cell, int, "position", "top") + assert get(cell, int, "position", "top") == 1107 + + assert has(cell, list, "doc_offsets") + assert has(cell, int, "doc_offsets", 0, "start") + assert get(cell, int, "doc_offsets", 0, "start") == 285 + + +def test_get_has_not(cell: object) -> None: + assert not has(cell, str, "missing") + with pytest.raises(KeyError): + get(cell, str, "missing") + + assert not has(cell, int, "text") + with pytest.raises(TypeError): + get(cell, int, "text") + + assert not has(cell, float, "position", "top", 0) + with pytest.raises(TypeError): + get(cell, float, "position", "top", 0) + + assert not has(cell, int, "doc_offsets", "0", "start") + with pytest.raises(TypeError): + get(cell, int, "doc_offsets", "0", "start") + + assert not has(cell, int, "doc_offsets", -1, "start") + with pytest.raises(IndexError): + get(cell, int, "doc_offsets", -1, "start") + + assert not has(cell, int, "doc_offsets", -1, "start") + with pytest.raises(IndexError): + get(cell, int, "doc_offsets", 1, "start") From d9a5363ca37d70da386b63a9555c3dee224d65ff Mon Sep 17 00:00:00 2001 From: Michael Welborn Date: Thu, 9 Oct 2025 23:10:48 -0500 Subject: [PATCH 02/19] Ensure null spans raise `TokenNotFoundError` instead of `ValueError` --- indico_toolkit/etloutput/etloutput.py | 3 ++- tests/etloutput/test_token_table_cell.py | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/indico_toolkit/etloutput/etloutput.py b/indico_toolkit/etloutput/etloutput.py index 66be61b..0bb573b 100644 --- a/indico_toolkit/etloutput/etloutput.py +++ b/indico_toolkit/etloutput/etloutput.py @@ -65,7 +65,8 @@ def token_for(self, span: 
Span) -> Token: first = bisect_right(tokens, span.start, key=attrgetter("span.end")) last = bisect_left(tokens, span.end, lo=first, key=attrgetter("span.start")) tokens = tokens[first:last] - except (IndexError, ValueError) as error: + assert tokens + except (AssertionError, IndexError, ValueError) as error: raise TokenNotFoundError(f"no token contains {span!r}") from error return Token( diff --git a/tests/etloutput/test_token_table_cell.py b/tests/etloutput/test_token_table_cell.py index b994d1a..14c8f2e 100644 --- a/tests/etloutput/test_token_table_cell.py +++ b/tests/etloutput/test_token_table_cell.py @@ -62,6 +62,11 @@ def test_token_not_found(etl_output: EtlOutput, header_span: Span) -> None: etl_output.token_for(replace(header_span, page=3)) +def test_null_span_not_found(etl_output: EtlOutput) -> None: + with pytest.raises(TokenNotFoundError): + etl_output.token_for(NULL_SPAN) + + def test_table_cell( etl_output: EtlOutput, header_span: Span, content_span: Span ) -> None: From f586553744968ccf39727caa2443519a1810ab12 Mon Sep 17 00:00:00 2001 From: Michael Welborn Date: Thu, 9 Oct 2025 23:13:58 -0500 Subject: [PATCH 03/19] Parse table spans --- indico_toolkit/etloutput/table.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/indico_toolkit/etloutput/table.py b/indico_toolkit/etloutput/table.py index 2c3dff9..fab0f7a 100644 --- a/indico_toolkit/etloutput/table.py +++ b/indico_toolkit/etloutput/table.py @@ -3,16 +3,25 @@ from .box import Box from .cell import Cell +from .span import NULL_SPAN, Span from .utils import get @dataclass(frozen=True) class Table: box: Box + spans: "tuple[Span, ...]" cells: "tuple[Cell, ...]" rows: "tuple[tuple[Cell, ...], ...]" columns: "tuple[tuple[Cell, ...], ...]" + @property + def span(self) -> Span: + """ + Return the first `Span` the table covers or `NULL_SPAN` otherwise. 
+ """ + return self.spans[0] if self.spans else NULL_SPAN + @staticmethod def from_dict(table: object) -> "Table": """ @@ -50,8 +59,12 @@ def from_dict(table: object) -> "Table": for column in range(column_count) ) # fmt: skip + for doc_offset in get(table, list, "doc_offsets"): + doc_offset["page_num"] = page + return Table( box=Box.from_dict(get(table, dict, "position")), + spans=tuple(map(Span.from_dict, get(table, list, "doc_offsets"))), cells=cells, rows=rows, columns=columns, From c2079962477d436941c90f8ed553361e3d9620d1 Mon Sep 17 00:00:00 2001 From: Michael Welborn Date: Thu, 9 Oct 2025 23:22:49 -0500 Subject: [PATCH 04/19] Add `NULL_CELL`, `NULL_RANGE`, `NULL_TABLE`, and `NULL_TOKEN` --- indico_toolkit/etloutput/__init__.py | 12 ++++++++---- indico_toolkit/etloutput/cell.py | 24 ++++++++++++++++++++++-- indico_toolkit/etloutput/range.py | 21 +++++++++++++++++++++ indico_toolkit/etloutput/table.py | 22 +++++++++++++++++++++- indico_toolkit/etloutput/token.py | 22 ++++++++++++++++++++-- 5 files changed, 92 insertions(+), 9 deletions(-) diff --git a/indico_toolkit/etloutput/__init__.py b/indico_toolkit/etloutput/__init__.py index 0fc3f31..3bfa3b4 100644 --- a/indico_toolkit/etloutput/__init__.py +++ b/indico_toolkit/etloutput/__init__.py @@ -1,13 +1,13 @@ from typing import TYPE_CHECKING, TypeAlias, TypeVar from .box import NULL_BOX, Box -from .cell import Cell, CellType +from .cell import NULL_CELL, Cell, CellType from .errors import EtlOutputError, TableCellNotFoundError, TokenNotFoundError from .etloutput import EtlOutput -from .range import Range +from .range import NULL_RANGE, Range from .span import NULL_SPAN, Span -from .table import Table -from .token import Token +from .table import NULL_TABLE, Table +from .token import NULL_TOKEN, Token from .utils import get, has, json_loaded, str_decoded if TYPE_CHECKING: @@ -22,7 +22,11 @@ "load", "load_async", "NULL_BOX", + "NULL_CELL", + "NULL_RANGE", "NULL_SPAN", + "NULL_TABLE", + "NULL_TOKEN", "Range", 
"Span", "Table", diff --git a/indico_toolkit/etloutput/cell.py b/indico_toolkit/etloutput/cell.py index d60ec1c..d05c14d 100644 --- a/indico_toolkit/etloutput/cell.py +++ b/indico_toolkit/etloutput/cell.py @@ -1,11 +1,15 @@ from dataclasses import dataclass from enum import Enum +from typing import TYPE_CHECKING -from .box import Box -from .range import Range +from .box import NULL_BOX, Box +from .range import NULL_RANGE, Range from .span import NULL_SPAN, Span from .utils import get +if TYPE_CHECKING: + from typing import Final + class CellType(Enum): HEADER = "header" @@ -20,6 +24,9 @@ class Cell: range: Range spans: "tuple[Span, ...]" + def __bool__(self) -> bool: + return self != NULL_CELL + @property def span(self) -> Span: """ @@ -46,3 +53,16 @@ def from_dict(cell: object, page: int) -> "Cell": range=Range.from_dict(cell), spans=tuple(map(Span.from_dict, get(cell, list, "doc_offsets"))), ) + + +# It's more ergonomic to represent the lack of cells with a special null cell object +# rather than using `None` or raising an error. This lets you e.g. sort by the `cell` +# attribute without having to constantly check for `None`, while still allowing you do +# a "None check" with `bool(extraction.cell)` or `extraction.cell == NULL_CELL`. 
+NULL_CELL: "Final" = Cell( + type=CellType.CONTENT, + text="", + box=NULL_BOX, + range=NULL_RANGE, + spans=tuple(), +) diff --git a/indico_toolkit/etloutput/range.py b/indico_toolkit/etloutput/range.py index a0db7e8..6933dc4 100644 --- a/indico_toolkit/etloutput/range.py +++ b/indico_toolkit/etloutput/range.py @@ -1,7 +1,11 @@ from dataclasses import dataclass +from typing import TYPE_CHECKING from .utils import get +if TYPE_CHECKING: + from typing import Final + @dataclass(order=True, frozen=True) class Range: @@ -12,6 +16,9 @@ class Range: rows: "tuple[int, ...]" columns: "tuple[int, ...]" + def __bool__(self) -> bool: + return self != NULL_RANGE + @staticmethod def from_dict(cell: object) -> "Range": """ @@ -28,3 +35,17 @@ def from_dict(cell: object) -> "Range": rows=tuple(rows), columns=tuple(columns), ) + + +# It's more ergonomic to represent the lack of ranges with a special null range object +# rather than using `None` or raising an error. This lets you e.g. sort by the `range` +# attribute without having to constantly check for `None`, while still allowing you do +# a "None check" with `bool(cell.range)` or `cell.range == NULL_RANGE`. 
+NULL_RANGE: "Final" = Range( + row=0, + column=0, + rowspan=0, + columnspan=0, + rows=tuple(), + columns=tuple(), +) diff --git a/indico_toolkit/etloutput/table.py b/indico_toolkit/etloutput/table.py index fab0f7a..15d419c 100644 --- a/indico_toolkit/etloutput/table.py +++ b/indico_toolkit/etloutput/table.py @@ -1,11 +1,15 @@ from dataclasses import dataclass from operator import attrgetter +from typing import TYPE_CHECKING -from .box import Box +from .box import NULL_BOX, Box from .cell import Cell from .span import NULL_SPAN, Span from .utils import get +if TYPE_CHECKING: + from typing import Final + @dataclass(frozen=True) class Table: @@ -15,6 +19,9 @@ class Table: rows: "tuple[tuple[Cell, ...], ...]" columns: "tuple[tuple[Cell, ...], ...]" + def __bool__(self) -> bool: + return self != NULL_TABLE + @property def span(self) -> Span: """ @@ -69,3 +76,16 @@ def from_dict(table: object) -> "Table": rows=rows, columns=columns, ) + + +# It's more ergonomic to represent the lack of tables with a special null table object +# rather than using `None` or raising an error. This lets you e.g. group by the `table` +# attribute without having to constantly check for `None`, while still allowing you do +# a "None check" with `bool(extraction.table)` or `extraction.table == NULL_TABLE`. 
+NULL_TABLE: "Final" = Table( + box=NULL_BOX, + spans=tuple(), + cells=tuple(), + rows=tuple(), + columns=tuple(), +) diff --git a/indico_toolkit/etloutput/token.py b/indico_toolkit/etloutput/token.py index f08d4a8..2c88002 100644 --- a/indico_toolkit/etloutput/token.py +++ b/indico_toolkit/etloutput/token.py @@ -1,9 +1,13 @@ from dataclasses import dataclass +from typing import TYPE_CHECKING -from .box import Box -from .span import Span +from .box import NULL_BOX, Box +from .span import NULL_SPAN, Span from .utils import get +if TYPE_CHECKING: + from typing import Final + @dataclass(frozen=True) class Token: @@ -11,6 +15,9 @@ class Token: box: Box span: Span + def __bool__(self) -> bool: + return self != NULL_TOKEN + @staticmethod def from_dict(token: object) -> "Token": """ @@ -24,3 +31,14 @@ def from_dict(token: object) -> "Token": box=Box.from_dict(get(token, dict, "position")), span=Span.from_dict(get(token, dict, "doc_offset")), ) + + +# It's more ergonomic to represent the lack of tokens with a special null token object +# rather than using `None` or raising an error. This lets you e.g. sort by the `token` +# attribute without having to constantly check for `None`, while still allowing you do +# a "None check" with `bool(extraction.token)` or `extraction.token == NULL_TOKEN`. 
+NULL_TOKEN: "Final" = Token( + text="", + box=NULL_BOX, + span=NULL_SPAN, +) From 8fd584558aab285a4e39505decb84f19e21f03f0 Mon Sep 17 00:00:00 2001 From: Michael Welborn Date: Fri, 10 Oct 2025 10:50:21 -0500 Subject: [PATCH 05/19] Add `DocumentExtraction` properties for OCR tokens, tables, and cells --- .../results/predictions/documentextraction.py | 91 ++++++++++++++++++- 1 file changed, 89 insertions(+), 2 deletions(-) diff --git a/indico_toolkit/results/predictions/documentextraction.py b/indico_toolkit/results/predictions/documentextraction.py index 0b4d611..284b49a 100644 --- a/indico_toolkit/results/predictions/documentextraction.py +++ b/indico_toolkit/results/predictions/documentextraction.py @@ -1,13 +1,23 @@ -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import TYPE_CHECKING -from ...etloutput import NULL_SPAN, Span +from ...etloutput import ( + NULL_CELL, + NULL_SPAN, + NULL_TABLE, + NULL_TOKEN, + Cell, + Span, + Table, + Token, +) from ..review import Review from ..utils import get, has, omit from .extraction import Extraction from .group import Group if TYPE_CHECKING: + from collections.abc import Iterable, Iterator from typing import Any from ..document import Document @@ -19,6 +29,10 @@ class DocumentExtraction(Extraction): groups: "set[Group]" spans: "list[Span]" + tokens: "list[Token]" = field(default_factory=list) + tables: "list[Table]" = field(default_factory=list) + cells: "list[Cell]" = field(default_factory=list) + @property def span(self) -> Span: """ @@ -39,6 +53,79 @@ def span(self, span: Span) -> None: """ self.spans = [span] if span else [] + @property + def token(self) -> Token: + """ + Return the first `Token` the document extraction covers + or `NULL_TOKEN` if it doesn't cover a token or OCR hasn't been assigned yet. 
+ """ + return self.tokens[0] if self.tokens else NULL_TOKEN + + @token.setter + def token(self, token: Token) -> None: + """ + Overwrite all tokens with the one provided, handling `NULL_TOKEN`. + + This is assumes if you're setting a single token you want it to be the only one. + Multiple-token sensitive contexts should work with `extraction.tokens` instead. + """ + self.tokens = [token] if token else [] + + @property + def table(self) -> Table: + """ + Return the first `Table` the document extraction is in + or `NULL_TABLE` if it's not in a table or OCR hasn't been assigned yet. + """ + return self.tables[0] if self.tables else NULL_TABLE + + @table.setter + def table(self, table: Table) -> None: + """ + Overwrite all tables with the one provided, handling `NULL_TABLE`. + + This is assumes if you're setting a single table you want it to be the only one. + Multiple-table sensitive contexts should work with `extraction.tables` instead. + """ + self.tables = [table] if table else [] + + @property + def cell(self) -> Cell: + """ + Return the first `Cell` the document extraction is in + or `NULL_CELL` if it's not in a cell or OCR hasn't been assigned yet. + """ + return self.cells[0] if self.cells else NULL_CELL + + @cell.setter + def cell(self, cell: Cell) -> None: + """ + Overwrite all cells with the one provided, handling `NULL_CELL`. + + This is assumes if you're setting a single cell you want it to be the only one. + Multiple-cell sensitive contexts should work with `extraction.cells` instead. + """ + self.cells = [cell] if cell else [] + + @property + def table_cells(self) -> "Iterator[tuple[Table, Cell]]": + """ + Yield the table cells the document extraction is in. + """ + yield from zip(self.tables, self.cells) + + @table_cells.setter + def table_cells(self, table_cells: "Iterable[tuple[Table, Cell]]") -> None: + """ + Set the tables cells the document extraction is in. 
+ """ + self.tables = [] + self.cells = [] + + for table, cell in table_cells: + self.tables.append(table) + self.cells.append(cell) + @staticmethod def from_dict( document: "Document", From f44b8f18c78fc02e360616a94b02468450e944b1 Mon Sep 17 00:00:00 2001 From: Michael Welborn Date: Fri, 10 Oct 2025 11:06:41 -0500 Subject: [PATCH 06/19] Add set-like overlap syntax for `Box` and `Span` --- indico_toolkit/etloutput/box.py | 24 ++++++++++++++++++++++++ indico_toolkit/etloutput/span.py | 20 ++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/indico_toolkit/etloutput/box.py b/indico_toolkit/etloutput/box.py index 1269213..763be39 100644 --- a/indico_toolkit/etloutput/box.py +++ b/indico_toolkit/etloutput/box.py @@ -18,6 +18,30 @@ class Box: def __bool__(self) -> bool: return self != NULL_BOX + def __and__(self, other: "Box") -> "Box": + """ + Return a new `Box` for the overlap between `self` and `other` + or `NULL_BOX` if they don't overlap. + + Supports set-like `extraction.box & cell.box` syntax. + """ + if ( + self.page != other.page + or self.bottom <= other.top # `self` is above `other` + or self.top >= other.bottom # `self` is below `other` + or self.right <= other.left # `self` is to the left of `other` + or self.left >= other.right # `self` is to the right of `other` + ): + return NULL_BOX + else: + return Box( + page=self.page, + top=max(self.top, other.top), + left=max(self.left, other.left), + right=min(self.right, other.right), + bottom=min(self.bottom, other.bottom), + ) + def __lt__(self, other: "Box") -> bool: """ Bounding boxes are sorted with vertical hysteresis. 
Those on the same line are diff --git a/indico_toolkit/etloutput/span.py b/indico_toolkit/etloutput/span.py index bd33d31..26939f5 100644 --- a/indico_toolkit/etloutput/span.py +++ b/indico_toolkit/etloutput/span.py @@ -20,6 +20,26 @@ def slice(self) -> slice: def __bool__(self) -> bool: return self != NULL_SPAN + def __and__(self, other: "Span") -> "Span": + """ + Return a new `Span` for the overlap between `self` and `other` + or `NULL_SPAN` if they don't overlap. + + Supports set-like `extraction.span & cell.span` syntax. + """ + if ( + self.page != other.page + or self.end <= other.start # `self` is to the left of `other` + or self.start >= other.end # `self` is to the right of `other` + ): + return NULL_SPAN + else: + return Span( + page=self.page, + start=max(self.start, other.start), + end=min(self.end, other.end), + ) + @staticmethod def from_dict(span: object) -> "Span": return Span( From 95301f5d08b96f93c2ae7a1c531d0ad846ad98a5 Mon Sep 17 00:00:00 2001 From: Michael Welborn Date: Fri, 10 Oct 2025 11:17:09 -0500 Subject: [PATCH 07/19] Rewrite table cell lookup to support multiple cells using a naive span overlap algorithm --- indico_toolkit/etloutput/etloutput.py | 35 +++++----------- .../4725/112731/112257/tables_0.json | 2 +- tests/etloutput/test_rowspan_colspan.py | 3 +- tests/etloutput/test_token_table_cell.py | 42 +++++++++++++++---- 4 files changed, 45 insertions(+), 37 deletions(-) diff --git a/indico_toolkit/etloutput/etloutput.py b/indico_toolkit/etloutput/etloutput.py index 0bb573b..e426659 100644 --- a/indico_toolkit/etloutput/etloutput.py +++ b/indico_toolkit/etloutput/etloutput.py @@ -5,13 +5,13 @@ from typing import TYPE_CHECKING from .box import Box -from .errors import TableCellNotFoundError, TokenNotFoundError +from .errors import TokenNotFoundError from .span import Span from .table import Table from .token import Token if TYPE_CHECKING: - from collections.abc import Iterable + from collections.abc import Iterable, Iterator from .cell 
import Cell @@ -81,28 +81,13 @@ def token_for(self, span: Span) -> Token: span=span, ) - def table_cell_for(self, token: Token) -> "tuple[Table, Cell]": + def table_cells_for(self, span: Span) -> "Iterator[tuple[Table, Cell]]": """ - Return the `Table` and `Cell` that contain the midpoint of `token`. - Raise `TableCellNotFoundError` if it's not inside a table cell. + Yield the table cells that overlap with `span`. """ - token_vmid = (token.box.top + token.box.bottom) // 2 - token_hmid = (token.box.left + token.box.right) // 2 - - for table in self.tables_on_page[token.box.page]: - if ( - (table.box.top <= token_vmid <= table.box.bottom) and - (table.box.left <= token_hmid <= table.box.right) - ): # fmt: skip - break - else: - raise TableCellNotFoundError(f"no table contains {token!r}") - - for cell in table.cells: - if ( - (cell.box.top <= token_vmid <= cell.box.bottom) and - (cell.box.left <= token_hmid <= cell.box.right) - ): # fmt: skip - return table, cell - else: - raise TableCellNotFoundError(f"no cell contains {token!r}") + if 0 <= span.page < len(self.tables_on_page): + for table in self.tables_on_page[span.page]: + if any(span & table_span for table_span in table.spans): + for cell in table.cells: + if any(span & cell_span for cell_span in cell.spans): + yield table, cell diff --git a/tests/data/etloutput/4725/112731/112257/tables_0.json b/tests/data/etloutput/4725/112731/112257/tables_0.json index 8800fc7..014ffc2 100644 --- a/tests/data/etloutput/4725/112731/112257/tables_0.json +++ b/tests/data/etloutput/4725/112731/112257/tables_0.json @@ -1 +1 @@ 
-[{"cells":[{"cell_type":"header","columns":[0],"doc_offsets":[{"end":29,"start":25}],"page_offsets":[{"end":29,"start":25}],"position":{"bottom":562,"left":713,"right":1052,"top":435},"rows":[0],"text":"Alfa"},{"cell_type":"header","columns":[1],"doc_offsets":[{"end":35,"start":30}],"page_offsets":[{"end":35,"start":30}],"position":{"bottom":561,"left":1051,"right":1301,"top":434},"rows":[0],"text":"Bravo"},{"cell_type":"header","columns":[2],"doc_offsets":[{"end":43,"start":36}],"page_offsets":[{"end":43,"start":36}],"position":{"bottom":560,"left":1300,"right":1578,"top":433},"rows":[0],"text":"Charlie"},{"cell_type":"header","columns":[3],"doc_offsets":[{"end":49,"start":44}],"page_offsets":[{"end":49,"start":44}],"position":{"bottom":561,"left":1580,"right":1821,"top":430},"rows":[0],"text":"Delta"},{"cell_type":"content","columns":[0],"doc_offsets":[{"end":54,"start":50}],"page_offsets":[{"end":54,"start":50}],"position":{"bottom":778,"left":712,"right":1052,"top":561},"rows":[1,2],"text":"Echo"},{"cell_type":"content","columns":[1,2],"doc_offsets":[{"end":62,"start":55}],"page_offsets":[{"end":62,"start":55}],"position":{"bottom":670,"left":1052,"right":1580,"top":561},"rows":[1],"text":"Foxtrot"},{"cell_type":"content","columns":[3],"doc_offsets":[{"end":68,"start":64}],"page_offsets":[{"end":68,"start":64}],"position":{"bottom":669,"left":1580,"right":1821,"top":560},"rows":[1],"text":"Golf"},{"cell_type":"content","columns":[1],"doc_offsets":[{"end":75,"start":70}],"page_offsets":[{"end":75,"start":70}],"position":{"bottom":778,"left":1052,"right":1301,"top":670},"rows":[2],"text":"Hotel"},{"cell_type":"content","columns":[2],"doc_offsets":[{"end":81,"start":76}],"page_offsets":[{"end":81,"start":76}],"position":{"bottom":777,"left":1301,"right":1581,"top":669},"rows":[2],"text":"India"},{"cell_type":"content","columns":[3],"doc_offsets":[{"end":89,"start":82}],"page_offsets":[{"end":89,"start":82}],"position":{"bottom":776,"left":1580,"right":1822,"top":6
68},"rows":[2],"text":"Juliett"},{"cell_type":"content","columns":[0],"doc_offsets":[{"end":94,"start":90}],"page_offsets":[{"end":94,"start":90}],"position":{"bottom":891,"left":712,"right":1053,"top":778},"rows":[3],"text":"Kilo"},{"cell_type":"content","columns":[1,2],"doc_offsets":[],"page_offsets":[],"position":{"bottom":997,"left":1052,"right":1582,"top":776},"rows":[3,4],"text":"Lima"},{"cell_type":"content","columns":[3],"doc_offsets":[{"end":101,"start":97}],"page_offsets":[{"end":101,"start":97}],"position":{"bottom":889,"left":1581,"right":1822,"top":775},"rows":[3],"text":"Mike"},{"cell_type":"content","columns":[0],"doc_offsets":[{"end":110,"start":102}],"page_offsets":[{"end":110,"start":102}],"position":{"bottom":998,"left":713,"right":1053,"top":889},"rows":[4],"text":"November"},{"cell_type":"content","columns":[3],"doc_offsets":[{"end":122,"start":117}],"page_offsets":[{"end":122,"start":117}],"position":{"bottom":995,"left":1581,"right":1824,"top":888},"rows":[4],"text":"Oscar"}],"doc_offsets":[{"end":94,"start":25},{"end":122,"start":97}],"num_columns":4,"num_rows":5,"page_num":0,"page_offsets":[{"end":94,"start":25},{"end":122,"start":97}],"position":{"bottom":998,"left":711,"right":1824,"top":430},"table_id":0,"table_offset":{"column":0,"row":0}}] 
+[{"cells":[{"cell_type":"header","columns":[0],"doc_offsets":[{"end":29,"start":25}],"page_offsets":[{"end":29,"start":25}],"position":{"bottom":562,"left":713,"right":1052,"top":435},"rows":[0],"text":"Alfa"},{"cell_type":"header","columns":[1],"doc_offsets":[{"end":35,"start":30}],"page_offsets":[{"end":35,"start":30}],"position":{"bottom":561,"left":1051,"right":1301,"top":434},"rows":[0],"text":"Bravo"},{"cell_type":"header","columns":[2],"doc_offsets":[{"end":43,"start":36}],"page_offsets":[{"end":43,"start":36}],"position":{"bottom":560,"left":1300,"right":1578,"top":433},"rows":[0],"text":"Charlie"},{"cell_type":"header","columns":[3],"doc_offsets":[{"end":49,"start":44}],"page_offsets":[{"end":49,"start":44}],"position":{"bottom":561,"left":1580,"right":1821,"top":430},"rows":[0],"text":"Delta"},{"cell_type":"content","columns":[0],"doc_offsets":[{"end":54,"start":50}],"page_offsets":[{"end":54,"start":50}],"position":{"bottom":778,"left":712,"right":1052,"top":561},"rows":[1,2],"text":"Echo"},{"cell_type":"content","columns":[1,2],"doc_offsets":[{"end":62,"start":55}],"page_offsets":[{"end":62,"start":55}],"position":{"bottom":670,"left":1052,"right":1580,"top":561},"rows":[1],"text":"Foxtrot"},{"cell_type":"content","columns":[3],"doc_offsets":[{"end":68,"start":64}],"page_offsets":[{"end":68,"start":64}],"position":{"bottom":669,"left":1580,"right":1821,"top":560},"rows":[1],"text":"Golf"},{"cell_type":"content","columns":[1],"doc_offsets":[{"end":75,"start":70}],"page_offsets":[{"end":75,"start":70}],"position":{"bottom":778,"left":1052,"right":1301,"top":670},"rows":[2],"text":"Hotel"},{"cell_type":"content","columns":[2],"doc_offsets":[{"end":81,"start":76}],"page_offsets":[{"end":81,"start":76}],"position":{"bottom":777,"left":1301,"right":1581,"top":669},"rows":[2],"text":"India"},{"cell_type":"content","columns":[3],"doc_offsets":[{"end":89,"start":82}],"page_offsets":[{"end":89,"start":82}],"position":{"bottom":776,"left":1580,"right":1822,"top":6
68},"rows":[2],"text":"Juliett"},{"cell_type":"content","columns":[0],"doc_offsets":[{"end":94,"start":90}],"page_offsets":[{"end":94,"start":90}],"position":{"bottom":891,"left":712,"right":1053,"top":778},"rows":[3],"text":"Kilo"},{"cell_type":"content","columns":[1,2],"doc_offsets":[{"end":115,"start":111}],"page_offsets":[{"end":115, "start":111}],"position":{"bottom":997,"left":1052,"right":1582,"top":776},"rows":[3,4],"text":"Lima"},{"cell_type":"content","columns":[3],"doc_offsets":[{"end":101,"start":97}],"page_offsets":[{"end":101,"start":97}],"position":{"bottom":889,"left":1581,"right":1822,"top":775},"rows":[3],"text":"Mike"},{"cell_type":"content","columns":[0],"doc_offsets":[{"end":110,"start":102}],"page_offsets":[{"end":110,"start":102}],"position":{"bottom":998,"left":713,"right":1053,"top":889},"rows":[4],"text":"November"},{"cell_type":"content","columns":[3],"doc_offsets":[{"end":122,"start":117}],"page_offsets":[{"end":122,"start":117}],"position":{"bottom":995,"left":1581,"right":1824,"top":888},"rows":[4],"text":"Oscar"}],"doc_offsets":[{"end":94,"start":25},{"end":122,"start":97}],"num_columns":4,"num_rows":5,"page_num":0,"page_offsets":[{"end":94,"start":25},{"end":122,"start":97}],"position":{"bottom":998,"left":711,"right":1824,"top":430},"table_id":0,"table_offset":{"column":0,"row":0}}] diff --git a/tests/etloutput/test_rowspan_colspan.py b/tests/etloutput/test_rowspan_colspan.py index 2b1715e..58bde1c 100644 --- a/tests/etloutput/test_rowspan_colspan.py +++ b/tests/etloutput/test_rowspan_colspan.py @@ -120,6 +120,5 @@ def test_columns(table: Table) -> None: ], ) def test_table_cell_for(etl_output: EtlOutput, span: Span, expected_text: str) -> None: - token = etl_output.token_for(span) - table, cell = etl_output.table_cell_for(token) + (table, cell), *_ = etl_output.table_cells_for(span) assert cell.text == expected_text diff --git a/tests/etloutput/test_token_table_cell.py b/tests/etloutput/test_token_table_cell.py index 
14c8f2e..2fc167c 100644 --- a/tests/etloutput/test_token_table_cell.py +++ b/tests/etloutput/test_token_table_cell.py @@ -6,10 +6,10 @@ from indico_toolkit import etloutput from indico_toolkit.etloutput import ( NULL_SPAN, + NULL_TOKEN, CellType, EtlOutput, Span, - TableCellNotFoundError, TokenNotFoundError, ) @@ -39,6 +39,16 @@ def content_span() -> Span: return Span(page=1, start=1343, end=1349) +@pytest.fixture +def line_item_span() -> Span: + return Span(page=1, start=1311, end=1244) + + +@pytest.fixture +def mulitple_table_span() -> Span: + return Span(page=1, start=1217, end=1299) + + def test_text_slice( etl_output: EtlOutput, header_span: Span, content_span: Span ) -> None: @@ -70,11 +80,8 @@ def test_null_span_not_found(etl_output: EtlOutput) -> None: def test_table_cell( etl_output: EtlOutput, header_span: Span, content_span: Span ) -> None: - header_token = etl_output.token_for(header_span) - content_token = etl_output.token_for(content_span) - - header_table, header_cell = etl_output.table_cell_for(header_token) - content_table, content_cell = etl_output.table_cell_for(content_token) + (header_table, header_cell), *_ = etl_output.table_cells_for(header_span) + (content_table, content_cell), *_ = etl_output.table_cells_for(content_span) assert header_cell.span == header_span assert content_cell.span == content_span @@ -86,10 +93,27 @@ def test_table_cell( assert content_cell.text == "720.00" +def test_table_cells(etl_output: EtlOutput, line_item_span: Span) -> None: + table_cells = etl_output.table_cells_for(line_item_span) + correct_table = etl_output.tables[3] + correct_row = correct_table.rows[1] + correct_cells = correct_row[1:4] + + for (table, cell), correct_cell in zip(table_cells, correct_cells): + assert table == correct_table + assert cell == correct_cell + + +def test_multiple_tables(etl_output: EtlOutput, mulitple_table_span: Span) -> None: + table_cells = etl_output.table_cells_for(mulitple_table_span) + cells = [cell for (table, cell) in 
table_cells] + _correct_cells = etl_output.tables[2].rows[-1] + etl_output.tables[3].rows[0] + correct_cells = [cell for cell in _correct_cells if cell.text] + assert cells == correct_cells + + def test_table_cell_not_found(etl_output: EtlOutput) -> None: - with pytest.raises(TableCellNotFoundError): - token = etl_output.token_for(Span(page=0, start=0, end=8)) - etl_output.table_cell_for(token) + assert not list(etl_output.table_cells_for(NULL_SPAN)) def test_empty_cell(etl_output: EtlOutput) -> None: From 85552eac597bf5b6375bb1459e3df3b7cc5a5519 Mon Sep 17 00:00:00 2001 From: Michael Welborn Date: Fri, 10 Oct 2025 11:18:34 -0500 Subject: [PATCH 08/19] Return `NULL_TOKEN` rather than raising an error for failed token lookups --- indico_toolkit/etloutput/etloutput.py | 11 +++++------ tests/etloutput/test_token_table_cell.py | 15 +++------------ 2 files changed, 8 insertions(+), 18 deletions(-) diff --git a/indico_toolkit/etloutput/etloutput.py b/indico_toolkit/etloutput/etloutput.py index e426659..7f42769 100644 --- a/indico_toolkit/etloutput/etloutput.py +++ b/indico_toolkit/etloutput/etloutput.py @@ -5,10 +5,9 @@ from typing import TYPE_CHECKING from .box import Box -from .errors import TokenNotFoundError from .span import Span from .table import Table -from .token import Token +from .token import NULL_TOKEN, Token if TYPE_CHECKING: from collections.abc import Iterable, Iterator @@ -57,8 +56,8 @@ def from_pages( def token_for(self, span: Span) -> Token: """ - Return a `Token` that contains every character from `span`. - Raise `TokenNotFoundError` if one can't be produced. + Return a `Token` that contains every character from `span` + or `NULL_TOKEN` if one doesn't exist. 
""" try: tokens = self.tokens_on_page[span.page] @@ -66,8 +65,8 @@ def token_for(self, span: Span) -> Token: last = bisect_left(tokens, span.end, lo=first, key=attrgetter("span.start")) tokens = tokens[first:last] assert tokens - except (AssertionError, IndexError, ValueError) as error: - raise TokenNotFoundError(f"no token contains {span!r}") from error + except (AssertionError, IndexError, ValueError): + return NULL_TOKEN return Token( text=self.text[span.slice], diff --git a/tests/etloutput/test_token_table_cell.py b/tests/etloutput/test_token_table_cell.py index 2fc167c..61479ab 100644 --- a/tests/etloutput/test_token_table_cell.py +++ b/tests/etloutput/test_token_table_cell.py @@ -4,14 +4,7 @@ import pytest from indico_toolkit import etloutput -from indico_toolkit.etloutput import ( - NULL_SPAN, - NULL_TOKEN, - CellType, - EtlOutput, - Span, - TokenNotFoundError, -) +from indico_toolkit.etloutput import NULL_SPAN, NULL_TOKEN, CellType, EtlOutput, Span data_folder = Path(__file__).parent.parent / "data" / "etloutput" etl_output_file = data_folder / "4725" / "111924" / "110239" / "etl_output.json" @@ -68,13 +61,11 @@ def test_token(etl_output: EtlOutput, header_span: Span, content_span: Span) -> def test_token_not_found(etl_output: EtlOutput, header_span: Span) -> None: - with pytest.raises(TokenNotFoundError): - etl_output.token_for(replace(header_span, page=3)) + assert etl_output.token_for(replace(header_span, page=3)) == NULL_TOKEN def test_null_span_not_found(etl_output: EtlOutput) -> None: - with pytest.raises(TokenNotFoundError): - etl_output.token_for(NULL_SPAN) + assert etl_output.token_for(NULL_SPAN) == NULL_TOKEN def test_table_cell( From b9712effa2bb0ac6406253f549cd8e5bec1fed89 Mon Sep 17 00:00:00 2001 From: Michael Welborn Date: Fri, 10 Oct 2025 11:45:45 -0500 Subject: [PATCH 09/19] Replace custom `ResultError` with idiomatic `ValueError` --- indico_toolkit/results/predictions/__init__.py | 3 +-- indico_toolkit/results/result.py | 3 +-- 
tests/results/test_files.py | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/indico_toolkit/results/predictions/__init__.py b/indico_toolkit/results/predictions/__init__.py index eda6709..dff7d51 100644 --- a/indico_toolkit/results/predictions/__init__.py +++ b/indico_toolkit/results/predictions/__init__.py @@ -1,6 +1,5 @@ from typing import TYPE_CHECKING -from ..errors import ResultError from ..normalization import normalize_prediction_dict from ..task import TaskType from .citation import NULL_CITATION, Citation @@ -55,4 +54,4 @@ def from_dict( elif task.type == TaskType.UNBUNDLING: return Unbundling.from_dict(document, task, review, prediction) else: - raise ResultError(f"unsupported task type {task.type!r}") + raise ValueError(f"unsupported task type {task.type!r}") diff --git a/indico_toolkit/results/result.py b/indico_toolkit/results/result.py index 0508cd9..cb55259 100644 --- a/indico_toolkit/results/result.py +++ b/indico_toolkit/results/result.py @@ -4,7 +4,6 @@ from . 
import predictions as prediction from .document import Document -from .errors import ResultError from .normalization import normalize_result_dict from .predictionlist import PredictionList from .predictions import Prediction @@ -53,7 +52,7 @@ def from_dict(result: object) -> "Result": file_version = get(result, int, "file_version") if file_version != 3: - raise ResultError(f"unsupported file version `{file_version}`") + raise ValueError(f"unsupported result file version `{file_version}`") normalize_result_dict(result) diff --git a/tests/results/test_files.py b/tests/results/test_files.py index 8d1513c..04e3fc5 100644 --- a/tests/results/test_files.py +++ b/tests/results/test_files.py @@ -3,7 +3,6 @@ import pytest from indico_toolkit import results -from indico_toolkit.results import ResultError data_folder = Path(__file__).parent.parent / "data" / "results" @@ -26,5 +25,5 @@ async def path_read_bytes_async(path: Path) -> bytes: def test_usupported_version() -> None: - with pytest.raises(ResultError): + with pytest.raises(ValueError): results.load({"file_version": 1}) From a436ebf23468133dc6eb42b8f6759d61ed483086 Mon Sep 17 00:00:00 2001 From: Michael Welborn Date: Fri, 10 Oct 2025 11:48:04 -0500 Subject: [PATCH 10/19] Remove unused custom ETL Output and Result error classes --- indico_toolkit/etloutput/__init__.py | 4 ---- indico_toolkit/etloutput/errors.py | 16 ---------------- indico_toolkit/results/__init__.py | 2 -- indico_toolkit/results/errors.py | 4 ---- 4 files changed, 26 deletions(-) delete mode 100644 indico_toolkit/etloutput/errors.py delete mode 100644 indico_toolkit/results/errors.py diff --git a/indico_toolkit/etloutput/__init__.py b/indico_toolkit/etloutput/__init__.py index 3bfa3b4..5d2329b 100644 --- a/indico_toolkit/etloutput/__init__.py +++ b/indico_toolkit/etloutput/__init__.py @@ -2,7 +2,6 @@ from .box import NULL_BOX, Box from .cell import NULL_CELL, Cell, CellType -from .errors import EtlOutputError, TableCellNotFoundError, 
TokenNotFoundError from .etloutput import EtlOutput from .range import NULL_RANGE, Range from .span import NULL_SPAN, Span @@ -18,7 +17,6 @@ "Cell", "CellType", "EtlOutput", - "EtlOutputError", "load", "load_async", "NULL_BOX", @@ -30,9 +28,7 @@ "Range", "Span", "Table", - "TableCellNotFoundError", "Token", - "TokenNotFoundError", ) Loadable: TypeAlias = "dict[str, object] | list[object] | str | bytes" diff --git a/indico_toolkit/etloutput/errors.py b/indico_toolkit/etloutput/errors.py deleted file mode 100644 index d468803..0000000 --- a/indico_toolkit/etloutput/errors.py +++ /dev/null @@ -1,16 +0,0 @@ -class EtlOutputError(Exception): - """ - Raised when an error occurs accessing `EtlOutput` values. - """ - - -class TokenNotFoundError(EtlOutputError): - """ - Raised when a `Token` can't be found for a `Span`. - """ - - -class TableCellNotFoundError(EtlOutputError): - """ - Raised when a `Table` and `Cell` can't be found for a `Token`. - """ diff --git a/indico_toolkit/results/__init__.py b/indico_toolkit/results/__init__.py index 660d687..53b9811 100644 --- a/indico_toolkit/results/__init__.py +++ b/indico_toolkit/results/__init__.py @@ -2,7 +2,6 @@ from ..etloutput import NULL_BOX, NULL_SPAN, Box, Span from .document import Document -from .errors import ResultError from .predictionlist import PredictionList from .predictions import ( NULL_CITATION, @@ -42,7 +41,6 @@ "Prediction", "PredictionList", "Result", - "ResultError", "Review", "ReviewType", "Span", diff --git a/indico_toolkit/results/errors.py b/indico_toolkit/results/errors.py deleted file mode 100644 index abebb54..0000000 --- a/indico_toolkit/results/errors.py +++ /dev/null @@ -1,4 +0,0 @@ -class ResultError(Exception): - """ - Raised when an error occurs while loading or dumping a result file. 
- """ From 6ebe77e1a0a52b7ede9f24105d757413dec14719 Mon Sep 17 00:00:00 2001 From: Michael Welborn Date: Fri, 10 Oct 2025 11:52:48 -0500 Subject: [PATCH 11/19] Clean up `TYPE_CHECKING` imports --- indico_toolkit/etloutput/box.py | 7 ++----- indico_toolkit/etloutput/cell.py | 7 ++----- indico_toolkit/etloutput/etloutput.py | 6 +++--- indico_toolkit/etloutput/range.py | 7 ++----- indico_toolkit/etloutput/span.py | 7 ++----- indico_toolkit/etloutput/table.py | 7 ++----- indico_toolkit/etloutput/token.py | 7 ++----- indico_toolkit/etloutput/utils.py | 5 +---- indico_toolkit/results/normalization.py | 5 +---- indico_toolkit/results/predictionlist.py | 8 +++----- indico_toolkit/results/predictions/citation.py | 7 ++----- indico_toolkit/results/predictions/classification.py | 6 ++---- indico_toolkit/results/predictions/documentextraction.py | 5 ++--- indico_toolkit/results/predictions/formextraction.py | 6 ++---- indico_toolkit/results/predictions/group.py | 5 +---- indico_toolkit/results/predictions/prediction.py | 7 ++----- indico_toolkit/results/predictions/summarization.py | 6 ++---- indico_toolkit/results/predictions/unbundling.py | 6 ++---- indico_toolkit/results/utils.py | 5 +---- 19 files changed, 36 insertions(+), 83 deletions(-) diff --git a/indico_toolkit/etloutput/box.py b/indico_toolkit/etloutput/box.py index 763be39..8c01adf 100644 --- a/indico_toolkit/etloutput/box.py +++ b/indico_toolkit/etloutput/box.py @@ -1,11 +1,8 @@ from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import Final from .utils import get -if TYPE_CHECKING: - from typing import Final - @dataclass(frozen=True) class Box: @@ -82,4 +79,4 @@ def from_dict(box: object) -> "Box": # object rather than using `None` or raising an error. This lets you e.g. sort by the # `box` attribute without having to constantly check for `None`, while still allowing # you do a "None check" with `bool(extraction.box)` or `extraction.box == NULL_BOX`. 
-NULL_BOX: "Final" = Box(page=0, top=0, left=0, right=0, bottom=0) +NULL_BOX: Final = Box(page=0, top=0, left=0, right=0, bottom=0) diff --git a/indico_toolkit/etloutput/cell.py b/indico_toolkit/etloutput/cell.py index d05c14d..03c032b 100644 --- a/indico_toolkit/etloutput/cell.py +++ b/indico_toolkit/etloutput/cell.py @@ -1,15 +1,12 @@ from dataclasses import dataclass from enum import Enum -from typing import TYPE_CHECKING +from typing import Final from .box import NULL_BOX, Box from .range import NULL_RANGE, Range from .span import NULL_SPAN, Span from .utils import get -if TYPE_CHECKING: - from typing import Final - class CellType(Enum): HEADER = "header" @@ -59,7 +56,7 @@ def from_dict(cell: object, page: int) -> "Cell": # rather than using `None` or raising an error. This lets you e.g. sort by the `cell` # attribute without having to constantly check for `None`, while still allowing you do # a "None check" with `bool(extraction.cell)` or `extraction.cell == NULL_CELL`. -NULL_CELL: "Final" = Cell( +NULL_CELL: Final = Cell( type=CellType.CONTENT, text="", box=NULL_BOX, diff --git a/indico_toolkit/etloutput/etloutput.py b/indico_toolkit/etloutput/etloutput.py index 7f42769..3e89e9b 100644 --- a/indico_toolkit/etloutput/etloutput.py +++ b/indico_toolkit/etloutput/etloutput.py @@ -5,7 +5,6 @@ from typing import TYPE_CHECKING from .box import Box -from .span import Span from .table import Table from .token import NULL_TOKEN, Token @@ -13,6 +12,7 @@ from collections.abc import Iterable, Iterator from .cell import Cell + from .span import Span @dataclass(frozen=True) @@ -54,7 +54,7 @@ def from_pages( tables_on_page=table_pages, ) - def token_for(self, span: Span) -> Token: + def token_for(self, span: "Span") -> Token: """ Return a `Token` that contains every character from `span` or `NULL_TOKEN` if one doesn't exist. 
@@ -80,7 +80,7 @@ def token_for(self, span: Span) -> Token: span=span, ) - def table_cells_for(self, span: Span) -> "Iterator[tuple[Table, Cell]]": + def table_cells_for(self, span: "Span") -> "Iterator[tuple[Table, Cell]]": """ Yield the table cells that overlap with `span`. """ diff --git a/indico_toolkit/etloutput/range.py b/indico_toolkit/etloutput/range.py index 6933dc4..0d0f8ef 100644 --- a/indico_toolkit/etloutput/range.py +++ b/indico_toolkit/etloutput/range.py @@ -1,11 +1,8 @@ from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import Final from .utils import get -if TYPE_CHECKING: - from typing import Final - @dataclass(order=True, frozen=True) class Range: @@ -41,7 +38,7 @@ def from_dict(cell: object) -> "Range": # rather than using `None` or raising an error. This lets you e.g. sort by the `range` # attribute without having to constantly check for `None`, while still allowing you do # a "None check" with `bool(cell.range)` or `cell.range == NULL_RANGE`. -NULL_RANGE: "Final" = Range( +NULL_RANGE: Final = Range( row=0, column=0, rowspan=0, diff --git a/indico_toolkit/etloutput/span.py b/indico_toolkit/etloutput/span.py index 26939f5..4e3d250 100644 --- a/indico_toolkit/etloutput/span.py +++ b/indico_toolkit/etloutput/span.py @@ -1,11 +1,8 @@ from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import Any, Final from .utils import get -if TYPE_CHECKING: - from typing import Any, Final - @dataclass(order=True, frozen=True) class Span: @@ -60,4 +57,4 @@ def to_dict(self) -> "dict[str, Any]": # rather than using `None` or raising an error. This lets you e.g. sort by the `span` # attribute without having to constantly check for `None`, while still allowing you do # a "None check" with `bool(extraction.span)` or `extraction.span == NULL_SPAN`. 
-NULL_SPAN: "Final" = Span(page=0, start=0, end=0) +NULL_SPAN: Final = Span(page=0, start=0, end=0) diff --git a/indico_toolkit/etloutput/table.py b/indico_toolkit/etloutput/table.py index 15d419c..c0fd7b9 100644 --- a/indico_toolkit/etloutput/table.py +++ b/indico_toolkit/etloutput/table.py @@ -1,15 +1,12 @@ from dataclasses import dataclass from operator import attrgetter -from typing import TYPE_CHECKING +from typing import Final from .box import NULL_BOX, Box from .cell import Cell from .span import NULL_SPAN, Span from .utils import get -if TYPE_CHECKING: - from typing import Final - @dataclass(frozen=True) class Table: @@ -82,7 +79,7 @@ def from_dict(table: object) -> "Table": # rather than using `None` or raising an error. This lets you e.g. group by the `table` # attribute without having to constantly check for `None`, while still allowing you do # a "None check" with `bool(extraction.table)` or `extraction.table == NULL_TABLE`. -NULL_TABLE: "Final" = Table( +NULL_TABLE: Final = Table( box=NULL_BOX, spans=tuple(), cells=tuple(), diff --git a/indico_toolkit/etloutput/token.py b/indico_toolkit/etloutput/token.py index 2c88002..8feb8e5 100644 --- a/indico_toolkit/etloutput/token.py +++ b/indico_toolkit/etloutput/token.py @@ -1,13 +1,10 @@ from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import Final from .box import NULL_BOX, Box from .span import NULL_SPAN, Span from .utils import get -if TYPE_CHECKING: - from typing import Final - @dataclass(frozen=True) class Token: @@ -37,7 +34,7 @@ def from_dict(token: object) -> "Token": # rather than using `None` or raising an error. This lets you e.g. sort by the `token` # attribute without having to constantly check for `None`, while still allowing you do # a "None check" with `bool(extraction.token)` or `extraction.token == NULL_TOKEN`. 
-NULL_TOKEN: "Final" = Token( +NULL_TOKEN: Final = Token( text="", box=NULL_BOX, span=NULL_SPAN, diff --git a/indico_toolkit/etloutput/utils.py b/indico_toolkit/etloutput/utils.py index 5a91a2a..0f7c3e5 100644 --- a/indico_toolkit/etloutput/utils.py +++ b/indico_toolkit/etloutput/utils.py @@ -1,8 +1,5 @@ import json -from typing import TYPE_CHECKING, TypeVar - -if TYPE_CHECKING: - from typing import Any +from typing import Any, TypeVar Value = TypeVar("Value") diff --git a/indico_toolkit/results/normalization.py b/indico_toolkit/results/normalization.py index 44f4da1..72cec52 100644 --- a/indico_toolkit/results/normalization.py +++ b/indico_toolkit/results/normalization.py @@ -1,12 +1,9 @@ import re -from typing import TYPE_CHECKING +from typing import Any from .task import TaskType from .utils import get, has -if TYPE_CHECKING: - from typing import Any - def normalize_result_dict(result: "Any") -> None: """ diff --git a/indico_toolkit/results/predictionlist.py b/indico_toolkit/results/predictionlist.py index 312fd9e..1e57dd0 100644 --- a/indico_toolkit/results/predictionlist.py +++ b/indico_toolkit/results/predictionlist.py @@ -1,6 +1,6 @@ from collections import defaultdict from operator import attrgetter -from typing import TYPE_CHECKING, List, TypeVar, overload +from typing import TYPE_CHECKING, Any, Final, List, SupportsIndex, TypeVar, overload from .predictions import ( Classification, @@ -13,25 +13,23 @@ Unbundling, ) from .review import Review, ReviewType -from .task import TaskType from .utils import nfilter if TYPE_CHECKING: from collections.abc import Callable, Collection, Container, Iterable - from typing import Any, Final, SupportsIndex from typing_extensions import Self from .document import Document from .result import Result - from .task import Task + from .task import Task, TaskType PredictionType = TypeVar("PredictionType", bound=Prediction) OfType = TypeVar("OfType", bound=Prediction) KeyType = TypeVar("KeyType") # Non-None sentinel value to 
support `PredictionList.where(review=None)`. -REVIEW_UNSPECIFIED: "Final" = Review( +REVIEW_UNSPECIFIED: Final = Review( id=None, reviewer_id=None, notes=None, rejected=None, type=None # type: ignore[arg-type] ) diff --git a/indico_toolkit/results/predictions/citation.py b/indico_toolkit/results/predictions/citation.py index 9cd42b0..7e765ec 100644 --- a/indico_toolkit/results/predictions/citation.py +++ b/indico_toolkit/results/predictions/citation.py @@ -1,12 +1,9 @@ from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import Any, Final from ...etloutput import NULL_SPAN, Span from ..utils import get -if TYPE_CHECKING: - from typing import Any, Final - @dataclass(order=True, frozen=True) class Citation: @@ -44,4 +41,4 @@ def to_dict(self) -> "dict[str, Any]": # `citation` attribute without having to constantly check for `None`, while still # allowing you do a "None check" with `bool(summarization.citation)` or # `summarization.citation == NULL_CITATION`. -NULL_CITATION: "Final" = Citation(start=0, end=0, span=NULL_SPAN) +NULL_CITATION: Final = Citation(start=0, end=0, span=NULL_SPAN) diff --git a/indico_toolkit/results/predictions/classification.py b/indico_toolkit/results/predictions/classification.py index 8ef33ea..b76b17e 100644 --- a/indico_toolkit/results/predictions/classification.py +++ b/indico_toolkit/results/predictions/classification.py @@ -1,14 +1,12 @@ from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any -from ..review import Review from ..utils import get, omit from .prediction import Prediction if TYPE_CHECKING: - from typing import Any - from ..document import Document + from ..review import Review from ..task import Task diff --git a/indico_toolkit/results/predictions/documentextraction.py b/indico_toolkit/results/predictions/documentextraction.py index 284b49a..2974480 100644 --- a/indico_toolkit/results/predictions/documentextraction.py +++ 
b/indico_toolkit/results/predictions/documentextraction.py @@ -1,5 +1,5 @@ from dataclasses import dataclass, field -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any from ...etloutput import ( NULL_CELL, @@ -11,16 +11,15 @@ Table, Token, ) -from ..review import Review from ..utils import get, has, omit from .extraction import Extraction from .group import Group if TYPE_CHECKING: from collections.abc import Iterable, Iterator - from typing import Any from ..document import Document + from ..review import Review from ..task import Task diff --git a/indico_toolkit/results/predictions/formextraction.py b/indico_toolkit/results/predictions/formextraction.py index 7e90535..daa4124 100644 --- a/indico_toolkit/results/predictions/formextraction.py +++ b/indico_toolkit/results/predictions/formextraction.py @@ -1,16 +1,14 @@ from dataclasses import dataclass from enum import Enum -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any from ...etloutput import Box -from ..review import Review from ..utils import get, has, omit from .extraction import Extraction if TYPE_CHECKING: - from typing import Any - from ..document import Document + from ..review import Review from ..task import Task diff --git a/indico_toolkit/results/predictions/group.py b/indico_toolkit/results/predictions/group.py index e9e000d..3f72bda 100644 --- a/indico_toolkit/results/predictions/group.py +++ b/indico_toolkit/results/predictions/group.py @@ -1,11 +1,8 @@ from dataclasses import dataclass, replace -from typing import TYPE_CHECKING +from typing import Any from ..utils import get -if TYPE_CHECKING: - from typing import Any - @dataclass(frozen=True, order=True) class Group: diff --git a/indico_toolkit/results/predictions/prediction.py b/indico_toolkit/results/predictions/prediction.py index 216e070..ab4bc30 100644 --- a/indico_toolkit/results/predictions/prediction.py +++ b/indico_toolkit/results/predictions/prediction.py @@ -1,12 +1,9 @@ from dataclasses 
import dataclass -from typing import TYPE_CHECKING - -from ..review import Review +from typing import TYPE_CHECKING, Any if TYPE_CHECKING: - from typing import Any - from ..document import Document + from ..review import Review from ..task import Task diff --git a/indico_toolkit/results/predictions/summarization.py b/indico_toolkit/results/predictions/summarization.py index bed82b4..88e618e 100644 --- a/indico_toolkit/results/predictions/summarization.py +++ b/indico_toolkit/results/predictions/summarization.py @@ -1,16 +1,14 @@ from dataclasses import dataclass, replace -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any -from ..review import Review from ..utils import get, has, omit from .citation import NULL_CITATION, Citation from .extraction import Extraction if TYPE_CHECKING: - from typing import Any - from ...etloutput import Span from ..document import Document + from ..review import Review from ..task import Task diff --git a/indico_toolkit/results/predictions/unbundling.py b/indico_toolkit/results/predictions/unbundling.py index cdca292..ddc5701 100644 --- a/indico_toolkit/results/predictions/unbundling.py +++ b/indico_toolkit/results/predictions/unbundling.py @@ -1,15 +1,13 @@ from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any from ...etloutput import Span -from ..review import Review from ..utils import get, omit from .prediction import Prediction if TYPE_CHECKING: - from typing import Any - from ..document import Document + from ..review import Review from ..task import Task diff --git a/indico_toolkit/results/utils.py b/indico_toolkit/results/utils.py index ae6b41c..5b1aad5 100644 --- a/indico_toolkit/results/utils.py +++ b/indico_toolkit/results/utils.py @@ -1,11 +1,8 @@ from collections.abc import Iterable, Iterator -from typing import TYPE_CHECKING +from typing import Callable from ..etloutput.utils import Value, get, has, json_loaded, str_decoded -if TYPE_CHECKING: - from 
typing import Callable - __all__ = ( "get", "has", From dc3277356c233f24af1392a07c1b415a8bb9c981 Mon Sep 17 00:00:00 2001 From: Michael Welborn Date: Fri, 10 Oct 2025 11:54:19 -0500 Subject: [PATCH 12/19] Clean up some comments and formatting --- .../results/predictions/documentextraction.py | 7 +++---- .../results/predictions/summarization.py | 18 ++++++++---------- indico_toolkit/results/utils.py | 3 ++- 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/indico_toolkit/results/predictions/documentextraction.py b/indico_toolkit/results/predictions/documentextraction.py index 2974480..3a7beca 100644 --- a/indico_toolkit/results/predictions/documentextraction.py +++ b/indico_toolkit/results/predictions/documentextraction.py @@ -37,7 +37,7 @@ def span(self) -> Span: """ Return the first `Span` the document extraction covers else `NULL_SPAN`. - Post-review, document extractions have no spans. + Predictions added in review may not have spans. """ return self.spans[0] if self.spans else NULL_SPAN @@ -46,9 +46,8 @@ def span(self, span: Span) -> None: """ Overwrite all spans with the one provided, handling `NULL_SPAN`. - This is implemented under the assumption that if you're setting a single span, - you want it to be the only one. And if you're working in a context that's - multiple-span sensetive, you'll set `extraction.spans` instead. + This assumes if you're setting a single span you want it to be the only one. + Multiple-span sensitive contexts should work with `extraction.spans` instead. """ self.spans = [span] if span else [] diff --git a/indico_toolkit/results/predictions/summarization.py b/indico_toolkit/results/predictions/summarization.py index 88e618e..61b257c 100644 --- a/indico_toolkit/results/predictions/summarization.py +++ b/indico_toolkit/results/predictions/summarization.py @@ -21,7 +21,7 @@ def citation(self) -> Citation: """ Return the first `Citation` the summarization covers else `NULL_CITATION`.
- Post-review, summarizations have no citations. + Predictions added in review may not have citations. """ return self.citations[0] if self.citations else NULL_CITATION @@ -30,17 +30,15 @@ def citation(self, citation: Citation) -> None: """ Overwrite all citations with the one provided, handling `NULL_CITATION`. - This is implemented under the assumption that if you're setting a single - citation, you want it to be the only one. And if you're working in a context - that's multiple-citation sensetive, you'll set `summarization.citations` - instead. + This assumes if you're setting a single citation it should be the only one. + Multiple-citation sensitive contexts should work with `summarization.citations`. """ self.citations = [citation] if citation else [] @property def spans(self) -> "tuple[Span, ...]": """ - Return the spans covered by `self.citations`. + Return the `Span`s covered by `self.citations`. """ return tuple(citation.span for citation in self.citations) @@ -49,7 +47,7 @@ def span(self) -> "Span": """ Return the `Span` the first citation covers else `NULL_SPAN`. - Post-review, summarizations have no citations/spans. + Predictions added in review may not have citations. """ return self.citation.span @@ -62,9 +60,9 @@ def span(self, span: "Span") -> None: Using `NULL_SPAN` for a citation is not explicitly handled, and should be considered undefined behavior. - This is implemented under the assumption that if you're setting a single span, - there's only one citation and you want to update its span. And if you're - working in a context that's multiple-citation/span sensetive, you'll set + This assumes if you're setting a single span, + there's only one citation and you want to update its span. + Multiple-citation/span sensitive contexts should work with `summarization.citations` instead.
""" self.citation = replace(self.citation, span=span) diff --git a/indico_toolkit/results/utils.py b/indico_toolkit/results/utils.py index 5b1aad5..32ce4ea 100644 --- a/indico_toolkit/results/utils.py +++ b/indico_toolkit/results/utils.py @@ -14,7 +14,8 @@ def nfilter( - predicates: "Iterable[Callable[[Value], bool]]", values: "Iterable[Value]" + predicates: "Iterable[Callable[[Value], bool]]", + values: "Iterable[Value]", ) -> "Iterator[Value]": """ Apply multiple filter predicates to an iterable of values. From 6e1ad81190ec06358d02ec7b4ffc3ce782f52126 Mon Sep 17 00:00:00 2001 From: Michael Welborn Date: Fri, 10 Oct 2025 13:33:15 -0500 Subject: [PATCH 13/19] Add `PredictionList.assign_ocr()` method --- indico_toolkit/results/predictionlist.py | 41 +++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/indico_toolkit/results/predictionlist.py b/indico_toolkit/results/predictionlist.py index 1e57dd0..b85b4a4 100644 --- a/indico_toolkit/results/predictionlist.py +++ b/indico_toolkit/results/predictionlist.py @@ -1,4 +1,5 @@ from collections import defaultdict +from itertools import chain from operator import attrgetter from typing import TYPE_CHECKING, Any, Final, List, SupportsIndex, TypeVar, overload @@ -16,10 +17,11 @@ from .utils import nfilter if TYPE_CHECKING: - from collections.abc import Callable, Collection, Container, Iterable + from collections.abc import Callable, Collection, Container, Iterable, Mapping from typing_extensions import Self + from ..etloutput import EtlOutput from .document import Document from .result import Result from .task import Task, TaskType @@ -82,6 +84,43 @@ def apply(self, function: "Callable[[PredictionType], None]") -> "Self": return self + def assign_ocr( + self, + etl_outputs: "Mapping[Document, EtlOutput]", + *, + tokens: bool = True, + tables: bool = True, + ) -> "Self": + """ + Assign OCR tokens, tables, and/or cells using `etl_outputs`. 
+ + Use `tokens` or `tables` to skip lookup and assignment of those attributes. + """ + extractions_by_document = self.oftype( + DocumentExtraction, + ).groupby( + attrgetter("document"), + ) + + for document, extractions in extractions_by_document.items(): + etl_output = etl_outputs[document] + + for extraction in extractions: + if tokens: + extraction.tokens = list( + filter( + None, + map(etl_output.token_for, extraction.spans), + ) + ) + + if tables: + extraction.table_cells = chain.from_iterable( + map(etl_output.table_cells_for, extraction.spans) + ) + + return self + def groupby( self, key: "Callable[[PredictionType], KeyType]" ) -> "dict[KeyType, Self]": From 7fdf57c89cda359ba69f3318674061ab0daee791 Mon Sep 17 00:00:00 2001 From: Michael Welborn Date: Fri, 10 Oct 2025 15:16:53 -0500 Subject: [PATCH 14/19] Rewrite `EtlOutput.table_cells_for()` using a bisection algorithm --- indico_toolkit/etloutput/etloutput.py | 46 ++++++++++++++++--- .../results/predictions/documentextraction.py | 8 +++- tests/etloutput/test_token_table_cell.py | 27 +++++++++-- 3 files changed, 69 insertions(+), 12 deletions(-) diff --git a/indico_toolkit/etloutput/etloutput.py b/indico_toolkit/etloutput/etloutput.py index 3e89e9b..6d9d3d1 100644 --- a/indico_toolkit/etloutput/etloutput.py +++ b/indico_toolkit/etloutput/etloutput.py @@ -1,6 +1,8 @@ import itertools from bisect import bisect_left, bisect_right +from collections import namedtuple from dataclasses import dataclass +from functools import cached_property from operator import attrgetter from typing import TYPE_CHECKING @@ -80,13 +82,45 @@ def token_for(self, span: "Span") -> Token: span=span, ) + _TableCellSpan = namedtuple("_TableCellSpan", ["table", "cell", "span"]) + + @cached_property + def _table_cell_spans(self) -> "tuple[_TableCellSpan, ...]": + """ + Order table cells by their spans such that they can be bisected. 
+ """ + return tuple( + sorted( + ( + self._TableCellSpan(table, cell, span) + for table in self.tables + for cell in table.cells + for span in cell.spans + if span + ), + key=attrgetter("span"), + ) + ) + def table_cells_for(self, span: "Span") -> "Iterator[tuple[Table, Cell]]": """ Yield the table cells that overlap with `span`. + + Note: a single span may overlap the same cell multiple times causing it to be + yielded multiple times. Deduplication in `DocumentExtraction.table_cells` + accounts for this when OCR is assigned with `PredictionList.assign_ocr()`. """ - if 0 <= span.page < len(self.tables_on_page): - for table in self.tables_on_page[span.page]: - if any(span & table_span for table_span in table.spans): - for cell in table.cells: - if any(span & cell_span for cell_span in cell.spans): - yield table, cell + first = bisect_right( + self._table_cell_spans, + span.start, + key=attrgetter("span.end"), + ) + last = bisect_left( + self._table_cell_spans, + span.end, + lo=first, + key=attrgetter("span.start"), + ) + + for table, cell, span in self._table_cell_spans[first:last]: + yield table, cell diff --git a/indico_toolkit/results/predictions/documentextraction.py b/indico_toolkit/results/predictions/documentextraction.py index 3a7beca..475b032 100644 --- a/indico_toolkit/results/predictions/documentextraction.py +++ b/indico_toolkit/results/predictions/documentextraction.py @@ -116,13 +116,17 @@ def table_cells(self) -> "Iterator[tuple[Table, Cell]]": def table_cells(self, table_cells: "Iterable[tuple[Table, Cell]]") -> None: """ Set the tables cells the document extraction is in. + + Deduplicate cells to handle the case where multiple + spans are contained within the same cell. 
""" self.tables = [] self.cells = [] for table, cell in table_cells: - self.tables.append(table) - self.cells.append(cell) + if cell not in self.cells: + self.tables.append(table) + self.cells.append(cell) @staticmethod def from_dict( diff --git a/tests/etloutput/test_token_table_cell.py b/tests/etloutput/test_token_table_cell.py index 61479ab..e9f5694 100644 --- a/tests/etloutput/test_token_table_cell.py +++ b/tests/etloutput/test_token_table_cell.py @@ -22,6 +22,11 @@ def etl_output() -> EtlOutput: return etloutput.load(etl_output_file, reader=read_uri) +@pytest.fixture(scope="module") +def etl_output_no_tokens_tables() -> EtlOutput: + return etloutput.load(etl_output_file, reader=read_uri, tokens=False, tables=False) + + @pytest.fixture def header_span() -> Span: return Span(page=1, start=1281, end=1285) @@ -42,6 +47,11 @@ def mulitple_table_span() -> Span: return Span(page=1, start=1217, end=1299) +@pytest.fixture +def outside_table_span() -> Span: + return Span(page=1, start=1056, end=1067) + + def test_text_slice( etl_output: EtlOutput, header_span: Span, content_span: Span ) -> None: @@ -62,10 +72,12 @@ def test_token(etl_output: EtlOutput, header_span: Span, content_span: Span) -> def test_token_not_found(etl_output: EtlOutput, header_span: Span) -> None: assert etl_output.token_for(replace(header_span, page=3)) == NULL_TOKEN + assert etl_output.token_for(NULL_SPAN) == NULL_TOKEN -def test_null_span_not_found(etl_output: EtlOutput) -> None: - assert etl_output.token_for(NULL_SPAN) == NULL_TOKEN +def test_no_tokens(etl_output_no_tokens_tables: EtlOutput, header_span: Span) -> None: + assert etl_output_no_tokens_tables.token_for(header_span) == NULL_TOKEN + assert etl_output_no_tokens_tables.token_for(NULL_SPAN) == NULL_TOKEN def test_table_cell( @@ -103,8 +115,15 @@ def test_multiple_tables(etl_output: EtlOutput, mulitple_table_span: Span) -> No assert cells == correct_cells -def test_table_cell_not_found(etl_output: EtlOutput) -> None: - assert not 
list(etl_output.table_cells_for(NULL_SPAN)) +def test_table_cell_not_found(etl_output: EtlOutput, outside_table_span: Span) -> None: + assert not tuple(etl_output.table_cells_for(outside_table_span)) + assert not tuple(etl_output.table_cells_for(NULL_SPAN)) + assert not tuple(etl_output.table_cells_for(Span(-1, -1, -1))) + + +def test_no_tables(etl_output_no_tokens_tables: EtlOutput, header_span: Span) -> None: + assert not tuple(etl_output_no_tokens_tables.table_cells_for(header_span)) + assert not tuple(etl_output_no_tokens_tables.table_cells_for(NULL_SPAN)) def test_empty_cell(etl_output: EtlOutput) -> None: From 9ba9feae8825ea90abaae1ed3954e7e0d8739c5c Mon Sep 17 00:00:00 2001 From: Michael Welborn Date: Fri, 10 Oct 2025 15:22:33 -0500 Subject: [PATCH 15/19] Optimize table cell lookup by bisecting a single page instead of the whole document --- indico_toolkit/etloutput/etloutput.py | 54 +++++++++++++++------------ 1 file changed, 31 insertions(+), 23 deletions(-) diff --git a/indico_toolkit/etloutput/etloutput.py b/indico_toolkit/etloutput/etloutput.py index 6d9d3d1..da7e546 100644 --- a/indico_toolkit/etloutput/etloutput.py +++ b/indico_toolkit/etloutput/etloutput.py @@ -85,21 +85,24 @@ def token_for(self, span: "Span") -> Token: _TableCellSpan = namedtuple("_TableCellSpan", ["table", "cell", "span"]) @cached_property - def _table_cell_spans(self) -> "tuple[_TableCellSpan, ...]": + def _table_cell_spans_on_page(self) -> "tuple[tuple[_TableCellSpan, ...], ...]": """ - Order table cells by their spans such that they can be bisected. + Order table cells on each page by their spans such that they can be bisected. 
""" return tuple( - sorted( - ( - self._TableCellSpan(table, cell, span) - for table in self.tables - for cell in table.cells - for span in cell.spans - if span - ), - key=attrgetter("span"), + tuple( + sorted( + ( + self._TableCellSpan(table, cell, span) + for table in page_tables + for cell in table.cells + for span in cell.spans + if span + ), + key=attrgetter("span"), + ) ) + for page_tables in self.tables_on_page ) def table_cells_for(self, span: "Span") -> "Iterator[tuple[Table, Cell]]": @@ -110,17 +113,22 @@ def table_cells_for(self, span: "Span") -> "Iterator[tuple[Table, Cell]]": yielded multiple times. Deduplication in `DocumentExtraction.table_cells` accounts for this when OCR is assigned with `PredictionList.assign_ocr()`. """ - first = bisect_right( - self._table_cell_spans, - span.start, - key=attrgetter("span.end"), - ) - last = bisect_left( - self._table_cell_spans, - span.end, - lo=first, - key=attrgetter("span.start"), - ) + try: + page_table_cell_spans = self._table_cell_spans_on_page[span.page] + first = bisect_right( + page_table_cell_spans, + span.start, + key=attrgetter("span.end"), + ) + last = bisect_left( + page_table_cell_spans, + span.end, + lo=first, + key=attrgetter("span.start"), + ) + table_cell_spans = page_table_cell_spans[first:last] + except (IndexError, ValueError): + table_cell_spans = tuple() - for table, cell, span in self._table_cell_spans[first:last]: + for table, cell, span in table_cell_spans: yield table, cell From fe419ff55493be4acf0915bf73136d5c6715eb01 Mon Sep 17 00:00:00 2001 From: Michael Welborn Date: Tue, 14 Oct 2025 17:12:36 -0500 Subject: [PATCH 16/19] Bump version and update changelog --- CHANGELOG.md | 27 +++++++++++++++++++++++++++ indico_toolkit/__init__.py | 2 +- pyproject.toml | 2 +- 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5e5cbbd..b14de6b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,32 @@ The format is based on [Keep a 
Changelog](https://keepachangelog.com/en/1.1.0/), and versions match the minimum IPA version required to use functionality. +## [v7.2.2] - 2025-10-14 + +### Added + +- Parse table spans from ETL Output as `Table.spans`. +- Add `NULL_CELL`, `NULL_RANGE`, `NULL_TABLE`, and `NULL_TOKEN` constants. +- Add Document Extraction attributes for assigning tokens, tables, and cells from OCR: + - `DocumentExtraction.tokens`, `DocumentExtraction.tables`, `DocumentExtraction.cells` +- Add Document Extraction convenience properties for singular token, table, and cell access: + - `DocumentExtraction.token`, `DocumentExtraction.table`, `DocumentExtraction.cell` +- Add `PredictionList.assign_ocr(etl_outputs, tokens=True, tables=True)` method. + +### Changed + +- Move `Box` and `Span` from results to etloutput to avoid circular imports. + (Both can still be imported from either module.) +- Return `NULL_TOKEN` instead of raising an exception from `EtlOutput.token_for(span)`. +- Rewrite table cell lookup `EtlOutput.table_cells_for(span)` using a fast, span-based, + binary search algorithm that can return multiple overlapped table cells. + +### Removed + +- Custom `results` and `etloutput` error classes that are nearly never used. + (Replaced with idiomatic Python error classes.) + + ## [v7.2.1] - 2025-09-09 ### Fixed @@ -265,6 +291,7 @@ This is the first major version release tested to work on Indico 6.X. - Row Association now also sorting on 'bbtop'. 
+[v7.2.2]: https://github.com/IndicoDataSolutions/indico-toolkit-python/compare/v7.2.1...v7.2.2 [v7.2.1]: https://github.com/IndicoDataSolutions/indico-toolkit-python/compare/v7.2.0...v7.2.1 [v7.2.0]: https://github.com/IndicoDataSolutions/indico-toolkit-python/compare/v6.14.2...v7.2.0 [v6.14.2]: https://github.com/IndicoDataSolutions/indico-toolkit-python/compare/v6.14.1...v6.14.2 diff --git a/indico_toolkit/__init__.py b/indico_toolkit/__init__.py index 57b848f..a10b5a2 100644 --- a/indico_toolkit/__init__.py +++ b/indico_toolkit/__init__.py @@ -21,4 +21,4 @@ "ToolkitStaggeredLoopError", "ToolkitStatusError", ) -__version__ = "7.2.1" +__version__ = "7.2.2" diff --git a/pyproject.toml b/pyproject.toml index 7b1bfe5..046fd46 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ authors = [ readme = "README.md" urls = { source = "https://github.com/IndicoDataSolutions/Indico-Solutions-Toolkit" } requires-python = ">=3.10" -version = "7.2.1" +version = "7.2.2" dependencies = ["indico-client (>=6.14.0,<7.0.0)"] [project.optional-dependencies] From 477367e92fc14fb408e6baafd9f60d2225812647 Mon Sep 17 00:00:00 2001 From: Michael Welborn Date: Mon, 20 Oct 2025 10:10:13 -0500 Subject: [PATCH 17/19] Speed up `.groupby("table")` and `.groupby("cell")` with custom __hash__ --- indico_toolkit/etloutput/cell.py | 9 +++++++++ indico_toolkit/etloutput/table.py | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/indico_toolkit/etloutput/cell.py b/indico_toolkit/etloutput/cell.py index 03c032b..3476644 100644 --- a/indico_toolkit/etloutput/cell.py +++ b/indico_toolkit/etloutput/cell.py @@ -24,6 +24,15 @@ class Cell: def __bool__(self) -> bool: return self != NULL_CELL + def __hash__(self) -> int: + """ + Uniquely identify cells by hashing their bounding box and spans. + + This is a small speedup for `.groupby(attrgetter("cell"))` compared to + dataclasses's default __hash__ implementation.
+ """ + return hash((self.box, self.spans)) + @property def span(self) -> Span: """ diff --git a/indico_toolkit/etloutput/table.py b/indico_toolkit/etloutput/table.py index c0fd7b9..1b27850 100644 --- a/indico_toolkit/etloutput/table.py +++ b/indico_toolkit/etloutput/table.py @@ -19,6 +19,15 @@ class Table: def __bool__(self) -> bool: return self != NULL_TABLE + def __hash__(self) -> int: + """ + Uniquely identify tables by hashing their bounding box and spans. + + This is an order of magnitude speedup for `.groupby(attrgetter("table"))` + compared to dataclasses's default __hash__ implementation. + """ + return hash((self.box, self.spans)) + @property def span(self) -> Span: """ From fb797b6d5d474de99f0b8b9abad2d24a1f8f58bb Mon Sep 17 00:00:00 2001 From: Michael Welborn Date: Mon, 20 Oct 2025 10:28:52 -0500 Subject: [PATCH 18/19] Add prediction `.copy()` methods that only copy mutable state --- .../results/predictions/documentextraction.py | 17 ++++++++++++++++- .../results/predictions/prediction.py | 12 +++++++++++- .../results/predictions/summarization.py | 11 +++++++++++ .../results/predictions/unbundling.py | 13 ++++++++++++- 4 files changed, 50 insertions(+), 3 deletions(-) diff --git a/indico_toolkit/results/predictions/documentextraction.py b/indico_toolkit/results/predictions/documentextraction.py index 475b032..c942ab7 100644 --- a/indico_toolkit/results/predictions/documentextraction.py +++ b/indico_toolkit/results/predictions/documentextraction.py @@ -1,4 +1,5 @@ -from dataclasses import dataclass, field +from copy import copy, deepcopy +from dataclasses import dataclass, field, replace from typing import TYPE_CHECKING, Any from ...etloutput import ( @@ -18,6 +19,8 @@ if TYPE_CHECKING: from collections.abc import Iterable, Iterator + from typing_extensions import Self + from ..document import Document from ..review import Review from ..task import Task @@ -187,3 +190,15 @@ def to_dict(self) -> "dict[str, Any]": prediction["rejected"] = True return 
prediction + + def copy(self) -> "Self": + return replace( + self, + groups=copy(self.groups), + spans=copy(self.spans), + tokens=copy(self.tokens), + tables=copy(self.tables), + cells=copy(self.cells), + confidences=copy(self.confidences), + extras=deepcopy(self.extras), + ) diff --git a/indico_toolkit/results/predictions/prediction.py b/indico_toolkit/results/predictions/prediction.py index ab4bc30..6edf03c 100644 --- a/indico_toolkit/results/predictions/prediction.py +++ b/indico_toolkit/results/predictions/prediction.py @@ -1,7 +1,10 @@ -from dataclasses import dataclass +from copy import copy, deepcopy +from dataclasses import dataclass, replace from typing import TYPE_CHECKING, Any if TYPE_CHECKING: + from typing_extensions import Self + from ..document import Document from ..review import Review from ..task import Task @@ -30,3 +33,10 @@ def to_dict(self) -> "dict[str, Any]": Create a prediction dictionary for auto review changes. """ raise NotImplementedError() + + def copy(self) -> "Self": + return replace( + self, + confidences=copy(self.confidences), + extras=deepcopy(self.extras), + ) diff --git a/indico_toolkit/results/predictions/summarization.py b/indico_toolkit/results/predictions/summarization.py index 61b257c..6f2fa3e 100644 --- a/indico_toolkit/results/predictions/summarization.py +++ b/indico_toolkit/results/predictions/summarization.py @@ -1,3 +1,4 @@ +from copy import copy, deepcopy from dataclasses import dataclass, replace from typing import TYPE_CHECKING, Any @@ -6,6 +7,8 @@ from .extraction import Extraction if TYPE_CHECKING: + from typing_extensions import Self + from ...etloutput import Span from ..document import Document from ..review import Review @@ -122,3 +125,11 @@ def to_dict(self) -> "dict[str, Any]": prediction["rejected"] = True return prediction + + def copy(self) -> "Self": + return replace( + self, + citations=copy(self.citations), + confidences=copy(self.confidences), + extras=deepcopy(self.extras), + ) diff --git 
a/indico_toolkit/results/predictions/unbundling.py b/indico_toolkit/results/predictions/unbundling.py index ddc5701..7e998dd 100644 --- a/indico_toolkit/results/predictions/unbundling.py +++ b/indico_toolkit/results/predictions/unbundling.py @@ -1,4 +1,5 @@ -from dataclasses import dataclass +from copy import copy, deepcopy +from dataclasses import dataclass, replace from typing import TYPE_CHECKING, Any from ...etloutput import Span @@ -6,6 +7,8 @@ from .prediction import Prediction if TYPE_CHECKING: + from typing_extensions import Self + from ..document import Document from ..review import Review from ..task import Task @@ -52,3 +55,11 @@ def to_dict(self) -> "dict[str, Any]": "confidence": self.confidences, "spans": [span.to_dict() for span in self.spans], } + + def copy(self) -> "Self": + return replace( + self, + spans=copy(self.spans), + confidences=copy(self.confidences), + extras=deepcopy(self.extras), + ) From 68c407d33abfb0727a05886d0575d632d7eb91cf Mon Sep 17 00:00:00 2001 From: Michael Welborn Date: Mon, 20 Oct 2025 10:32:36 -0500 Subject: [PATCH 19/19] Update changelog --- CHANGELOG.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b14de6b..9f502a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,12 +11,14 @@ and versions match the minimum IPA version required to use functionality. ### Added - Parse table spans from ETL Output as `Table.spans`. -- Add `NULL_CELL`, `NULL_RANGE`, `NULL_TABLE`, and `NULL_TOKEN` constants. -- Add Document Extraction attributes for assigning tokens, tables, and cells from OCR: +- `NULL_CELL`, `NULL_RANGE`, `NULL_TABLE`, and `NULL_TOKEN` constants. 
+- Document Extraction attributes for assigning tokens, tables, and cells from OCR: - `DocumentExtraction.tokens`, `DocumentExtraction.tables`, `DocumentExtraction.cells` -- Add Document Extraction convenience properties for singular token, table, and cell access: +- Document Extraction convenience properties for singular token, table, and cell access: - `DocumentExtraction.token`, `DocumentExtraction.table`, `DocumentExtraction.cell` -- Add `PredictionList.assign_ocr(etl_outputs, tokens=True, tables=True)` method. +- `PredictionList.assign_ocr(etl_outputs, tokens=True, tables=True)` method. +- Custom `__hash__` methods for tables and cells to speed up `.groupby(...)`. +- Prediction `.copy()` methods that only copy mutable state. ### Changed