IndicoDataSolutions · mawelborn · Feb 11, 2025 · Feb 4, 2025 · Feb 4, 2025 · Feb 4, 2025
diff --git a/examples/results_dataclasses.py b/examples/results_dataclasses.py
@@ -48,11 +48,13 @@
 
 
 """
-Dataclass Reference
+Dataclass Reference Summary
+
+See class definitions for complete reference.
 """
 
 # Result Dataclass
-result.id  # Submission ID
+result.submission_id  # Submission ID
 result.version  # Result file version
 result.documents  # List of documents in this submission
-result.models  # List of documents in this submission
+result.models  # List of models in this submission
@@ -81,8 +83,7 @@
 document = result.documents[0]
 document.id
 document.name
-document.etl_output_url
-document.full_text_url
+document.etl_output_uri
 
 
 # Prediction list Dataclass
@@ -130,9 +131,9 @@
 # DocumentExtraction Dataclass (Subclass of Extraction)
 document_extraction = predictions.document_extractions[0]
 document_extraction.text
-document_extraction.page
-document_extraction.start
-document_extraction.end
+document_extraction.span.page
+document_extraction.span.start
+document_extraction.span.end
 document_extraction.groups  # Any linked label groups this prediction is a part of
 document_extraction.accepted
 document_extraction.rejected
@@ -145,12 +146,15 @@
 
 # FormExtraction Dataclass (Subclass of Extraction)
 form_extraction = predictions.form_extractions[0]
+form_extraction.type
 form_extraction.text
-form_extraction.page
-form_extraction.top
-form_extraction.left
-form_extraction.right
-form_extraction.bottom
+form_extraction.checked
+form_extraction.signed
+form_extraction.box.page
+form_extraction.box.top
+form_extraction.box.left
+form_extraction.box.right
+form_extraction.box.bottom
 form_extraction.accepted
 form_extraction.rejected
 

diff --git a/indico_toolkit/etloutput/__init__.py b/indico_toolkit/etloutput/__init__.py
@@ -1,23 +1,30 @@
 from typing import TYPE_CHECKING
 
+from ..results import NULL_BOX, NULL_SPAN, Box, Span
+from ..results.utilities import get, has
 from .cell import Cell, CellType
 from .errors import EtlOutputError, TableCellNotFoundError, TokenNotFoundError
 from .etloutput import EtlOutput
+from .range import Range
 from .table import Table
 from .token import Token
-from .utilities import get, has
 
 if TYPE_CHECKING:
     from collections.abc import Awaitable, Callable
     from typing import Any
 
 __all__ = (
+    "Box",
     "Cell",
     "CellType",
     "EtlOutput",
     "EtlOutputError",
     "load",
     "load_async",
+    "NULL_BOX",
+    "NULL_SPAN",
+    "Range",
+    "Span",
     "Table",
     "TableCellNotFoundError",
     "Token",
@@ -26,74 +33,76 @@
 
 
 def load(
-    etl_output_url: str,
+    etl_output_uri: str,
     *,
     reader: "Callable[..., Any]",
     text: bool = True,
     tokens: bool = True,
     tables: bool = False,
 ) -> EtlOutput:
     """
-    Load `etl_output_url` as an ETL Output dataclass. A `reader` function must be
+    Load `etl_output_uri` as an ETL Output dataclass. A `reader` function must be
     supplied to read JSON files from disk, storage API, or Indico client.
 
     Use `text`, `tokens`, and `tables` to specify what to load.
 
     ```
-    result = results.load(submission.result_file, reader=read_url)
+    result = results.load(submission.result_file, reader=read_uri)
     etl_outputs = {
-        document: etloutput.load(document.etl_output_url, reader=read_url)
+        document: etloutput.load(document.etl_output_uri, reader=read_uri)
         for document in result.documents
+        if not document.failed
     }
     ```
     """
-    etl_output = reader(etl_output_url)
-    tables_url = etl_output_url.replace("etl_output.json", "tables.json")
+    etl_output = reader(etl_output_uri)
+    tables_uri = etl_output_uri.replace("etl_output.json", "tables.json")
 
     if has(etl_output, str, "pages", 0, "page_info"):
-        return _load_v1(etl_output, tables_url, reader, text, tokens, tables)
+        return _load_v1(etl_output, tables_uri, reader, text, tokens, tables)
     else:
-        return _load_v3(etl_output, tables_url, reader, text, tokens, tables)
+        return _load_v3(etl_output, tables_uri, reader, text, tokens, tables)
 
 
 async def load_async(
-    etl_output_url: str,
+    etl_output_uri: str,
     *,
     reader: "Callable[..., Awaitable[Any]]",
     text: bool = True,
     tokens: bool = True,
     tables: bool = False,
 ) -> EtlOutput:
     """
-    Load `etl_output_url` as an ETL Output dataclass. A `reader` coroutine must be
+    Load `etl_output_uri` as an ETL Output dataclass. A `reader` coroutine must be
     supplied to read JSON files from disk, storage API, or Indico client.
 
     Use `text`, `tokens`, and `tables` to specify what to load.
 
     ```
-    result = await results.load_async(submission.result_file, reader=read_url)
+    result = await results.load_async(submission.result_file, reader=read_uri)
     etl_outputs = {
-        document: await etloutput.load_async(document.etl_output_url, reader=read_url)
+        document: await etloutput.load_async(document.etl_output_uri, reader=read_uri)
         for document in result.documents
+        if not document.failed
     }
     ```
     """
-    etl_output = await reader(etl_output_url)
-    tables_url = etl_output_url.replace("etl_output.json", "tables.json")
+    etl_output = await reader(etl_output_uri)
+    tables_uri = etl_output_uri.replace("etl_output.json", "tables.json")
 
     if has(etl_output, str, "pages", 0, "page_info"):
         return await _load_v1_async(
-            etl_output, tables_url, reader, text, tokens, tables
+            etl_output, tables_uri, reader, text, tokens, tables
         )
     else:
         return await _load_v3_async(
-            etl_output, tables_url, reader, text, tokens, tables
+            etl_output, tables_uri, reader, text, tokens, tables
         )
 
 
 def _load_v1(
     etl_output: "Any",
-    tables_url: str,
+    tables_uri: str,
     reader: "Callable[..., Any]",
     text: bool,
     tokens: bool,
@@ -111,7 +120,7 @@ def _load_v1(
         tokens_by_page = ()  # type: ignore[assignment]
 
     if tables:
-        tables_by_page = reader(tables_url)
+        tables_by_page = reader(tables_uri)
     else:
         tables_by_page = ()
 
@@ -120,7 +129,7 @@ def _load_v1(
 
 def _load_v3(
     etl_output: "Any",
-    tables_url: str,
+    tables_uri: str,
     reader: "Callable[..., Any]",
     text: bool,
     tokens: bool,
@@ -139,7 +148,7 @@ def _load_v3(
         tokens_by_page = ()  # type: ignore[assignment]
 
     if tables:
-        tables_by_page = reader(tables_url)
+        tables_by_page = reader(tables_uri)
     else:
         tables_by_page = ()
 
@@ -148,7 +157,7 @@ def _load_v3(
 
 async def _load_v1_async(
     etl_output: "Any",
-    tables_url: str,
+    tables_uri: str,
     reader: "Callable[..., Awaitable[Any]]",
     text: bool,
     tokens: bool,
@@ -166,7 +175,7 @@ async def _load_v1_async(
         tokens_by_page = ()  # type: ignore[assignment]
 
     if tables:
-        tables_by_page = await reader(tables_url)
+        tables_by_page = await reader(tables_uri)
     else:
         tables_by_page = ()
 
@@ -175,7 +184,7 @@ async def _load_v1_async(
 
 async def _load_v3_async(
     etl_output: "Any",
-    tables_url: str,
+    tables_uri: str,
     reader: "Callable[..., Awaitable[Any]]",
     text: bool,
     tokens: bool,
@@ -194,7 +203,7 @@ async def _load_v3_async(
         tokens_by_page = ()  # type: ignore[assignment]
 
     if tables:
-        tables_by_page = await reader(tables_url)
+        tables_by_page = await reader(tables_uri)
     else:
         tables_by_page = ()
 

diff --git a/indico_toolkit/etloutput/cell.py b/indico_toolkit/etloutput/cell.py
@@ -1,7 +1,9 @@
 from dataclasses import dataclass
 from enum import Enum
 
-from .utilities import get, has
+from ..results import NULL_SPAN, Box, Span
+from ..results.utilities import get
+from .range import Range
 
 
 class CellType(Enum):
@@ -13,60 +15,33 @@ class CellType(Enum):
 class Cell:
     type: CellType
     text: str
-    # Span
-    start: int
-    end: int
-    # Bounding box
-    page: int
-    top: int
-    left: int
-    right: int
-    bottom: int
-    # Table coordinates
-    row: int
-    rowspan: int
-    rows: "tuple[int, ...]"
-    column: int
-    columnspan: int
-    columns: "tuple[int, ...]"
+    box: Box
+    range: Range
+    spans: "tuple[Span, ...]"
 
-    def __lt__(self, other: "Cell") -> bool:
+    @property
+    def span(self) -> Span:
         """
-        By default, cells are sorted in table order (by row, then column).
-        Cells can also be sorted in span order: `tokens.sort(key=attrgetter("start"))`.
+        Return the first `Span` the cell covers, or `NULL_SPAN` if it has none.
+
+        Empty cells have no spans.
         """
-        return self.row < other.row or (
-            self.row == other.row and self.column < other.column
-        )
+        return self.spans[0] if self.spans else NULL_SPAN
 
     @staticmethod
     def from_dict(cell: object, page: int) -> "Cell":
         """
-        Create a `Cell` from a v1 or v3 ETL Ouput cell dictionary.
+        Create a `Cell` from a v1 or v3 cell dictionary.
         """
+        get(cell, dict, "position")["page_num"] = page
+
+        for doc_offset in get(cell, list, "doc_offsets"):
+            doc_offset["page_num"] = page
+
         return Cell(
             type=CellType(get(cell, str, "cell_type")),
             text=get(cell, str, "text"),
-            # Empty cells have no start and end; so use [0:0] for a valid slice.
-            start=(
-                get(cell, int, "doc_offsets", 0, "start")
-                if has(cell, int, "doc_offsets", 0, "start")
-                else 0
-            ),
-            end=(
-                get(cell, int, "doc_offsets", 0, "end")
-                if has(cell, int, "doc_offsets", 0, "end")
-                else 0
-            ),
-            page=page,
-            top=get(cell, int, "position", "top"),
-            left=get(cell, int, "position", "left"),
-            right=get(cell, int, "position", "right"),
-            bottom=get(cell, int, "position", "bottom"),
-            row=get(cell, int, "rows", 0),
-            rowspan=len(get(cell, list, "rows")),
-            rows=tuple(get(cell, list, "rows")),
-            column=get(cell, int, "columns", 0),
-            columnspan=len(get(cell, list, "columns")),
-            columns=tuple(get(cell, list, "columns")),
+            box=Box.from_dict(get(cell, dict, "position")),
+            range=Range.from_dict(cell),
+            spans=tuple(map(Span.from_dict, get(cell, list, "doc_offsets"))),
         )