Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 16 additions & 12 deletions examples/results_dataclasses.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,13 @@


"""
Dataclass Reference
Dataclass Reference Summary

See class definitions for complete reference.
"""

# Result Dataclass
result.id # Submission ID
result.submission_id # Submission ID
result.version # Result file version
result.documents # List of documents in this submission
result.models # List of models in this submission
Expand Down Expand Up @@ -81,8 +83,7 @@
document = result.documents[0]
document.id
document.name
document.etl_output_url
document.full_text_url
document.etl_output_uri


# Prediction list Dataclass
Expand Down Expand Up @@ -130,9 +131,9 @@
# DocumentExtraction Dataclass (Subclass of Extraction)
document_extraction = predictions.document_extractions[0]
document_extraction.text
document_extraction.page
document_extraction.start
document_extraction.end
document_extraction.span.page
document_extraction.span.start
document_extraction.span.end
document_extraction.groups # Any linked label groups this prediction is a part of
document_extraction.accepted
document_extraction.rejected
Expand All @@ -145,12 +146,15 @@

# FormExtraction Dataclass (Subclass of Extraction)
form_extraction = predictions.form_extractions[0]
form_extraction.type
form_extraction.text
form_extraction.page
form_extraction.top
form_extraction.left
form_extraction.right
form_extraction.bottom
form_extraction.checked
form_extraction.signed
form_extraction.box.page
form_extraction.box.top
form_extraction.box.left
form_extraction.box.right
form_extraction.box.bottom
form_extraction.accepted
form_extraction.rejected

Expand Down
59 changes: 34 additions & 25 deletions indico_toolkit/etloutput/__init__.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,30 @@
from typing import TYPE_CHECKING

from ..results import NULL_BOX, NULL_SPAN, Box, Span
from ..results.utilities import get, has
from .cell import Cell, CellType
from .errors import EtlOutputError, TableCellNotFoundError, TokenNotFoundError
from .etloutput import EtlOutput
from .range import Range
from .table import Table
from .token import Token
from .utilities import get, has

if TYPE_CHECKING:
from collections.abc import Awaitable, Callable
from typing import Any

__all__ = (
"Box",
"Cell",
"CellType",
"EtlOutput",
"EtlOutputError",
"load",
"load_async",
"NULL_BOX",
"NULL_SPAN",
"Range",
"Span",
"Table",
"TableCellNotFoundError",
"Token",
Expand All @@ -26,74 +33,76 @@


def load(
etl_output_url: str,
etl_output_uri: str,
*,
reader: "Callable[..., Any]",
text: bool = True,
tokens: bool = True,
tables: bool = False,
) -> EtlOutput:
"""
Load `etl_output_url` as an ETL Output dataclass. A `reader` function must be
Load `etl_output_uri` as an ETL Output dataclass. A `reader` function must be
supplied to read JSON files from disk, storage API, or Indico client.

Use `text`, `tokens`, and `tables` to specify what to load.

```
result = results.load(submission.result_file, reader=read_url)
result = results.load(submission.result_file, reader=read_uri)
etl_outputs = {
document: etloutput.load(document.etl_output_url, reader=read_url)
document: etloutput.load(document.etl_output_uri, reader=read_uri)
for document in result.documents
if not document.failed
}
```
"""
etl_output = reader(etl_output_url)
tables_url = etl_output_url.replace("etl_output.json", "tables.json")
etl_output = reader(etl_output_uri)
tables_uri = etl_output_uri.replace("etl_output.json", "tables.json")

if has(etl_output, str, "pages", 0, "page_info"):
return _load_v1(etl_output, tables_url, reader, text, tokens, tables)
return _load_v1(etl_output, tables_uri, reader, text, tokens, tables)
else:
return _load_v3(etl_output, tables_url, reader, text, tokens, tables)
return _load_v3(etl_output, tables_uri, reader, text, tokens, tables)


async def load_async(
etl_output_url: str,
etl_output_uri: str,
*,
reader: "Callable[..., Awaitable[Any]]",
text: bool = True,
tokens: bool = True,
tables: bool = False,
) -> EtlOutput:
"""
Load `etl_output_url` as an ETL Output dataclass. A `reader` coroutine must be
Load `etl_output_uri` as an ETL Output dataclass. A `reader` coroutine must be
supplied to read JSON files from disk, storage API, or Indico client.

Use `text`, `tokens`, and `tables` to specify what to load.

```
result = await results.load_async(submission.result_file, reader=read_url)
result = await results.load_async(submission.result_file, reader=read_uri)
etl_outputs = {
document: await etloutput.load_async(document.etl_output_url, reader=read_url)
document: await etloutput.load_async(document.etl_output_uri, reader=read_uri)
for document in result.documents
if not document.failed
}
```
"""
etl_output = await reader(etl_output_url)
tables_url = etl_output_url.replace("etl_output.json", "tables.json")
etl_output = await reader(etl_output_uri)
tables_uri = etl_output_uri.replace("etl_output.json", "tables.json")

if has(etl_output, str, "pages", 0, "page_info"):
return await _load_v1_async(
etl_output, tables_url, reader, text, tokens, tables
etl_output, tables_uri, reader, text, tokens, tables
)
else:
return await _load_v3_async(
etl_output, tables_url, reader, text, tokens, tables
etl_output, tables_uri, reader, text, tokens, tables
)


def _load_v1(
etl_output: "Any",
tables_url: str,
tables_uri: str,
reader: "Callable[..., Any]",
text: bool,
tokens: bool,
Expand All @@ -111,7 +120,7 @@ def _load_v1(
tokens_by_page = () # type: ignore[assignment]

if tables:
tables_by_page = reader(tables_url)
tables_by_page = reader(tables_uri)
else:
tables_by_page = ()

Expand All @@ -120,7 +129,7 @@ def _load_v1(

def _load_v3(
etl_output: "Any",
tables_url: str,
tables_uri: str,
reader: "Callable[..., Any]",
text: bool,
tokens: bool,
Expand All @@ -139,7 +148,7 @@ def _load_v3(
tokens_by_page = () # type: ignore[assignment]

if tables:
tables_by_page = reader(tables_url)
tables_by_page = reader(tables_uri)
else:
tables_by_page = ()

Expand All @@ -148,7 +157,7 @@ def _load_v3(

async def _load_v1_async(
etl_output: "Any",
tables_url: str,
tables_uri: str,
reader: "Callable[..., Awaitable[Any]]",
text: bool,
tokens: bool,
Expand All @@ -166,7 +175,7 @@ async def _load_v1_async(
tokens_by_page = () # type: ignore[assignment]

if tables:
tables_by_page = await reader(tables_url)
tables_by_page = await reader(tables_uri)
else:
tables_by_page = ()

Expand All @@ -175,7 +184,7 @@ async def _load_v1_async(

async def _load_v3_async(
etl_output: "Any",
tables_url: str,
tables_uri: str,
reader: "Callable[..., Awaitable[Any]]",
text: bool,
tokens: bool,
Expand All @@ -194,7 +203,7 @@ async def _load_v3_async(
tokens_by_page = () # type: ignore[assignment]

if tables:
tables_by_page = await reader(tables_url)
tables_by_page = await reader(tables_uri)
else:
tables_by_page = ()

Expand Down
67 changes: 21 additions & 46 deletions indico_toolkit/etloutput/cell.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from dataclasses import dataclass
from enum import Enum

from .utilities import get, has
from ..results import NULL_SPAN, Box, Span
from ..results.utilities import get
from .range import Range


class CellType(Enum):
Expand All @@ -13,60 +15,33 @@ class CellType(Enum):
class Cell:
type: CellType
text: str
# Span
start: int
end: int
# Bounding box
page: int
top: int
left: int
right: int
bottom: int
# Table coordinates
row: int
rowspan: int
rows: "tuple[int, ...]"
column: int
columnspan: int
columns: "tuple[int, ...]"
box: Box
range: Range
spans: "tuple[Span, ...]"

def __lt__(self, other: "Cell") -> bool:
@property
def span(self) -> Span:
"""
By default, cells are sorted in table order (by row, then column).
Cells can also be sorted in span order: `tokens.sort(key=attrgetter("start"))`.
Return the first `Span` the cell covers or `NULL_SPAN` otherwise.

Empty cells have no spans.
"""
return self.row < other.row or (
self.row == other.row and self.column < other.column
)
return self.spans[0] if self.spans else NULL_SPAN

@staticmethod
def from_dict(cell: object, page: int) -> "Cell":
"""
Create a `Cell` from a v1 or v3 ETL Ouput cell dictionary.
Create a `Cell` from a v1 or v3 cell dictionary.
"""
get(cell, dict, "position")["page_num"] = page

for doc_offset in get(cell, list, "doc_offsets"):
doc_offset["page_num"] = page

return Cell(
type=CellType(get(cell, str, "cell_type")),
text=get(cell, str, "text"),
# Empty cells have no start and end; so use [0:0] for a valid slice.
start=(
get(cell, int, "doc_offsets", 0, "start")
if has(cell, int, "doc_offsets", 0, "start")
else 0
),
end=(
get(cell, int, "doc_offsets", 0, "end")
if has(cell, int, "doc_offsets", 0, "end")
else 0
),
page=page,
top=get(cell, int, "position", "top"),
left=get(cell, int, "position", "left"),
right=get(cell, int, "position", "right"),
bottom=get(cell, int, "position", "bottom"),
row=get(cell, int, "rows", 0),
rowspan=len(get(cell, list, "rows")),
rows=tuple(get(cell, list, "rows")),
column=get(cell, int, "columns", 0),
columnspan=len(get(cell, list, "columns")),
columns=tuple(get(cell, list, "columns")),
box=Box.from_dict(get(cell, dict, "position")),
range=Range.from_dict(cell),
spans=tuple(map(Span.from_dict, get(cell, list, "doc_offsets"))),
)
Loading
Loading