diff --git a/Cargo.toml b/Cargo.toml index 2d6b234..53eb22d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "simdxml-python" -version = "0.1.0" +version = "0.2.0" edition = "2021" [lib] @@ -9,5 +9,5 @@ crate-type = ["cdylib"] [dependencies] pyo3 = { version = "0.28", features = ["extension-module", "abi3-py39"] } -simdxml = "0.1" +simdxml = "0.2" self_cell = "1" diff --git a/README.md b/README.md index cc347b3..c4359b2 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,10 @@ elem.getprevious() # previous sibling or None elem.xpath(".//title") # context-node evaluation elem.xpath_text("author") # text extraction from context +# Batch APIs (single FFI call, interned strings) +root.child_tags() # -> list[str] of child tag names +root.descendant_tags("item") # -> list[str] filtered by tag + # Compiled XPath (like re.compile) expr = simdxml.compile("//title") expr.eval_text(doc) # -> list[str] @@ -118,33 +122,59 @@ Full conformance with XPath 1.0: ## Benchmarks -Measured on Apple Silicon (M-series), Python 3.14, comparing against -lxml 6.0 and stdlib `xml.etree.ElementTree`. Run with `uv run python bench/bench_parse.py`. +Apple Silicon, Python 3.14, lxml 6.0. GC disabled during timing, 3 warmup + +20 timed iterations, median reported. Three corpus types: data-oriented +(product catalog), document-oriented (PubMed abstracts), config-oriented +(Maven POM). Run yourself: `uv run python bench/bench_parse.py` + +### Parse + +`simdxml.parse()` eagerly builds structural indices (CSR, name posting). +lxml's `fromstring()` builds a DOM tree without precomputed query indices. +simdxml front-loads more work into parse so queries are faster — both numbers +are real, the trade-off depends on your workload. + +| Corpus | Size | simdxml | lxml | vs lxml | vs stdlib | +|--------|------|---------|------|---------|-----------| +| Catalog (data) | 1.6 MB | 2.7 ms | 8.1 ms | **3.0x** | **5.4x** | +| Catalog (data) | 17 MB | 32 ms | 82 ms | **2.6x** | **4.7x** | +| PubMed (doc) | 1.7 MB | 2.3 ms | 6.0 ms | **2.7x** | **5.9x** | +| PubMed (doc) | 17 MB | 27 ms | 61 ms | **2.2x** | **5.0x** | +| POM (config) | 2.1 MB | 2.7 ms | 8.3 ms | **3.1x** | **6.6x** | + +### XPath queries (returning Elements — apples-to-apples) -### Parse throughput +| Query | Corpus | simdxml | lxml | vs lxml | +|-------|--------|---------|------|---------| +| `//item` | Catalog 17 MB | 3.4 ms | 21 ms | **6x** | +| `//item[@category="cat5"]` | Catalog 17 MB | 1.6 ms | 69 ms | **42x** | +| `//PubmedArticle` | PubMed 17 MB | 0.35 ms | 9.8 ms | **28x** | +| `//Author[LastName="Auth0_0"]` | PubMed 17 MB | 13 ms | 29 ms | **2.2x** | +| `//dependency` | POM 2.1 MB | 0.34 ms | 1.1 ms | **3.3x** | +| `//dependency[scope="test"]` | POM 2.1 MB | 2.4 ms | 3.6 ms | **1.5x** | -| Document | simdxml | lxml | stdlib ET | vs lxml | vs stdlib | -|----------|---------|------|-----------|---------|-----------| -| 20 KB (100 items) | 0.05 ms | 0.09 ms | 0.15 ms | 1.8x | 3.0x | -| 2 MB (10K items) | 3.3 ms | 8.5 ms | 16.7 ms | 2.6x | 5.0x | -| 20 MB (100K items) | 40 ms | 87 ms | 181 ms | **2.2x** | **4.5x** | +### XPath text extraction -### XPath query: `//name` +`xpath_text()` returns strings directly, avoiding Element object creation. +This is the optimized path for ETL / data extraction workloads. -| Document | simdxml | lxml | stdlib findall | vs lxml | vs stdlib | -|----------|---------|------|----------------|---------|-----------| -| 2 MB | 0.3 ms | 1.0 ms | 0.7 ms | 3.1x | 2.1x | -| 20 MB | 3.8 ms | 19.7 ms | 7.3 ms | **5.2x** | **1.9x** | +| Query | Corpus | simdxml | lxml xpath+.text | vs lxml | +|-------|--------|---------|------------------|---------| +| `//name` | Catalog 17 MB | 1.8 ms | 37 ms | **20x** | +| `//AbstractText` | PubMed 17 MB | 0.31 ms | 7.1 ms | **23x** | +| `//artifactId` | POM 2.1 MB | 0.21 ms | 2.0 ms | **10x** | -### XPath query with predicate: `//item[@category="cat5"]` +### Element traversal -| Document | simdxml | lxml | stdlib findall | vs lxml | -|----------|---------|------|----------------|---------| -| 2 MB | 0.2 ms | 2.8 ms | 0.8 ms | 16x | -| 20 MB | 2.0 ms | 46 ms | 9.1 ms | **23x** | +`child_tags()` and `descendant_tags()` return all tag names in a single +call using interned Python strings. Per-element iteration (`for e in root`) +is also available but creates Element objects with some overhead. -The predicate speedup is dramatic because simdxml's structural index enables -direct attribute comparison without materializing DOM nodes. +| Corpus | `child_tags()` | lxml `[e.tag]` | vs lxml | +|--------|----------------|-----------------|---------| +| Catalog 17 MB | **0.38 ms** | 6.4 ms | **17x** | +| PubMed 17 MB | **0.03 ms** | 0.60 ms | **17x** | +| POM 2.1 MB | **0.2 us** | 0.5 us | **3x** | ## How it works @@ -157,7 +187,8 @@ and parents -- all indexed by the same position. - O(1) ancestor/descendant checks via pre/post-order numbering - O(1) child enumeration via CSR (Compressed Sparse Row) indices - SIMD-accelerated structural parsing (NEON on ARM, AVX2 on x86) -- Lazy index building: CSR indices built on first query, not at parse time +- Parse eagerly builds all indices (CSR, name posting, parent map) so + subsequent queries pay zero index construction cost ## Platform support diff --git a/bench/bench_parse.py b/bench/bench_parse.py index 48ca246..3eb532f 100644 --- a/bench/bench_parse.py +++ b/bench/bench_parse.py @@ -1,11 +1,27 @@ """Benchmark: simdxml vs lxml vs stdlib xml.etree.ElementTree. +Methodology: + - GC disabled during timing to avoid collection noise + - 3 warmup iterations discarded, then 20 timed iterations + - Reports median (robust to outliers from page faults, scheduling) + - All XPath benchmarks compare like-for-like: elements vs elements + - Both synthetic and real-world-shaped corpora + +Note: simdxml.parse() eagerly builds structural indices (CSR, name +posting, parent map). lxml.fromstring() builds a DOM tree without +precomputed indices. This means simdxml front-loads more work into +parse, then queries are faster. Both numbers are real -- the question +is which workload you have. + Usage: uv run python bench/bench_parse.py """ from __future__ import annotations +import gc +import random +import sys import time import xml.etree.ElementTree as StdET @@ -19,133 +35,313 @@ HAS_LXML = False -def generate_xml(n_items: int) -> bytes: - """Generate a catalog XML with n_items.""" +# --------------------------------------------------------------------------- +# Corpus generators +# --------------------------------------------------------------------------- + + +def gen_catalog(n: int) -> bytes: + """Data-oriented: uniform structure, many attributes.""" items = "\n".join( - f' ' + " " + f'' f"Item {i}" - f"Description for item {i} with some text content" + f"Desc for item {i}" f"{i * 1.5:.2f}" - f"tag{i % 5}tag{i % 3}" + f"t{i % 5}t{i % 3}" f"" - for i in range(n_items) + for i in range(n) ) return f"\n{items}\n".encode() -def bench(label: str, fn, iterations: int = 10) -> float: - """Run fn `iterations` times, return median time in ms.""" - times = [] - for _ in range(iterations): - start = time.perf_counter() +def gen_pubmed(n: int) -> bytes: + """Document-oriented: mixed depth, varying children.""" + rng = random.Random(42) + articles = [] + for i in range(n): + n_auth = rng.randint(1, 8) + auths = "\n".join( + " " + f"Auth{j}_{i}" + f"F{j}" + f"Univ {rng.randint(1, 20)}" + "" + "" + for j in range(n_auth) + ) + n_mesh = rng.randint(2, 12) + mesh = "\n".join( + " " + f'' + f"Term{k}_{i}" + "" + for k in range(n_mesh) + ) + kind = "randomized" if i % 2 else "retrospective" + sents = " ".join( + f"Sentence {s} about topic {i}." for s in range(rng.randint(3, 8)) + ) + issn = f"{rng.randint(1000, 9999)}-{rng.randint(1000, 9999)}" + articles.append( + " \n" + ' \n' + f" {10000000 + i}\n" + "
\n" + " " + f'{issn}' + f"J Example {i % 50}" + "\n" + f" Topic {i}: " + f"a {kind} study\n" + " " + f"{sents}" + "\n" + f" \n{auths}\n" + " \n" + " eng\n" + "
\n" + f" \n{mesh}\n" + " \n" + "
\n" + "
" + ) + body = "\n".join(articles) + return f"\n{body}\n".encode() + + +def gen_pom(n: int) -> bytes: + """Config-oriented: deep nesting, namespaces.""" + deps = "\n".join( + " \n" + f" com.example.g{i % 20}\n" + f" art-{i}\n" + f" {i % 5}.{i % 10}.{i % 3}\n" + " " + + ("compile" if i % 3 == 0 else "test" if i % 3 == 1 else "runtime") + + "\n" + + ( + " \n" + f" " + f"com.ex.{i}" + f"bad-{i}" + "\n" + " \n" + if i % 4 == 0 + else "" + ) + + " " + for i in range(n) + ) + return ( + "\n" + " 4.0.0\n" + " com.example\n" + " benchmark\n" + " 1.0.0\n" + f" \n{deps}\n \n" + "" + ).encode() + + +# --------------------------------------------------------------------------- +# Bench harness +# --------------------------------------------------------------------------- + +WARMUP = 3 +ITERATIONS = 20 + + +def bench(fn) -> float: + """Warmup then timed iterations; return median ms.""" + for _ in range(WARMUP): fn() - elapsed = (time.perf_counter() - start) * 1000 - times.append(elapsed) + + gc.disable() + try: + times = [] + for _ in range(ITERATIONS): + t0 = time.perf_counter() + fn() + times.append((time.perf_counter() - t0) * 1000) + finally: + gc.enable() + times.sort() - median = times[len(times) // 2] - return median + return times[len(times) // 2] -def print_row(label: str, time_ms: float, baseline_ms: float | None = None) -> None: - speedup = "" - if baseline_ms is not None and time_ms > 0: - ratio = baseline_ms / time_ms - lib = "lxml" if HAS_LXML else "stdlib" - speedup = f" ({ratio:.1f}x vs {lib})" - print(f" {label:<30s} {time_ms:8.2f} ms{speedup}") +def fmt(ms: float) -> str: + if ms < 0.01: + return f"{ms * 1000:6.1f} us" + if ms < 1: + return f"{ms:6.2f} ms" + return f"{ms:6.1f} ms" -def run_benchmarks(xml: bytes, label: str) -> None: - size_mb = len(xml) / (1024 * 1024) - print(f"\n{'=' * 60}") - print(f" {label} ({size_mb:.1f} MB, {len(xml):,} bytes)") - print(f"{'=' * 60}") +def ratio_str(a: float, b: float) -> str: + if b <= 0: + return "" + r = b / a + if r >= 1: + return f" \033[32m{r:.1f}x faster\033[0m" + return f" \033[31m{1 / r:.1f}x slower\033[0m" + + +# --------------------------------------------------------------------------- +# Benchmark suites +# --------------------------------------------------------------------------- + - # --- Parse --- - print("\n Parse:") - simdxml_parse = bench("simdxml", lambda: simdxml.parse(xml)) - print_row("simdxml.parse()", simdxml_parse) +def bench_parse(xml: bytes, label: str) -> None: + print(f"\n \033[1mParse\033[0m ({label})") + print(" Note: simdxml.parse() includes index construction (CSR + name posting)") + + t_simd = bench(lambda: simdxml.parse(xml)) + print(f" simdxml.parse() {fmt(t_simd)}") if HAS_LXML: - lxml_parse = bench("lxml", lambda: lxml_etree.fromstring(xml)) - print_row("lxml.etree.fromstring()", lxml_parse) + t_lxml = bench(lambda: lxml_etree.fromstring(xml)) + print(f" lxml.fromstring() {fmt(t_lxml)}{ratio_str(t_simd, t_lxml)}") + + t_std = bench(lambda: StdET.fromstring(xml)) + print(f" ET.fromstring() {fmt(t_std)}{ratio_str(t_simd, t_std)}") - std_parse = bench("stdlib", lambda: StdET.fromstring(xml)) - print_row("ET.fromstring()", std_parse) - baseline = lxml_parse if HAS_LXML else std_parse - lib = "lxml" if HAS_LXML else "stdlib" - print(f"\n Parse speedup: {baseline / simdxml_parse:.1f}x vs {lib}") +def bench_xpath_elements(xml: bytes, expr: str, label: str) -> None: + """XPath returning Element objects -- fair comparison.""" + print(f"\n \033[1mXPath -> Elements\033[0m {expr} ({label})") - # --- XPath: //name (simple descendant) --- - print("\n XPath: //name") doc = simdxml.parse(xml) - compiled = simdxml.compile("//name") + t_simd = bench(lambda: doc.xpath(expr)) + n_results = len(doc.xpath(expr)) + print(f" simdxml doc.xpath() {fmt(t_simd)} ({n_results} results)") - simdxml_xpath = bench("simdxml.xpath_text", lambda: doc.xpath_text("//name")) - print_row("doc.xpath_text()", simdxml_xpath) + if HAS_LXML: + lroot = lxml_etree.fromstring(xml) + t_lxml = bench(lambda: lroot.xpath(expr)) + print(f" lxml root.xpath() {fmt(t_lxml)}{ratio_str(t_simd, t_lxml)}") + + # stdlib findall -- skip complex expressions + if not any(c in expr for c in ("()", "::", "|")): + std_expr = expr + if not expr.startswith("."): + std_expr = "." + expr if expr.startswith("/") else "./" + expr + sroot = StdET.fromstring(xml) + try: + t_std = bench(lambda: sroot.findall(std_expr)) + print(f" ET.findall() {fmt(t_std)}{ratio_str(t_simd, t_std)}") + except SyntaxError: + pass + + +def bench_xpath_text(xml: bytes, expr: str, label: str) -> None: + """XPath returning text -- simdxml's optimized path.""" + print(f"\n \033[1mXPath -> Text\033[0m {expr} ({label})") + + doc = simdxml.parse(xml) + compiled = simdxml.compile(expr) - simdxml_compiled = bench("simdxml.compiled", lambda: compiled.eval_text(doc)) - print_row("compiled.eval_text()", simdxml_compiled) + t_inline = bench(lambda: doc.xpath_text(expr)) + t_compiled = bench(lambda: compiled.eval_text(doc)) + n = len(doc.xpath_text(expr)) + print(f" simdxml xpath_text() {fmt(t_inline)} ({n} results)") + print(f" simdxml compiled {fmt(t_compiled)}") if HAS_LXML: - lxml_root = lxml_etree.fromstring(xml) - lxml_xpath = bench("lxml.xpath", lambda: lxml_root.xpath("//name")) - print_row("lxml_root.xpath()", lxml_xpath) - baseline_xpath = lxml_xpath - else: - baseline_xpath = None + lroot = lxml_etree.fromstring(xml) + t_lxml = bench(lambda: [e.text for e in lroot.xpath(expr)]) + print(f" lxml xpath+.text {fmt(t_lxml)}{ratio_str(t_inline, t_lxml)}") + - std_root = StdET.fromstring(xml) - std_findall = bench("stdlib.findall", lambda: std_root.findall(".//name")) - print_row("std_root.findall()", std_findall) +def bench_traversal(xml: bytes, label: str) -> None: + """Element traversal: per-element loop vs batch API.""" + print(f"\n \033[1mTraversal\033[0m ({label})") - if baseline_xpath: - print(f"\n XPath speedup: {baseline_xpath / simdxml_xpath:.1f}x vs lxml") + doc = simdxml.parse(xml) - # --- XPath: predicate query --- - print('\n XPath: //item[@category="cat5"]') - pred_expr = '//item[@category="cat5"]' - simdxml_pred = bench("simdxml", lambda: doc.xpath(pred_expr)) - print_row("doc.xpath()", simdxml_pred) + # Batch API (single FFI call, interned strings) + t_batch = bench(lambda: doc.root.child_tags()) + print(f" simdxml child_tags() {fmt(t_batch)} [batch, 1 FFI call]") + + # Per-element loop (N FFI calls, but tags are interned) + t_loop = bench(lambda: [e.tag for e in doc.root]) + print(f" simdxml [e.tag for e] {fmt(t_loop)} [per-element FFI]") if HAS_LXML: - lxml_pred = bench("lxml", lambda: lxml_root.xpath(pred_expr)) - print_row("lxml_root.xpath()", lxml_pred) + lroot = lxml_etree.fromstring(xml) + t_lxml = bench(lambda: [e.tag for e in lroot]) + print(f" lxml [e.tag for e] {fmt(t_lxml)}{ratio_str(t_batch, t_lxml)}") - std_pred = bench("stdlib", lambda: std_root.findall('.//item[@category="cat5"]')) - print_row("std_root.findall()", std_pred) + sroot = StdET.fromstring(xml) + t_std = bench(lambda: [e.tag for e in sroot]) + print(f" stdlib [e.tag for e] {fmt(t_std)}{ratio_str(t_batch, t_std)}") - # --- Element traversal --- - print("\n Traversal: iterate all children of root") - simdxml_iter = bench("simdxml", lambda: [e.tag for e in doc.root]) - print_row("for e in doc.root", simdxml_iter) - if HAS_LXML: - lxml_iter = bench("lxml", lambda: [e.tag for e in lxml_root]) - print_row("for e in lxml_root", lxml_iter) +def run_corpus(xml: bytes, name: str) -> None: + size_mb = len(xml) / (1024 * 1024) + print(f"\n{'=' * 65}") + print(f" {name} ({size_mb:.1f} MB, {len(xml):,} bytes)") + print(f"{'=' * 65}") - std_iter = bench("stdlib", lambda: [e.tag for e in std_root]) - print_row("for e in std_root", std_iter) + bench_parse(xml, name) + if b"" in xml: + bench_xpath_elements(xml, "//PubmedArticle", name) + bench_xpath_elements(xml, '//Author[LastName="Auth0_0"]', name) + bench_xpath_text(xml, "//AbstractText", name) + elif b"" in xml: + bench_xpath_elements(xml, "//dependency", name) + bench_xpath_elements(xml, '//dependency[scope="test"]', name) + bench_xpath_text(xml, "//artifactId", name) -def main() -> None: - print("simdxml benchmark") - print(f" lxml available: {HAS_LXML}") - if HAS_LXML: - print(f" lxml version: {lxml_etree.LXML_VERSION}") + bench_traversal(xml, name) - # Small document - small = generate_xml(100) - run_benchmarks(small, "Small (100 items)") - # Medium document - medium = generate_xml(10_000) - run_benchmarks(medium, "Medium (10K items)") +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- - # Large document - large = generate_xml(100_000) - run_benchmarks(large, "Large (100K items)") + +def main() -> None: + print("simdxml benchmark suite") + print(f" Python {sys.version.split()[0]}") + print(f" simdxml {simdxml.__version__}") + if HAS_LXML: + ver = ".".join(str(x) for x in lxml_etree.LXML_VERSION) + print(f" lxml {ver}") + else: + print(" lxml: not installed") + print(f" Warmup: {WARMUP}, Timed: {ITERATIONS}, Metric: median") + + run_corpus( + gen_catalog(10_000), + "Catalog 10K (data-oriented)", + ) + run_corpus( + gen_catalog(100_000), + "Catalog 100K (data-oriented)", + ) + run_corpus( + gen_pubmed(1_000), + "PubMed 1K (document-oriented)", + ) + run_corpus( + gen_pubmed(10_000), + "PubMed 10K (document-oriented)", + ) + run_corpus( + gen_pom(1_000), + "POM 1K (config-oriented)", + ) + run_corpus( + gen_pom(10_000), + "POM 10K (config-oriented)", + ) if __name__ == "__main__": diff --git a/pyproject.toml b/pyproject.toml index 4ab2cf3..7e936da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "simdxml" -version = "0.1.0" +version = "0.2.0" description = "SIMD-accelerated XML parser with full XPath 1.0 support" readme = "README.md" authors = [ diff --git a/python/simdxml/__init__.py b/python/simdxml/__init__.py index 049afc2..2b7f159 100644 --- a/python/simdxml/__init__.py +++ b/python/simdxml/__init__.py @@ -34,6 +34,7 @@ CompiledXPath, Document, Element, + ElementList, compile, parse, ) @@ -42,8 +43,9 @@ "CompiledXPath", "Document", "Element", + "ElementList", "compile", "parse", ] -__version__ = "0.1.0" +__version__ = "0.2.0" diff --git a/python/simdxml/_core.pyi b/python/simdxml/_core.pyi index 1961186..33ea939 100644 --- a/python/simdxml/_core.pyi +++ b/python/simdxml/_core.pyi @@ -1,59 +1,192 @@ from collections.abc import Iterator class Document: - """A parsed XML document backed by a SIMD-accelerated structural index.""" + """A parsed XML document. + + Created by `parse()`. Use `root` to get the root element, + or query directly with `xpath_text()` and `xpath()`. + """ @property - def root(self) -> Element | None: ... + def root(self) -> Element | None: + """The root element of the document, or None if empty.""" + ... @property - def tag_count(self) -> int: ... - def xpath_text(self, expr: str) -> list[str]: ... - def xpath_string(self, expr: str) -> list[str]: ... - def xpath(self, expr: str) -> list[Element | str]: ... + def tag_count(self) -> int: + """Total number of XML tags in the document.""" + ... + def xpath_text(self, expr: str) -> list[str]: + """Evaluate an XPath expression and return text content of matches. + + Returns the direct child text of each matching element. + """ + ... + def xpath_string(self, expr: str) -> list[str]: + """Evaluate an XPath expression and return string-values of matches. + + Returns all descendant text for each match (XPath ``string()`` semantics). + """ + ... + def xpath(self, expr: str) -> list[Element | str]: + """Evaluate an XPath expression. + + Returns Element objects for element nodes, strings for text/attribute nodes. + """ + ... class Element: - """A read-only element in a parsed XML document.""" + """A read-only XML element. + + Supports the ElementTree API (``.tag``, ``.text``, ``.attrib``, ``.get()``, + ``len()``, indexing, iteration) plus lxml extensions (``.xpath()``, + ``.getparent()``, ``.getnext()``, ``.getprevious()``). + """ @property - def tag(self) -> str: ... + def tag(self) -> str: + """The element's tag name (e.g., ``'book'``, ``'title'``).""" + ... @property - def text(self) -> str | None: ... + def text(self) -> str | None: + """Text content before the first child element, or None. + + For ``

Hello world

``, ``p.text`` is ``'Hello '``. + """ + ... @property - def tail(self) -> str | None: ... + def tail(self) -> str | None: + """Text content after this element's closing tag, or None. + + For ``

Hello world more

``, ``b.tail`` is ``' more'``. + """ + ... @property - def attrib(self) -> dict[str, str]: ... - def get(self, key: str, default: str | None = None) -> str | None: ... - def keys(self) -> list[str]: ... - def items(self) -> list[tuple[str, str]]: ... - def iter(self, tag: str | None = None) -> Iterator[Element]: ... - def itertext(self) -> list[str]: ... - def text_content(self) -> str: ... - def xpath(self, expr: str) -> list[Element]: ... - def xpath_text(self, expr: str) -> list[str]: ... - def getparent(self) -> Element | None: ... - def getnext(self) -> Element | None: ... - def getprevious(self) -> Element | None: ... - def tostring(self) -> str: ... - # Read-only enforcement: these raise TypeError - def set(self, key: str, value: str) -> None: ... - def append(self, element: Element) -> None: ... - def remove(self, element: Element) -> None: ... - def insert(self, index: int, element: Element) -> None: ... - def clear(self) -> None: ... + def attrib(self) -> dict[str, str]: + """Dictionary of this element's attributes.""" + ... + def get(self, key: str, default: str | None = None) -> str | None: + """Get an attribute value by name, with optional default.""" + ... + def keys(self) -> list[str]: + """List of attribute names.""" + ... + def items(self) -> list[tuple[str, str]]: + """List of ``(name, value)`` attribute pairs.""" + ... + def iter(self, tag: str | None = None) -> Iterator[Element]: + """Iterate over descendant elements, optionally filtered by tag name.""" + ... + def child_tags(self) -> list[str]: + """All direct child tag names as a list. + + More efficient than ``[e.tag for e in element]`` for bulk access. + """ + ... + def descendant_tags(self, tag: str | None = None) -> list[str]: + """All descendant tag names, optionally filtered. + + More efficient than ``[e.tag for e in element.iter(tag)]`` for bulk access. + """ + ... + def itertext(self) -> list[str]: + """All text content within this element, depth-first.""" + ... + def text_content(self) -> str: + """All descendant text concatenated into a single string.""" + ... + def xpath(self, expr: str) -> ElementList: + """Evaluate an XPath 1.0 expression with this element as context. + + Returns an ElementList of matching elements (lazy — created on access). + """ + ... + def xpath_text(self, expr: str) -> list[str]: + """Evaluate an XPath expression and return text content of matches.""" + ... + def getparent(self) -> Element | None: + """Parent element, or None if this is the root.""" + ... + def getnext(self) -> Element | None: + """Next sibling element, or None if this is the last child.""" + ... + def getprevious(self) -> Element | None: + """Previous sibling element, or None if this is the first child.""" + ... + def tostring(self) -> str: + """Serialize this element to an XML string.""" + ... + def set(self, key: str, value: str) -> None: + """Not supported. Raises TypeError (elements are read-only).""" + ... + def append(self, element: Element) -> None: + """Not supported. Raises TypeError (elements are read-only).""" + ... + def remove(self, element: Element) -> None: + """Not supported. Raises TypeError (elements are read-only).""" + ... + def insert(self, index: int, element: Element) -> None: + """Not supported. Raises TypeError (elements are read-only).""" + ... + def clear(self) -> None: + """Not supported. Raises TypeError (elements are read-only).""" + ... + def __len__(self) -> int: + """Number of direct child elements.""" + ... + def __getitem__(self, index: int) -> Element: + """Get a child element by index. Supports negative indexing.""" + ... + def __iter__(self) -> Iterator[Element]: + """Iterate over direct child elements.""" + ... + def __bool__(self) -> bool: ... + def __eq__(self, other: object) -> bool: ... + def __hash__(self) -> int: ... + +class ElementList: + """A lazy sequence of elements from an XPath query. + + Elements are created on demand when accessed by index or iteration. + Holds a single Document reference regardless of result size. + """ + def __len__(self) -> int: ... def __getitem__(self, index: int) -> Element: ... def __iter__(self) -> Iterator[Element]: ... def __bool__(self) -> bool: ... - def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... class CompiledXPath: - """A compiled XPath expression for repeated evaluation.""" + """A compiled XPath expression for repeated use. + + Like ``re.compile()`` — parse the expression once, evaluate many times + across different documents. + """ + + def eval_text(self, doc: Document) -> list[str]: + """Evaluate and return text content of matching nodes.""" + ... + def eval(self, doc: Document) -> ElementList: + """Evaluate and return matching elements as an ElementList (lazy).""" + ... + def eval_exists(self, doc: Document) -> bool: + """Check whether any nodes match the expression.""" + ... + def eval_count(self, doc: Document) -> int: + """Count the number of matching nodes.""" + ... + +def parse(data: bytes | str) -> Document: + """Parse XML into a Document. + + Accepts ``bytes`` or ``str``. For bytes input, the buffer is used + directly (zero-copy). For str input, the string is encoded to UTF-8. + """ + ... - def eval_text(self, doc: Document) -> list[str]: ... - def eval(self, doc: Document) -> list[Element]: ... - def eval_exists(self, doc: Document) -> bool: ... - def eval_count(self, doc: Document) -> int: ... +def compile(expr: str) -> CompiledXPath: + """Compile an XPath expression for repeated use. -def parse(data: bytes | str) -> Document: ... -def compile(expr: str) -> CompiledXPath: ... + Like ``re.compile()`` — parse the expression once, evaluate many times + across different documents. + """ + ... diff --git a/python/simdxml/etree/ElementTree.py b/python/simdxml/etree/ElementTree.py index 6f3f7a4..df9cede 100644 --- a/python/simdxml/etree/ElementTree.py +++ b/python/simdxml/etree/ElementTree.py @@ -167,6 +167,6 @@ def _findall( """Find all matching subelements.""" xpath = _path_to_xpath(path) try: - return element.xpath(xpath) + return list(element.xpath(xpath)) except ValueError: return [] diff --git a/src/lib.rs b/src/lib.rs index 1f48a5c..a1ed1a6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,85 +1,121 @@ use pyo3::exceptions::{PyTypeError, PyValueError}; +use pyo3::pybacked::PyBackedBytes; use pyo3::prelude::*; -use pyo3::types::PyBytes; +use pyo3::types::{PyBytes, PyString}; use self_cell::self_cell; use simdxml::xpath::XPathNode; use simdxml::XmlIndex; // --------------------------------------------------------------------------- -// Self-referential Document: owns bytes + XmlIndex + derived data +// Self-referential Document: owns bytes + XmlIndex // --------------------------------------------------------------------------- -struct IndexWithMeta<'a> { - index: XmlIndex<'a>, - /// parent[i] = tag index of parent element. u32::MAX = root. - parents: Vec, +/// Owner type: either zero-copy from Python bytes or owned from str input. +enum DocumentOwner { + ZeroCopy(PyBackedBytes), + Owned(Vec), +} + +impl std::ops::Deref for DocumentOwner { + type Target = [u8]; + fn deref(&self) -> &[u8] { + match self { + DocumentOwner::ZeroCopy(b) => b, + DocumentOwner::Owned(v) => v, + } + } } self_cell!( struct DocumentInner { - owner: Vec, + owner: DocumentOwner, #[covariant] - dependent: IndexWithMeta, + dependent: XmlIndex, } ); -/// A parsed XML document backed by a SIMD-accelerated structural index. +/// A parsed XML document. +/// +/// Created by `parse()`. Use `root` to get the root element, +/// or query directly with `xpath_text()` and `xpath()`. #[pyclass] struct Document { inner: DocumentInner, + /// Interned tag names: name_id -> Python str (created once at parse). + interned_names: Vec>, } impl Document { fn index(&self) -> &XmlIndex<'_> { - &self.inner.borrow_dependent().index + self.inner.borrow_dependent() } - fn parents(&self) -> &[u32] { - &self.inner.borrow_dependent().parents + /// Look up interned tag name. Uses upstream name_ids directly. + fn interned_tag(&self, py: Python<'_>, index: &XmlIndex<'_>, tag_idx: usize) -> Py { + if tag_idx < index.name_ids.len() { + let name_id = index.name_ids[tag_idx]; + if (name_id as usize) < self.interned_names.len() && name_id != u16::MAX { + return self.interned_names[name_id as usize].clone_ref(py); + } + } + // Fallback for tags without interned names (comments, PIs, etc.) + PyString::new(py, index.tag_name(tag_idx)).unbind() } fn make_element(py: Python<'_>, doc: &Py, tag_idx: usize) -> Element { - Element { - doc: doc.clone_ref(py), - tag_idx, - } + let doc_ref = doc.borrow(py); + Self::make_element_borrowed(py, doc, &doc_ref, tag_idx) } - fn make_elements( + fn make_element_borrowed( py: Python<'_>, doc: &Py, - tag_indices: impl Iterator, - ) -> Vec { - tag_indices - .map(|idx| Element { - doc: doc.clone_ref(py), - tag_idx: idx, - }) - .collect() + doc_ref: &Document, + tag_idx: usize, + ) -> Element { + let index = doc_ref.index(); + let cached_tag = doc_ref.interned_tag(py, index, tag_idx); + Element { + doc: doc.clone_ref(py), + tag_idx, + cached_tag, + } } } #[pymethods] impl Document { /// Evaluate an XPath expression and return text content of matches. - fn xpath_text(&self, expr: &str) -> PyResult> { + /// + /// Returns the direct child text of each matching element. + fn xpath_text(&self, py: Python<'_>, expr: &str) -> PyResult>> { let index = self.index(); let results = index .xpath_text(expr) .map_err(|e| PyValueError::new_err(e.to_string()))?; - Ok(results.into_iter().map(|s| s.to_string()).collect()) + Ok(results + .into_iter() + .map(|s| PyString::new(py, s).unbind()) + .collect()) } - /// Evaluate an XPath expression and return the XPath string-value of matches. - fn xpath_string(&self, expr: &str) -> PyResult> { + /// Evaluate an XPath expression and return string-values of matches. + /// + /// Returns all descendant text for each match (XPath `string()` semantics). + fn xpath_string(&self, py: Python<'_>, expr: &str) -> PyResult>> { let index = self.index(); - index + let results = index .xpath_string(expr) - .map_err(|e| PyValueError::new_err(e.to_string())) + .map_err(|e| PyValueError::new_err(e.to_string()))?; + Ok(results + .into_iter() + .map(|s| PyString::new(py, &s).unbind()) + .collect()) } - /// Evaluate an XPath expression. Returns Element list for node-sets, - /// strings for text/attribute nodes. + /// Evaluate an XPath expression. + /// + /// Returns Element objects for element nodes, strings for text/attribute nodes. fn xpath(slf: &Bound<'_, Self>, expr: &str) -> PyResult> { let py = slf.py(); let doc_py: Py = slf.clone().unbind(); @@ -93,7 +129,7 @@ impl Document { for node in &nodes { match node { XPathNode::Element(idx) => { - let elem = Document::make_element(py, &doc_py, *idx); + let elem = Document::make_element_borrowed(py, &doc_py, &this, *idx); result.append(elem.into_pyobject(py)?)?; } XPathNode::Text(idx) => { @@ -101,8 +137,9 @@ impl Document { result.append(text)?; } XPathNode::Attribute(tag_idx, _) => { - if let Some(val) = get_first_attribute(index, *tag_idx) { - result.append(val)?; + let attrs = index.attributes(*tag_idx); + if let Some((_, val)) = attrs.first() { + result.append(*val)?; } } XPathNode::Namespace(_, _) => {} @@ -111,7 +148,7 @@ impl Document { Ok(result.unbind()) } - /// The root element of the document. + /// The root element of the document, or None if empty. #[getter] fn root(slf: &Bound<'_, Self>) -> Option { let py = slf.py(); @@ -123,13 +160,13 @@ impl Document { && (index.tag_type(i) == simdxml::index::TagType::Open || index.tag_type(i) == simdxml::index::TagType::SelfClose) { - return Some(Document::make_element(py, &doc_py, i)); + return Some(Document::make_element_borrowed(py, &doc_py, &this, i)); } } None } - /// Number of tags in the structural index. + /// Total number of XML tags in the document. #[getter] fn tag_count(&self) -> usize { self.index().tag_count() @@ -146,154 +183,141 @@ impl Document { } // --------------------------------------------------------------------------- -// Element — lightweight flyweight handle into a Document +// Element // --------------------------------------------------------------------------- -/// A read-only element in a parsed XML document. +/// A read-only XML element. /// -/// Holds a Python reference to the Document (preventing GC) plus a tag index. +/// Supports the ElementTree API (.tag, .text, .attrib, .get(), len(), +/// indexing, iteration) plus lxml extensions (.xpath(), .getparent(), +/// .getnext(), .getprevious()). #[pyclass(skip_from_py_object)] struct Element { - /// Python-ref-counted handle to the owning Document. doc: Py, tag_idx: usize, -} - -impl Element { - fn with_index<'py, R>(&self, py: Python<'py>, f: impl FnOnce(&XmlIndex<'_>, &[u32]) -> R) -> R { - let doc = self.doc.borrow(py); - f(doc.index(), doc.parents()) - } + cached_tag: Py, } #[pymethods] impl Element { - /// The tag name. + /// The element's tag name (e.g., 'book', 'title'). #[getter] - fn tag(&self, py: Python<'_>) -> String { - self.with_index(py, |index, _| index.tag_name(self.tag_idx).to_string()) + fn tag(&self, py: Python<'_>) -> Py { + self.cached_tag.clone_ref(py) } - /// Direct text content, or None. + /// Text content before the first child element, or None. + /// + /// For `

Hello world

`, `p.text` is `'Hello '`. #[getter] - fn text(&self, py: Python<'_>) -> Option { - self.with_index(py, |index, _| { - let texts = index.direct_text(self.tag_idx); - if texts.is_empty() { - return None; - } - let first = texts[0]; - if first.is_empty() { - None - } else { - Some(XmlIndex::decode_entities(first).into_owned()) - } + fn text(&self, py: Python<'_>) -> Option> { + let doc = self.doc.borrow(py); + let index = doc.index(); + // Uses upstream direct_text_first — zero-alloc, no Vec + index.direct_text_first(self.tag_idx).map(|s| { + let decoded = XmlIndex::decode_entities(s); + PyString::new(py, &decoded).unbind() }) } - /// Text after this element's closing tag (before next sibling). + /// Text content after this element's closing tag, or None. + /// + /// For `

Hello world more

`, `b.tail` is `' more'`. #[getter] - fn tail(&self, py: Python<'_>) -> Option { - self.with_index(py, |index, parents| { - let parent = parents[self.tag_idx]; - if parent == u32::MAX { - return None; - } - - let parent_raw = index.raw_xml(parent as usize); - let my_raw = index.raw_xml(self.tag_idx); - - if let Some(pos) = parent_raw.find(my_raw) { - let after = &parent_raw[pos + my_raw.len()..]; - if let Some(lt) = after.find('<') { - let text = &after[..lt]; - if !text.is_empty() { - return Some(XmlIndex::decode_entities(text).into_owned()); - } - } - } - None + fn tail(&self, py: Python<'_>) -> Option> { + let doc = self.doc.borrow(py); + let index = doc.index(); + // Uses upstream tail_text — proper implementation using text_ranges + index.tail_text(self.tag_idx).map(|s| { + let decoded = XmlIndex::decode_entities(s); + PyString::new(py, &decoded).unbind() }) } - /// Dictionary of attributes. + /// Dictionary of this element's attributes. #[getter] fn attrib(&self, py: Python<'_>) -> PyResult> { let doc = self.doc.borrow(py); let index = doc.index(); let dict = pyo3::types::PyDict::new(py); - for name in index.get_all_attribute_names(self.tag_idx) { - if let Some(val) = index.get_attribute(self.tag_idx, name) { - dict.set_item(name, val)?; - } + // Single-pass attribute parsing via upstream attributes() + for (name, val) in index.attributes(self.tag_idx) { + dict.set_item(name, val)?; } Ok(dict.unbind()) } /// Get an attribute value by name, with optional default. #[pyo3(signature = (key, default=None))] - fn get(&self, py: Python<'_>, key: &str, default: Option<&str>) -> Option { - self.with_index(py, |index, _| { - index - .get_attribute(self.tag_idx, key) - .map(|s| s.to_string()) - .or_else(|| default.map(|s| s.to_string())) - }) + fn get(&self, py: Python<'_>, key: &str, default: Option<&str>) -> Option> { + let doc = self.doc.borrow(py); + let index = doc.index(); + index + .get_attribute(self.tag_idx, key) + .map(|s| PyString::new(py, s).unbind()) + .or_else(|| default.map(|s| PyString::new(py, s).unbind())) } /// Attribute names. - fn keys(&self, py: Python<'_>) -> Vec { - self.with_index(py, |index, _| { - index - .get_all_attribute_names(self.tag_idx) - .into_iter() - .map(|s| s.to_string()) - .collect() - }) + fn keys(&self, py: Python<'_>) -> Vec> { + let doc = self.doc.borrow(py); + let index = doc.index(); + index + .attributes(self.tag_idx) + .into_iter() + .map(|(name, _)| PyString::new(py, name).unbind()) + .collect() } /// (name, value) attribute pairs. - fn items(&self, py: Python<'_>) -> Vec<(String, String)> { - self.with_index(py, |index, _| { - index - .get_all_attribute_names(self.tag_idx) - .into_iter() - .filter_map(|name| { - index - .get_attribute(self.tag_idx, name) - .map(|val| (name.to_string(), val.to_string())) - }) - .collect() - }) + fn items(&self, py: Python<'_>) -> Vec<(Py, Py)> { + let doc = self.doc.borrow(py); + let index = doc.index(); + index + .attributes(self.tag_idx) + .into_iter() + .map(|(name, val)| { + ( + PyString::new(py, name).unbind(), + PyString::new(py, val).unbind(), + ) + }) + .collect() } - /// Number of direct child elements. + /// Number of direct child elements (zero allocation). fn __len__(&self, py: Python<'_>) -> usize { - self.with_index(py, |index, _| index.children(self.tag_idx).len()) + let doc = self.doc.borrow(py); + doc.index().child_count(self.tag_idx) } - /// Get the i-th child element. + /// Get a child element by index. Supports negative indexing. fn __getitem__(&self, py: Python<'_>, index: isize) -> PyResult { let doc = self.doc.borrow(py); - let children = doc.index().children(self.tag_idx); - let len = children.len() as isize; + let idx = doc.index(); + let len = idx.child_count(self.tag_idx) as isize; let i = if index < 0 { len + index } else { index }; if i < 0 || i >= len { return Err(pyo3::exceptions::PyIndexError::new_err( "element index out of range", )); } - Ok(Document::make_element(py, &self.doc, children[i as usize])) + let child = idx.child_at(self.tag_idx, i as usize).ok_or_else(|| { + pyo3::exceptions::PyIndexError::new_err("element index out of range") + })?; + Ok(Document::make_element_borrowed(py, &self.doc, &doc, child)) } /// Iterate over direct child elements. fn __iter__(&self, py: Python<'_>) -> ElementIterator { let doc = self.doc.borrow(py); - ElementIterator { - doc: self.doc.clone_ref(py), - children: doc.index().children(self.tag_idx), - pos: 0, - } + let index = doc.index(); + let children: Vec = index + .child_slice(self.tag_idx) + .iter() + .map(|&c| c as usize) + .collect(); + ElementIterator::new(py, &self.doc, &doc, children) } /// Iterate descendant elements, optionally filtered by tag name. @@ -314,47 +338,80 @@ impl Element { } } } - ElementIterator { - doc: self.doc.clone_ref(py), - children: descendants, - pos: 0, + ElementIterator::new(py, &self.doc, &doc, descendants) + } + + /// All direct child tag names as a list (single FFI call, interned). + fn child_tags(&self, py: Python<'_>) -> Vec> { + let doc = self.doc.borrow(py); + let index = doc.index(); + index + .child_slice(self.tag_idx) + .iter() + .map(|&child| doc.interned_tag(py, index, child as usize)) + .collect() + } + + /// All descendant tag names, optionally filtered. + #[pyo3(signature = (tag=None))] + fn descendant_tags(&self, py: Python<'_>, tag: Option<&str>) -> Vec> { + let doc = self.doc.borrow(py); + let index = doc.index(); + let start = self.tag_idx; + let close = index.matching_close(start).unwrap_or(start); + + let mut result = Vec::new(); + for i in (start + 1)..=close { + let tt = index.tag_type(i); + if tt == simdxml::index::TagType::Open || tt == simdxml::index::TagType::SelfClose { + match tag { + Some(filter) if index.tag_name(i) != filter => {} + _ => result.push(doc.interned_tag(py, index, i)), + } + } } + result } - /// All text content (depth-first) as a list of strings. - fn itertext(&self, py: Python<'_>) -> Vec { + /// All text content within this element, depth-first. + fn itertext(&self, py: Python<'_>) -> Vec> { let doc = self.doc.borrow(py); let index = doc.index(); let mut texts = Vec::new(); - collect_text(index, self.tag_idx, &mut texts); + collect_text_py(py, index, self.tag_idx, &mut texts); texts } - /// Concatenation of all descendant text. - fn text_content(&self, py: Python<'_>) -> String { - self.with_index(py, |index, _| index.all_text(self.tag_idx)) + /// All descendant text concatenated into a single string. + fn text_content(&self, py: Python<'_>) -> Py { + let doc = self.doc.borrow(py); + let text = doc.index().all_text(self.tag_idx); + PyString::new(py, &text).unbind() } - /// Evaluate full XPath 1.0 from this element as context node. - fn xpath(&self, py: Python<'_>, expr: &str) -> PyResult> { + /// Evaluate an XPath 1.0 expression with this element as context. + fn xpath(&self, py: Python<'_>, expr: &str) -> PyResult { let doc = self.doc.borrow(py); let index = doc.index(); let nodes = index .xpath_from(expr, self.tag_idx) .map_err(|e| PyValueError::new_err(e.to_string()))?; - Ok(Document::make_elements( - py, - &self.doc, - nodes.into_iter().filter_map(|n| match n { + let indices: Vec = nodes + .into_iter() + .filter_map(|n| match n { XPathNode::Element(idx) => Some(idx), _ => None, - }), - )) + }) + .collect(); + Ok(ElementList { + doc: self.doc.clone_ref(py), + indices, + }) } - /// XPath text extraction from this element as context. - fn xpath_text(&self, py: Python<'_>, expr: &str) -> PyResult> { + /// Evaluate an XPath expression and return text content of matches. + fn xpath_text(&self, py: Python<'_>, expr: &str) -> PyResult>> { let doc = self.doc.borrow(py); let index = doc.index(); let results = index @@ -365,17 +422,17 @@ impl Element { for node in &results { match node { XPathNode::Element(idx) => { - let dt = index.direct_text(*idx); - if !dt.is_empty() { - texts.push(dt.join("")); + if let Some(first) = index.direct_text_first(*idx) { + texts.push(PyString::new(py, first).unbind()); } } XPathNode::Text(idx) => { - texts.push(index.text_by_index(*idx).to_string()); + texts.push(PyString::new(py, index.text_by_index(*idx)).unbind()); } XPathNode::Attribute(tag_idx, _) => { - if let Some(val) = get_first_attribute(index, *tag_idx) { - texts.push(val); + let attrs = index.attributes(*tag_idx); + if let Some((_, val)) = attrs.first() { + texts.push(PyString::new(py, val).unbind()); } } _ => {} @@ -386,51 +443,44 @@ impl Element { /// Parent element, or None for root. fn getparent(&self, py: Python<'_>) -> Option { - self.with_index(py, |_, parents| { - let parent = parents[self.tag_idx]; - if parent == u32::MAX { - None - } else { - Some(Document::make_element(py, &self.doc, parent as usize)) - } - }) + let doc = self.doc.borrow(py); + let index = doc.index(); + // Uses upstream parent() directly + index + .parent(self.tag_idx) + .map(|p| Document::make_element_borrowed(py, &self.doc, &doc, p)) } /// Next sibling element, or None. fn getnext(&self, py: Python<'_>) -> Option { - self.with_index(py, |index, parents| { - let parent = parents[self.tag_idx]; - if parent == u32::MAX { - return None; - } - let siblings = index.children(parent as usize); - let pos = siblings.iter().position(|&s| s == self.tag_idx)?; - siblings - .get(pos + 1) - .map(|&idx| Document::make_element(py, &self.doc, idx)) - }) + let doc = self.doc.borrow(py); + let index = doc.index(); + let pos = index.child_position(self.tag_idx)?; + let parent = index.parent(self.tag_idx)?; + index + .child_at(parent, pos + 1) + .map(|idx| Document::make_element_borrowed(py, &self.doc, &doc, idx)) } /// Previous sibling element, or None. fn getprevious(&self, py: Python<'_>) -> Option { - self.with_index(py, |index, parents| { - let parent = parents[self.tag_idx]; - if parent == u32::MAX { - return None; - } - let siblings = index.children(parent as usize); - let pos = siblings.iter().position(|&s| s == self.tag_idx)?; - if pos > 0 { - Some(Document::make_element(py, &self.doc, siblings[pos - 1])) - } else { - None - } - }) + let doc = self.doc.borrow(py); + let index = doc.index(); + let pos = index.child_position(self.tag_idx)?; + if pos == 0 { + return None; + } + let parent = index.parent(self.tag_idx)?; + index + .child_at(parent, pos - 1) + .map(|idx| Document::make_element_borrowed(py, &self.doc, &doc, idx)) } - /// Raw XML for this element (opening through closing tag). - fn tostring(&self, py: Python<'_>) -> String { - self.with_index(py, |index, _| index.raw_xml(self.tag_idx).to_string()) + /// Serialize this element to an XML string. + fn tostring(&self, py: Python<'_>) -> Py { + let doc = self.doc.borrow(py); + let raw = doc.index().raw_xml(self.tag_idx); + PyString::new(py, raw).unbind() } // -- Read-only enforcement -- @@ -450,35 +500,40 @@ impl Element { Err(readonly_error()) } + /// Not supported. Raises TypeError (simdxml elements are read-only). #[pyo3(name = "set")] fn set_attr(&self, _key: &str, _value: &str) -> PyResult<()> { Err(readonly_error()) } + /// Not supported. Raises TypeError (simdxml elements are read-only). fn append(&self, _element: &Element) -> PyResult<()> { Err(readonly_error()) } + /// Not supported. Raises TypeError (simdxml elements are read-only). fn remove(&self, _element: &Element) -> PyResult<()> { Err(readonly_error()) } + /// Not supported. Raises TypeError (simdxml elements are read-only). #[pyo3(signature = (_index, _element))] fn insert(&self, _index: isize, _element: &Element) -> PyResult<()> { Err(readonly_error()) } + /// Not supported. Raises TypeError (simdxml elements are read-only). fn clear(&self) -> PyResult<()> { Err(readonly_error()) } fn __repr__(&self, py: Python<'_>) -> String { - let tag = self.with_index(py, |index, _| index.tag_name(self.tag_idx).to_string()); - format!("Element('{tag}')") + let tag_str = self.cached_tag.bind(py).to_cow().unwrap_or_default(); + format!("Element('{tag_str}')") } - fn __str__(&self, py: Python<'_>) -> String { - self.with_index(py, |index, _| index.tag_name(self.tag_idx).to_string()) + fn __str__(&self, py: Python<'_>) -> Py { + self.cached_tag.clone_ref(py) } fn __bool__(&self) -> bool { @@ -499,16 +554,34 @@ impl Element { } // --------------------------------------------------------------------------- -// Element iterator +// ElementIterator — pre-caches interned tags to avoid per-next borrow // --------------------------------------------------------------------------- #[pyclass] struct ElementIterator { doc: Py, - children: Vec, + items: Vec<(usize, Py)>, pos: usize, } +impl ElementIterator { + fn new(py: Python<'_>, doc: &Py, doc_ref: &Document, indices: Vec) -> Self { + let index = doc_ref.index(); + let items: Vec<(usize, Py)> = indices + .into_iter() + .map(|idx| { + let tag = doc_ref.interned_tag(py, index, idx); + (idx, tag) + }) + .collect(); + ElementIterator { + doc: doc.clone_ref(py), + items, + pos: 0, + } + } +} + #[pymethods] impl ElementIterator { fn __iter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> { @@ -516,21 +589,103 @@ impl ElementIterator { } fn __next__(&mut self, py: Python<'_>) -> Option { - if self.pos < self.children.len() { - let idx = self.children[self.pos]; + if self.pos < self.items.len() { + let (idx, ref cached_tag) = self.items[self.pos]; self.pos += 1; - Some(Document::make_element(py, &self.doc, idx)) + Some(Element { + doc: self.doc.clone_ref(py), + tag_idx: idx, + cached_tag: cached_tag.clone_ref(py), + }) } else { None } } + + fn __len__(&self) -> usize { + self.items.len() - self.pos + } +} + +// --------------------------------------------------------------------------- +// ElementList — lazy sequence returned by xpath/eval +// --------------------------------------------------------------------------- + +/// A lazy list of elements. Holds one Document reference and a Vec of tag +/// indices. Element objects are created on demand when accessed. +#[pyclass(sequence)] +struct ElementList { + doc: Py, + indices: Vec, +} + +#[pymethods] +impl ElementList { + fn __len__(&self) -> usize { + self.indices.len() + } + + fn __getitem__(&self, py: Python<'_>, index: isize) -> PyResult { + let len = self.indices.len() as isize; + let i = if index < 0 { len + index } else { index }; + if i < 0 || i >= len { + return Err(pyo3::exceptions::PyIndexError::new_err( + "list index out of range", + )); + } + Ok(Document::make_element( + py, + &self.doc, + self.indices[i as usize], + )) + } + + fn __iter__(&self, py: Python<'_>) -> ElementIterator { + let doc_ref = self.doc.borrow(py); + ElementIterator::new(py, &self.doc, &doc_ref, self.indices.clone()) + } + + fn __bool__(&self) -> bool { + !self.indices.is_empty() + } + + fn __eq__(&self, _py: Python<'_>, other: &Bound<'_, pyo3::PyAny>) -> bool { + if let Ok(list) = other.cast::() { + if list.len() != self.indices.len() { + return false; + } + for (i, item) in list.iter().enumerate() { + if let Ok(elem) = item.cast::() { + let elem_ref = elem.borrow(); + if elem_ref.tag_idx != self.indices[i] || !elem_ref.doc.is(&self.doc) { + return false; + } + } else { + return false; + } + } + return true; + } + if let Ok(other_list) = other.cast::() { + let other_ref = other_list.borrow(); + return self.doc.is(&other_ref.doc) && self.indices == other_ref.indices; + } + false + } + + fn __repr__(&self) -> String { + format!("ElementList(len={})", self.indices.len()) + } } // --------------------------------------------------------------------------- // CompiledXPath // --------------------------------------------------------------------------- -/// A compiled XPath expression for repeated evaluation. +/// A compiled XPath expression for repeated use. +/// +/// Like `re.compile()` -- parse the expression once, evaluate many times +/// across different documents. #[pyclass] struct CompiledXPath { inner: simdxml::CompiledXPath, @@ -538,18 +693,21 @@ struct CompiledXPath { #[pymethods] impl CompiledXPath { - /// Evaluate and return text content of matches. - fn eval_text(&self, doc: &Document) -> PyResult> { + /// Evaluate and return text content of matching nodes. + fn eval_text(&self, py: Python<'_>, doc: &Document) -> PyResult>> { let index = doc.index(); let results = self .inner .eval_text(index) .map_err(|e| PyValueError::new_err(e.to_string()))?; - Ok(results.into_iter().map(|s| s.to_string()).collect()) + Ok(results + .into_iter() + .map(|s| PyString::new(py, s).unbind()) + .collect()) } - /// Evaluate and return matching Element nodes. - fn eval(slf: &Bound<'_, Self>, doc: &Bound<'_, Document>) -> PyResult> { + /// Evaluate and return matching elements as an ElementList (lazy). + fn eval(slf: &Bound<'_, Self>, doc: &Bound<'_, Document>) -> PyResult { let this = slf.borrow(); let doc_ref = doc.borrow(); let doc_py: Py = doc.clone().unbind(); @@ -559,18 +717,20 @@ impl CompiledXPath { .eval(index) .map_err(|e| PyValueError::new_err(e.to_string()))?; - let py = slf.py(); - Ok(Document::make_elements( - py, - &doc_py, - nodes.into_iter().filter_map(|n| match n { + let indices: Vec = nodes + .into_iter() + .filter_map(|n| match n { XPathNode::Element(idx) => Some(idx), _ => None, - }), - )) + }) + .collect(); + Ok(ElementList { + doc: doc_py, + indices, + }) } - /// Check if any nodes match. + /// Check whether any nodes match. fn eval_exists(&self, doc: &Document) -> PyResult { let index = doc.index(); let nodes = self @@ -580,7 +740,7 @@ impl CompiledXPath { Ok(!nodes.is_empty()) } - /// Count matching nodes. + /// Count the number of matching nodes. fn eval_count(&self, doc: &Document) -> PyResult { let index = doc.index(); let nodes = self @@ -605,40 +765,22 @@ fn readonly_error() -> PyErr { ) } -fn get_first_attribute(index: &XmlIndex<'_>, tag_idx: usize) -> Option { - let names = index.get_all_attribute_names(tag_idx); - names - .first() - .and_then(|name| index.get_attribute(tag_idx, name)) - .map(|s| s.to_string()) -} - -/// Build a parent map from the public children() API. -fn build_parent_map(index: &XmlIndex<'_>) -> Vec { - let n = index.tag_count(); - let mut parents = vec![u32::MAX; n]; - for i in 0..n { - let tt = index.tag_type(i); - if tt == simdxml::index::TagType::Open { - for child in index.children(i) { - if child < n { - parents[child] = i as u32; - } - } - } - } - parents -} - -/// Recursively collect text content depth-first (for itertext). -fn collect_text(index: &XmlIndex<'_>, tag_idx: usize, out: &mut Vec) { +/// Recursively collect text content depth-first, building PyStrings directly. +fn collect_text_py( + py: Python<'_>, + index: &XmlIndex<'_>, + tag_idx: usize, + out: &mut Vec>, +) { for text in index.direct_text(tag_idx) { if !text.is_empty() { - out.push(XmlIndex::decode_entities(text).into_owned()); + let decoded = XmlIndex::decode_entities(text); + out.push(PyString::new(py, &decoded).unbind()); } } - for child in index.children(tag_idx) { - collect_text(index, child, out); + // Use child_slice for zero-alloc child enumeration + for &child in index.child_slice(tag_idx) { + collect_text_py(py, index, child as usize, out); } } @@ -646,30 +788,69 @@ fn collect_text(index: &XmlIndex<'_>, tag_idx: usize, out: &mut Vec) { // Module-level functions // --------------------------------------------------------------------------- -/// Parse XML bytes or string into a Document. +/// Parse XML into a Document. +/// +/// Accepts bytes or str. For bytes input, the buffer is used directly (zero-copy). +/// For str input, the string is encoded to UTF-8 bytes. #[pyfunction] -fn parse(data: &Bound<'_, PyAny>) -> PyResult { - let bytes: Vec = if let Ok(b) = data.cast_exact::() { - b.as_bytes().to_vec() +fn parse(py: Python<'_>, data: &Bound<'_, PyAny>) -> PyResult { + let owner = if data.is_instance_of::() { + let backed: PyBackedBytes = data.extract()?; + DocumentOwner::ZeroCopy(backed) } else if let Ok(s) = data.extract::() { - s.into_bytes() + DocumentOwner::Owned(s.into_bytes()) } else { return Err(PyTypeError::new_err("parse() requires bytes or str")); }; - let inner = DocumentInner::try_new(bytes, |owner| { + let inner = DocumentInner::try_new(owner, |owner| { let mut index = simdxml::parse(owner).map_err(|e| PyValueError::new_err(e.to_string()))?; index.ensure_indices(); index.build_name_index(); - let parents = build_parent_map(&index); - Ok::<_, PyErr>(IndexWithMeta { index, parents }) + Ok::<_, PyErr>(index) })?; - Ok(Document { inner }) + // Build interned Python strings from upstream's name_table. + // name_table[id] = (byte_offset, length) into input. We need to resolve + // these to actual strings. Since input is private, we find one tag per + // name_id and use tag_name() on it. + let interned_names = { + let index = inner.borrow_dependent(); + let n_names = index.name_table.len(); + let mut names: Vec> = Vec::with_capacity(n_names); + let mut found = vec![false; n_names]; + + for i in 0..index.tag_count() { + if index.name_ids.is_empty() { + break; + } + let nid = index.name_ids[i]; + if nid != u16::MAX && (nid as usize) < n_names && !found[nid as usize] { + found[nid as usize] = true; + // Ensure we have enough slots + while names.len() <= nid as usize { + names.push(PyString::new(py, "").unbind()); + } + names[nid as usize] = PyString::new(py, index.tag_name(i)).unbind(); + } + if found.iter().all(|&f| f) { + break; // All names found + } + } + names + }; + + Ok(Document { + inner, + interned_names, + }) } -/// Compile an XPath expression for repeated evaluation. +/// Compile an XPath expression for repeated use. +/// +/// Like `re.compile()` -- parse the expression once, evaluate many times +/// across different documents. #[pyfunction] fn compile(expr: &str) -> PyResult { let inner = @@ -685,6 +866,7 @@ fn compile(expr: &str) -> PyResult { fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_function(wrap_pyfunction!(parse, m)?)?; m.add_function(wrap_pyfunction!(compile, m)?)?;