From 31210e2cf9702c8bfdac07872b68172f095000d6 Mon Sep 17 00:00:00 2001
From: Christopher Grainger <chris@amplified.ai>
Date: Fri, 27 Mar 2026 21:16:51 +1100
Subject: [PATCH 1/7] v0.2.0: Interned tag names, batch APIs, honest benchmarks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Performance:
- Intern unique tag names as Python strings at parse time (~20-200
  unique names). Element.tag is now a refcount bump, not a string copy.
- Eagerly cache tag on Element creation (zero FFI on .tag access).
- New batch APIs: child_tags(), descendant_tags() — single FFI call
  for all results using interned strings. 25x faster than lxml on
  large-document traversal.
- Eliminate double-borrow in make_element when callers already hold
  a Document reference.

Benchmarks:
- GC disabled, 3 warmup + 20 timed iterations, median reported.
- Three corpus types: catalog (data), PubMed (document), POM (config).
- XPath benchmarks compare elements-to-elements (fair).
- Fixed POM namespace artifact that made lxml appear faster (it was
  returning 0 results due to xmlns mismatch).
- Traversal section honestly shows per-element FFI overhead alongside
  the batch API that eliminates it.
- Removed POM xmlns from benchmark corpus for fair cross-library
  comparison.

README:
- Updated all benchmark tables with v0.2.0 numbers.
- Documents batch APIs in API section.
- Notes that parse() includes index construction.
- Honest framing of traversal trade-offs.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 Cargo.toml                 |   2 +-
 README.md                  |  85 ++++++---
 bench/bench_parse.py       | 372 ++++++++++++++++++++++++++++---------
 pyproject.toml             |   2 +-
 python/simdxml/__init__.py |   2 +-
 python/simdxml/_core.pyi   |   2 +
 src/lib.rs                 | 278 +++++++++++++++++++++------
 7 files changed, 570 insertions(+), 173 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 2d6b234..44cf957 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "simdxml-python"
-version = "0.1.0"
+version = "0.2.0"
 edition = "2021"
 
 [lib]
diff --git a/README.md b/README.md
index cc347b3..0f04366 100644
--- a/README.md
+++ b/README.md
@@ -62,6 +62,10 @@ elem.getprevious()                 # previous sibling or None
 elem.xpath(".//title")             # context-node evaluation
 elem.xpath_text("author")         # text extraction from context
 
+# Batch APIs (single FFI call, interned strings)
+root.child_tags()                  # -> list[str] of child tag names
+root.descendant_tags("item")       # -> list[str] filtered by tag
+
 # Compiled XPath (like re.compile)
 expr = simdxml.compile("//title")
 expr.eval_text(doc)                # -> list[str]
@@ -118,33 +122,71 @@ Full conformance with XPath 1.0:
 
 ## Benchmarks
 
-Measured on Apple Silicon (M-series), Python 3.14, comparing against
-lxml 6.0 and stdlib `xml.etree.ElementTree`. Run with `uv run python bench/bench_parse.py`.
+Apple Silicon, Python 3.14, lxml 6.0. GC disabled during timing, 3 warmup +
+20 timed iterations, median reported. Three corpus types: data-oriented
+(product catalog), document-oriented (PubMed abstracts), config-oriented
+(Maven POM). Run yourself: `uv run python bench/bench_parse.py`
+
+### Parse
+
+`simdxml.parse()` eagerly builds structural indices (CSR, name posting,
+parent map). lxml's `fromstring()` builds a DOM tree without precomputed
+query indices. simdxml front-loads more work into parse so queries are faster
+-- both numbers are real, the trade-off depends on your workload.
+
+| Corpus | Size | simdxml | lxml | vs lxml | vs stdlib |
+|--------|------|---------|------|---------|-----------|
+| Catalog (data) | 1.6 MB | 4.8 ms | 8.5 ms | 1.8x | 3.1x |
+| Catalog (data) | 17 MB | 57 ms | 86 ms | 1.5x | 2.7x |
+| PubMed (doc) | 1.7 MB | 4.1 ms | 6.3 ms | 1.5x | 3.3x |
+| PubMed (doc) | 17 MB | 46 ms | 64 ms | 1.4x | 3.1x |
+| POM (config) | 2.1 MB | 4.8 ms | 8.6 ms | 1.8x | 3.8x |
+
+### XPath queries (returning Elements -- apples-to-apples)
+
+| Query | Corpus | simdxml | lxml | vs lxml |
+|-------|--------|---------|------|---------|
+| `//item` | Catalog 17 MB | 4.0 ms | 22.5 ms | **5.6x** |
+| `//item[@category="cat5"]` | Catalog 17 MB | 1.7 ms | 72 ms | **41x** |
+| `//PubmedArticle` | PubMed 17 MB | 0.41 ms | 10.4 ms | **25x** |
+| `//Author[LastName="Auth0_0"]` | PubMed 17 MB | 17.6 ms | 30.7 ms | **1.7x** |
+| `//dependency` | POM 2.1 MB | 0.41 ms | 0.72 ms | 1.8x |
+| `//dependency[scope="test"]` | POM 2.1 MB | 2.5 ms | 3.5 ms | 1.4x |
+
+The predicate speedup on large documents is dramatic because the structural
+index enables direct attribute comparison without materializing DOM nodes.
+
+### XPath text extraction
+
+`xpath_text()` returns strings directly, avoiding Python Element object
+creation. This is the optimized path for ETL / data extraction workloads.
 
-### Parse throughput
+| Query | Corpus | simdxml | lxml xpath+.text | vs lxml |
+|-------|--------|---------|------------------|---------|
+| `//name` | Catalog 17 MB | 3.1 ms | 42 ms | **14x** |
+| `//AbstractText` | PubMed 17 MB | 0.64 ms | 8.3 ms | **13x** |
+| `//artifactId` | POM 2.1 MB | 0.39 ms | 0.70 ms | 1.8x |
 
-| Document | simdxml | lxml | stdlib ET | vs lxml | vs stdlib |
-|----------|---------|------|-----------|---------|-----------|
-| 20 KB (100 items) | 0.05 ms | 0.09 ms | 0.15 ms | 1.8x | 3.0x |
-| 2 MB (10K items) | 3.3 ms | 8.5 ms | 16.7 ms | 2.6x | 5.0x |
-| 20 MB (100K items) | 40 ms | 87 ms | 181 ms | **2.2x** | **4.5x** |
+### Element traversal
 
-### XPath query: `//name`
+simdxml provides two traversal modes:
 
-| Document | simdxml | lxml | stdlib findall | vs lxml | vs stdlib |
-|----------|---------|------|----------------|---------|-----------|
-| 2 MB | 0.3 ms | 1.0 ms | 0.7 ms | 3.1x | 2.1x |
-| 20 MB | 3.8 ms | 19.7 ms | 7.3 ms | **5.2x** | **1.9x** |
+**Batch API** (`child_tags()`, `descendant_tags()`): returns all tag names
+in a single FFI call using interned Python strings. This is the fast path.
 
-### XPath query with predicate: `//item[@category="cat5"]`
+**Per-element iteration** (`for e in root`): creates flyweight Element
+objects. Each `.tag` access is a refcount bump on an interned string (no
+copy), but creating Element objects has unavoidable PyO3 overhead.
 
-| Document | simdxml | lxml | stdlib findall | vs lxml |
-|----------|---------|------|----------------|---------|
-| 2 MB | 0.2 ms | 2.8 ms | 0.8 ms | 16x |
-| 20 MB | 2.0 ms | 46 ms | 9.1 ms | **23x** |
+| Corpus | `child_tags()` | `[e.tag]` loop | lxml loop | stdlib loop | batch vs lxml |
+|--------|----------------|----------------|-----------|-------------|---------------|
+| Catalog 17 MB | **0.45 ms** | 5.4 ms | 11.3 ms | 2.1 ms | **25x** |
+| PubMed 17 MB | **0.05 ms** | 0.53 ms | 0.62 ms | 0.16 ms | **13x** |
+| POM 2.1 MB | **0.2 us** | 0.5 us | 0.7 us | 0.3 us | **3x** |
 
-The predicate speedup is dramatic because simdxml's structural index enables
-direct attribute comparison without materializing DOM nodes.
+Use `child_tags()` / `descendant_tags()` when you need tag names. Use
+`xpath_text()` when you need text. Reserve per-element iteration for when
+you need to navigate the tree interactively.
 
 ## How it works
 
@@ -157,7 +199,8 @@ and parents -- all indexed by the same position.
 - O(1) ancestor/descendant checks via pre/post-order numbering
 - O(1) child enumeration via CSR (Compressed Sparse Row) indices
 - SIMD-accelerated structural parsing (NEON on ARM, AVX2 on x86)
-- Lazy index building: CSR indices built on first query, not at parse time
+- Parse eagerly builds all indices (CSR, name posting, parent map) so
+  subsequent queries pay zero index construction cost
 
 ## Platform support
 
diff --git a/bench/bench_parse.py b/bench/bench_parse.py
index 48ca246..3eb532f 100644
--- a/bench/bench_parse.py
+++ b/bench/bench_parse.py
@@ -1,11 +1,27 @@
 """Benchmark: simdxml vs lxml vs stdlib xml.etree.ElementTree.
 
+Methodology:
+  - GC disabled during timing to avoid collection noise
+  - 3 warmup iterations discarded, then 20 timed iterations
+  - Reports median (robust to outliers from page faults, scheduling)
+  - All XPath benchmarks compare like-for-like: elements vs elements
+  - Both synthetic and real-world-shaped corpora
+
+Note: simdxml.parse() eagerly builds structural indices (CSR, name
+posting, parent map). lxml.fromstring() builds a DOM tree without
+precomputed indices. This means simdxml front-loads more work into
+parse, then queries are faster. Both numbers are real -- the question
+is which workload you have.
+
 Usage:
     uv run python bench/bench_parse.py
 """
 
 from __future__ import annotations
 
+import gc
+import random
+import sys
 import time
 import xml.etree.ElementTree as StdET
 
@@ -19,133 +35,313 @@
     HAS_LXML = False
 
 
-def generate_xml(n_items: int) -> bytes:
-    """Generate a catalog XML with n_items."""
+# ---------------------------------------------------------------------------
+# Corpus generators
+# ---------------------------------------------------------------------------
+
+
+def gen_catalog(n: int) -> bytes:
+    """Data-oriented: uniform structure, many attributes."""
     items = "\n".join(
-        f'  <item id="{i}" category="cat{i % 10}">'
+        "  "
+        f'<item id="{i}" category="cat{i % 10}">'
         f"<name>Item {i}</name>"
-        f"<description>Description for item {i} with some text content</description>"
+        f"<description>Desc for item {i}</description>"
         f"<price>{i * 1.5:.2f}</price>"
-        f"<tags><tag>tag{i % 5}</tag><tag>tag{i % 3}</tag></tags>"
+        f"<tags><tag>t{i % 5}</tag><tag>t{i % 3}</tag></tags>"
         f"</item>"
-        for i in range(n_items)
+        for i in range(n)
     )
     return f"<catalog>\n{items}\n</catalog>".encode()
 
 
-def bench(label: str, fn, iterations: int = 10) -> float:
-    """Run fn `iterations` times, return median time in ms."""
-    times = []
-    for _ in range(iterations):
-        start = time.perf_counter()
+def gen_pubmed(n: int) -> bytes:
+    """Document-oriented: mixed depth, varying children."""
+    rng = random.Random(42)
+    articles = []
+    for i in range(n):
+        n_auth = rng.randint(1, 8)
+        auths = "\n".join(
+            "        <Author>"
+            f"<LastName>Auth{j}_{i}</LastName>"
+            f"<ForeName>F{j}</ForeName>"
+            f"<Affiliation>Univ {rng.randint(1, 20)}"
+            "</Affiliation>"
+            "</Author>"
+            for j in range(n_auth)
+        )
+        n_mesh = rng.randint(2, 12)
+        mesh = "\n".join(
+            "        <MeshHeading>"
+            f'<DescriptorName UI="D{rng.randint(100000, 999999)}">'
+            f"Term{k}_{i}</DescriptorName>"
+            "</MeshHeading>"
+            for k in range(n_mesh)
+        )
+        kind = "randomized" if i % 2 else "retrospective"
+        sents = " ".join(
+            f"Sentence {s} about topic {i}." for s in range(rng.randint(3, 8))
+        )
+        issn = f"{rng.randint(1000, 9999)}-{rng.randint(1000, 9999)}"
+        articles.append(
+            "  <PubmedArticle>\n"
+            '    <MedlineCitation Status="MEDLINE">\n'
+            f"      <PMID>{10000000 + i}</PMID>\n"
+            "      <Article>\n"
+            "        <Journal>"
+            f'<ISSN IssnType="Print">{issn}</ISSN>'
+            f"<Title>J Example {i % 50}</Title>"
+            "</Journal>\n"
+            f"        <ArticleTitle>Topic {i}: "
+            f"a {kind} study</ArticleTitle>\n"
+            "        <Abstract>"
+            f"<AbstractText>{sents}</AbstractText>"
+            "</Abstract>\n"
+            f"        <AuthorList>\n{auths}\n"
+            "        </AuthorList>\n"
+            "        <Language>eng</Language>\n"
+            "      </Article>\n"
+            f"      <MeshHeadingList>\n{mesh}\n"
+            "      </MeshHeadingList>\n"
+            "    </MedlineCitation>\n"
+            "  </PubmedArticle>"
+        )
+    body = "\n".join(articles)
+    return f"<PubmedArticleSet>\n{body}\n</PubmedArticleSet>".encode()
+
+
+def gen_pom(n: int) -> bytes:
+    """Config-oriented: deep nesting, namespaces."""
+    deps = "\n".join(
+        "      <dependency>\n"
+        f"        <groupId>com.example.g{i % 20}</groupId>\n"
+        f"        <artifactId>art-{i}</artifactId>\n"
+        f"        <version>{i % 5}.{i % 10}.{i % 3}</version>\n"
+        "        <scope>"
+        + ("compile" if i % 3 == 0 else "test" if i % 3 == 1 else "runtime")
+        + "</scope>\n"
+        + (
+            "        <exclusions>\n"
+            f"          <exclusion>"
+            f"<groupId>com.ex.{i}</groupId>"
+            f"<artifactId>bad-{i}</artifactId>"
+            "</exclusion>\n"
+            "        </exclusions>\n"
+            if i % 4 == 0
+            else ""
+        )
+        + "      </dependency>"
+        for i in range(n)
+    )
+    return (
+        "<project>\n"
+        "  <modelVersion>4.0.0</modelVersion>\n"
+        "  <groupId>com.example</groupId>\n"
+        "  <artifactId>benchmark</artifactId>\n"
+        "  <version>1.0.0</version>\n"
+        f"  <dependencies>\n{deps}\n  </dependencies>\n"
+        "</project>"
+    ).encode()
+
+
+# ---------------------------------------------------------------------------
+# Bench harness
+# ---------------------------------------------------------------------------
+
+WARMUP = 3
+ITERATIONS = 20
+
+
+def bench(fn) -> float:
+    """Warmup then timed iterations; return median ms."""
+    for _ in range(WARMUP):
         fn()
-        elapsed = (time.perf_counter() - start) * 1000
-        times.append(elapsed)
+
+    gc.disable()
+    try:
+        times = []
+        for _ in range(ITERATIONS):
+            t0 = time.perf_counter()
+            fn()
+            times.append((time.perf_counter() - t0) * 1000)
+    finally:
+        gc.enable()
+
     times.sort()
-    median = times[len(times) // 2]
-    return median
+    return times[len(times) // 2]
 
 
-def print_row(label: str, time_ms: float, baseline_ms: float | None = None) -> None:
-    speedup = ""
-    if baseline_ms is not None and time_ms > 0:
-        ratio = baseline_ms / time_ms
-        lib = "lxml" if HAS_LXML else "stdlib"
-        speedup = f"  ({ratio:.1f}x vs {lib})"
-    print(f"  {label:<30s} {time_ms:8.2f} ms{speedup}")
+def fmt(ms: float) -> str:
+    if ms < 0.01:
+        return f"{ms * 1000:6.1f} us"
+    if ms < 1:
+        return f"{ms:6.2f} ms"
+    return f"{ms:6.1f} ms"
 
 
-def run_benchmarks(xml: bytes, label: str) -> None:
-    size_mb = len(xml) / (1024 * 1024)
-    print(f"\n{'=' * 60}")
-    print(f"  {label} ({size_mb:.1f} MB, {len(xml):,} bytes)")
-    print(f"{'=' * 60}")
+def ratio_str(a: float, b: float) -> str:
+    if b <= 0:
+        return ""
+    r = b / a
+    if r >= 1:
+        return f" \033[32m{r:.1f}x faster\033[0m"
+    return f" \033[31m{1 / r:.1f}x slower\033[0m"
+
+
+# ---------------------------------------------------------------------------
+# Benchmark suites
+# ---------------------------------------------------------------------------
+
 
-    # --- Parse ---
-    print("\n  Parse:")
-    simdxml_parse = bench("simdxml", lambda: simdxml.parse(xml))
-    print_row("simdxml.parse()", simdxml_parse)
+def bench_parse(xml: bytes, label: str) -> None:
+    print(f"\n  \033[1mParse\033[0m  ({label})")
+    print("  Note: simdxml.parse() includes index construction (CSR + name posting)")
+
+    t_simd = bench(lambda: simdxml.parse(xml))
+    print(f"    simdxml.parse()         {fmt(t_simd)}")
 
     if HAS_LXML:
-        lxml_parse = bench("lxml", lambda: lxml_etree.fromstring(xml))
-        print_row("lxml.etree.fromstring()", lxml_parse)
+        t_lxml = bench(lambda: lxml_etree.fromstring(xml))
+        print(f"    lxml.fromstring()       {fmt(t_lxml)}{ratio_str(t_simd, t_lxml)}")
+
+    t_std = bench(lambda: StdET.fromstring(xml))
+    print(f"    ET.fromstring()         {fmt(t_std)}{ratio_str(t_simd, t_std)}")
 
-    std_parse = bench("stdlib", lambda: StdET.fromstring(xml))
-    print_row("ET.fromstring()", std_parse)
 
-    baseline = lxml_parse if HAS_LXML else std_parse
-    lib = "lxml" if HAS_LXML else "stdlib"
-    print(f"\n  Parse speedup: {baseline / simdxml_parse:.1f}x vs {lib}")
+def bench_xpath_elements(xml: bytes, expr: str, label: str) -> None:
+    """XPath returning Element objects -- fair comparison."""
+    print(f"\n  \033[1mXPath -> Elements\033[0m  {expr}  ({label})")
 
-    # --- XPath: //name (simple descendant) ---
-    print("\n  XPath: //name")
     doc = simdxml.parse(xml)
-    compiled = simdxml.compile("//name")
+    t_simd = bench(lambda: doc.xpath(expr))
+    n_results = len(doc.xpath(expr))
+    print(f"    simdxml doc.xpath()     {fmt(t_simd)}  ({n_results} results)")
 
-    simdxml_xpath = bench("simdxml.xpath_text", lambda: doc.xpath_text("//name"))
-    print_row("doc.xpath_text()", simdxml_xpath)
+    if HAS_LXML:
+        lroot = lxml_etree.fromstring(xml)
+        t_lxml = bench(lambda: lroot.xpath(expr))
+        print(f"    lxml root.xpath()       {fmt(t_lxml)}{ratio_str(t_simd, t_lxml)}")
+
+    # stdlib findall -- skip complex expressions
+    if not any(c in expr for c in ("()", "::", "|")):
+        std_expr = expr
+        if not expr.startswith("."):
+            std_expr = "." + expr if expr.startswith("/") else "./" + expr
+        sroot = StdET.fromstring(xml)
+        try:
+            t_std = bench(lambda: sroot.findall(std_expr))
+            print(f"    ET.findall()            {fmt(t_std)}{ratio_str(t_simd, t_std)}")
+        except SyntaxError:
+            pass
+
+
+def bench_xpath_text(xml: bytes, expr: str, label: str) -> None:
+    """XPath returning text -- simdxml's optimized path."""
+    print(f"\n  \033[1mXPath -> Text\033[0m  {expr}  ({label})")
+
+    doc = simdxml.parse(xml)
+    compiled = simdxml.compile(expr)
 
-    simdxml_compiled = bench("simdxml.compiled", lambda: compiled.eval_text(doc))
-    print_row("compiled.eval_text()", simdxml_compiled)
+    t_inline = bench(lambda: doc.xpath_text(expr))
+    t_compiled = bench(lambda: compiled.eval_text(doc))
+    n = len(doc.xpath_text(expr))
+    print(f"    simdxml xpath_text()    {fmt(t_inline)}  ({n} results)")
+    print(f"    simdxml compiled        {fmt(t_compiled)}")
 
     if HAS_LXML:
-        lxml_root = lxml_etree.fromstring(xml)
-        lxml_xpath = bench("lxml.xpath", lambda: lxml_root.xpath("//name"))
-        print_row("lxml_root.xpath()", lxml_xpath)
-        baseline_xpath = lxml_xpath
-    else:
-        baseline_xpath = None
+        lroot = lxml_etree.fromstring(xml)
+        t_lxml = bench(lambda: [e.text for e in lroot.xpath(expr)])
+        print(f"    lxml xpath+.text        {fmt(t_lxml)}{ratio_str(t_inline, t_lxml)}")
+
 
-    std_root = StdET.fromstring(xml)
-    std_findall = bench("stdlib.findall", lambda: std_root.findall(".//name"))
-    print_row("std_root.findall()", std_findall)
+def bench_traversal(xml: bytes, label: str) -> None:
+    """Element traversal: per-element loop vs batch API."""
+    print(f"\n  \033[1mTraversal\033[0m  ({label})")
 
-    if baseline_xpath:
-        print(f"\n  XPath speedup: {baseline_xpath / simdxml_xpath:.1f}x vs lxml")
+    doc = simdxml.parse(xml)
 
-    # --- XPath: predicate query ---
-    print('\n  XPath: //item[@category="cat5"]')
-    pred_expr = '//item[@category="cat5"]'
-    simdxml_pred = bench("simdxml", lambda: doc.xpath(pred_expr))
-    print_row("doc.xpath()", simdxml_pred)
+    # Batch API (single FFI call, interned strings)
+    t_batch = bench(lambda: doc.root.child_tags())
+    print(f"    simdxml child_tags()    {fmt(t_batch)}  [batch, 1 FFI call]")
+
+    # Per-element loop (N FFI calls, but tags are interned)
+    t_loop = bench(lambda: [e.tag for e in doc.root])
+    print(f"    simdxml [e.tag for e]   {fmt(t_loop)}  [per-element FFI]")
 
     if HAS_LXML:
-        lxml_pred = bench("lxml", lambda: lxml_root.xpath(pred_expr))
-        print_row("lxml_root.xpath()", lxml_pred)
+        lroot = lxml_etree.fromstring(xml)
+        t_lxml = bench(lambda: [e.tag for e in lroot])
+        print(f"    lxml [e.tag for e]      {fmt(t_lxml)}{ratio_str(t_batch, t_lxml)}")
 
-    std_pred = bench("stdlib", lambda: std_root.findall('.//item[@category="cat5"]'))
-    print_row("std_root.findall()", std_pred)
+    sroot = StdET.fromstring(xml)
+    t_std = bench(lambda: [e.tag for e in sroot])
+    print(f"    stdlib [e.tag for e]    {fmt(t_std)}{ratio_str(t_batch, t_std)}")
 
-    # --- Element traversal ---
-    print("\n  Traversal: iterate all children of root")
-    simdxml_iter = bench("simdxml", lambda: [e.tag for e in doc.root])
-    print_row("for e in doc.root", simdxml_iter)
 
-    if HAS_LXML:
-        lxml_iter = bench("lxml", lambda: [e.tag for e in lxml_root])
-        print_row("for e in lxml_root", lxml_iter)
+def run_corpus(xml: bytes, name: str) -> None:
+    size_mb = len(xml) / (1024 * 1024)
+    print(f"\n{'=' * 65}")
+    print(f"  {name}  ({size_mb:.1f} MB, {len(xml):,} bytes)")
+    print(f"{'=' * 65}")
 
-    std_iter = bench("stdlib", lambda: [e.tag for e in std_root])
-    print_row("for e in std_root", std_iter)
+    bench_parse(xml, name)
 
+    if b"<item " in xml:
+        bench_xpath_elements(xml, "//item", name)
+        bench_xpath_elements(xml, '//item[@category="cat5"]', name)
+        bench_xpath_text(xml, "//name", name)
+    elif b"<PubmedArticle>" in xml:
+        bench_xpath_elements(xml, "//PubmedArticle", name)
+        bench_xpath_elements(xml, '//Author[LastName="Auth0_0"]', name)
+        bench_xpath_text(xml, "//AbstractText", name)
+    elif b"<dependency>" in xml:
+        bench_xpath_elements(xml, "//dependency", name)
+        bench_xpath_elements(xml, '//dependency[scope="test"]', name)
+        bench_xpath_text(xml, "//artifactId", name)
 
-def main() -> None:
-    print("simdxml benchmark")
-    print(f"  lxml available: {HAS_LXML}")
-    if HAS_LXML:
-        print(f"  lxml version: {lxml_etree.LXML_VERSION}")
+    bench_traversal(xml, name)
 
-    # Small document
-    small = generate_xml(100)
-    run_benchmarks(small, "Small (100 items)")
 
-    # Medium document
-    medium = generate_xml(10_000)
-    run_benchmarks(medium, "Medium (10K items)")
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
 
-    # Large document
-    large = generate_xml(100_000)
-    run_benchmarks(large, "Large (100K items)")
+
+def main() -> None:
+    print("simdxml benchmark suite")
+    print(f"  Python {sys.version.split()[0]}")
+    print(f"  simdxml {simdxml.__version__}")
+    if HAS_LXML:
+        ver = ".".join(str(x) for x in lxml_etree.LXML_VERSION)
+        print(f"  lxml {ver}")
+    else:
+        print("  lxml: not installed")
+    print(f"  Warmup: {WARMUP}, Timed: {ITERATIONS}, Metric: median")
+
+    run_corpus(
+        gen_catalog(10_000),
+        "Catalog 10K (data-oriented)",
+    )
+    run_corpus(
+        gen_catalog(100_000),
+        "Catalog 100K (data-oriented)",
+    )
+    run_corpus(
+        gen_pubmed(1_000),
+        "PubMed 1K (document-oriented)",
+    )
+    run_corpus(
+        gen_pubmed(10_000),
+        "PubMed 10K (document-oriented)",
+    )
+    run_corpus(
+        gen_pom(1_000),
+        "POM 1K (config-oriented)",
+    )
+    run_corpus(
+        gen_pom(10_000),
+        "POM 10K (config-oriented)",
+    )
 
 
 if __name__ == "__main__":
diff --git a/pyproject.toml b/pyproject.toml
index 4ab2cf3..7e936da 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "simdxml"
-version = "0.1.0"
+version = "0.2.0"
 description = "SIMD-accelerated XML parser with full XPath 1.0 support"
 readme = "README.md"
 authors = [
diff --git a/python/simdxml/__init__.py b/python/simdxml/__init__.py
index 049afc2..3f0fb6a 100644
--- a/python/simdxml/__init__.py
+++ b/python/simdxml/__init__.py
@@ -46,4 +46,4 @@
     "parse",
 ]
 
-__version__ = "0.1.0"
+__version__ = "0.2.0"
diff --git a/python/simdxml/_core.pyi b/python/simdxml/_core.pyi
index 1961186..6dc5433 100644
--- a/python/simdxml/_core.pyi
+++ b/python/simdxml/_core.pyi
@@ -26,6 +26,8 @@ class Element:
     def keys(self) -> list[str]: ...
     def items(self) -> list[tuple[str, str]]: ...
     def iter(self, tag: str | None = None) -> Iterator[Element]: ...
+    def child_tags(self) -> list[str]: ...
+    def descendant_tags(self, tag: str | None = None) -> list[str]: ...
     def itertext(self) -> list[str]: ...
     def text_content(self) -> str: ...
     def xpath(self, expr: str) -> list[Element]: ...
diff --git a/src/lib.rs b/src/lib.rs
index 1f48a5c..d28bb64 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,6 +1,6 @@
 use pyo3::exceptions::{PyTypeError, PyValueError};
 use pyo3::prelude::*;
-use pyo3::types::PyBytes;
+use pyo3::types::{PyBytes, PyString};
 use self_cell::self_cell;
 use simdxml::xpath::XPathNode;
 use simdxml::XmlIndex;
@@ -13,6 +13,8 @@ struct IndexWithMeta<'a> {
     index: XmlIndex<'a>,
     /// parent[i] = tag index of parent element. u32::MAX = root.
     parents: Vec<u32>,
+    /// name_id[i] = index into interned names for tag i. usize::MAX = none.
+    name_ids: Vec<usize>,
 }
 
 self_cell!(
@@ -27,6 +29,9 @@ self_cell!(
 #[pyclass]
 struct Document {
     inner: DocumentInner,
+    /// Interned tag names: unique tag name -> Python str (created once at parse).
+    /// Avoids per-access string copies across FFI.
+    interned_names: Vec<Py<PyString>>,
 }
 
 impl Document {
@@ -34,27 +39,51 @@ impl Document {
         &self.inner.borrow_dependent().index
     }
 
-    fn parents(&self) -> &[u32] {
-        &self.inner.borrow_dependent().parents
+    fn meta(&self) -> &IndexWithMeta<'_> {
+        self.inner.borrow_dependent()
     }
 
+    fn interned_tag(&self, py: Python<'_>, tag_idx: usize) -> Py<PyString> {
+        let meta = self.meta();
+        let name_id = meta.name_ids[tag_idx];
+        if name_id < self.interned_names.len() {
+            self.interned_names[name_id].clone_ref(py)
+        } else {
+            // Fallback for tags without interned names (comments, PIs, etc.)
+            let name = meta.index.tag_name(tag_idx);
+            PyString::new(py, name).unbind()
+        }
+    }
+
+    /// Create an Element when you don't already hold a borrow.
     fn make_element(py: Python<'_>, doc: &Py<Document>, tag_idx: usize) -> Element {
+        let doc_ref = doc.borrow(py);
+        Self::make_element_borrowed(py, doc, &doc_ref, tag_idx)
+    }
+
+    /// Create an Element when you already hold a borrow (avoids double-borrow).
+    fn make_element_borrowed(
+        py: Python<'_>,
+        doc: &Py<Document>,
+        doc_ref: &Document,
+        tag_idx: usize,
+    ) -> Element {
+        let cached_tag = doc_ref.interned_tag(py, tag_idx);
         Element {
             doc: doc.clone_ref(py),
             tag_idx,
+            cached_tag,
         }
     }
 
     fn make_elements(
         py: Python<'_>,
         doc: &Py<Document>,
+        doc_ref: &Document,
         tag_indices: impl Iterator<Item = usize>,
     ) -> Vec<Element> {
         tag_indices
-            .map(|idx| Element {
-                doc: doc.clone_ref(py),
-                tag_idx: idx,
-            })
+            .map(|idx| Self::make_element_borrowed(py, doc, doc_ref, idx))
             .collect()
     }
 }
@@ -93,7 +122,7 @@ impl Document {
         for node in &nodes {
             match node {
                 XPathNode::Element(idx) => {
-                    let elem = Document::make_element(py, &doc_py, *idx);
+                    let elem = Document::make_element_borrowed(py, &doc_py, &this, *idx);
                     result.append(elem.into_pyobject(py)?)?;
                 }
                 XPathNode::Text(idx) => {
@@ -123,7 +152,7 @@ impl Document {
                 && (index.tag_type(i) == simdxml::index::TagType::Open
                     || index.tag_type(i) == simdxml::index::TagType::SelfClose)
             {
-                return Some(Document::make_element(py, &doc_py, i));
+                return Some(Document::make_element_borrowed(py, &doc_py, &this, i));
             }
         }
         None
@@ -152,26 +181,33 @@ impl Document {
 /// A read-only element in a parsed XML document.
 ///
 /// Holds a Python reference to the Document (preventing GC) plus a tag index.
+/// The tag name is eagerly cached as a Python string to avoid FFI on every access.
 #[pyclass(skip_from_py_object)]
 struct Element {
-    /// Python-ref-counted handle to the owning Document.
     doc: Py<Document>,
     tag_idx: usize,
+    /// Cached tag name (interned Python string). Avoids FFI on .tag access.
+    cached_tag: Py<PyString>,
 }
 
 impl Element {
-    fn with_index<'py, R>(&self, py: Python<'py>, f: impl FnOnce(&XmlIndex<'_>, &[u32]) -> R) -> R {
+    fn with_index<'py, R>(
+        &self,
+        py: Python<'py>,
+        f: impl FnOnce(&XmlIndex<'_>, &IndexWithMeta<'_>) -> R,
+    ) -> R {
         let doc = self.doc.borrow(py);
-        f(doc.index(), doc.parents())
+        let meta = doc.meta();
+        f(&meta.index, meta)
     }
 }
 
 #[pymethods]
 impl Element {
-    /// The tag name.
+    /// The tag name (interned, eagerly cached — zero FFI on access).
     #[getter]
-    fn tag(&self, py: Python<'_>) -> String {
-        self.with_index(py, |index, _| index.tag_name(self.tag_idx).to_string())
+    fn tag(&self, py: Python<'_>) -> Py<PyString> {
+        self.cached_tag.clone_ref(py)
     }
 
     /// Direct text content, or None.
@@ -194,8 +230,8 @@ impl Element {
     /// Text after this element's closing tag (before next sibling).
     #[getter]
     fn tail(&self, py: Python<'_>) -> Option<String> {
-        self.with_index(py, |index, parents| {
-            let parent = parents[self.tag_idx];
+        self.with_index(py, |index, meta| {
+            let parent = meta.parents[self.tag_idx];
             if parent == u32::MAX {
                 return None;
             }
@@ -283,7 +319,12 @@ impl Element {
                 "element index out of range",
             ));
         }
-        Ok(Document::make_element(py, &self.doc, children[i as usize]))
+        Ok(Document::make_element_borrowed(
+            py,
+            &self.doc,
+            &doc,
+            children[i as usize],
+        ))
     }
 
     /// Iterate over direct child elements.
@@ -321,6 +362,40 @@ impl Element {
         }
     }
 
+    // -- Batch APIs (single FFI crossing for N results) --
+
+    /// All child tag names as a list of strings (one FFI call, interned).
+    fn child_tags(&self, py: Python<'_>) -> Vec<Py<PyString>> {
+        let doc = self.doc.borrow(py);
+        let index = doc.index();
+        index
+            .children(self.tag_idx)
+            .iter()
+            .map(|&child| doc.interned_tag(py, child))
+            .collect()
+    }
+
+    /// All descendant tag names matching optional filter.
+    #[pyo3(signature = (tag=None))]
+    fn descendant_tags(&self, py: Python<'_>, tag: Option<&str>) -> Vec<Py<PyString>> {
+        let doc = self.doc.borrow(py);
+        let index = doc.index();
+        let start = self.tag_idx;
+        let close = index.matching_close(start).unwrap_or(start);
+
+        let mut result = Vec::new();
+        for i in (start + 1)..=close {
+            let tt = index.tag_type(i);
+            if tt == simdxml::index::TagType::Open || tt == simdxml::index::TagType::SelfClose {
+                match tag {
+                    Some(filter) if index.tag_name(i) != filter => {}
+                    _ => result.push(doc.interned_tag(py, i)),
+                }
+            }
+        }
+        result
+    }
+
     /// All text content (depth-first) as a list of strings.
     fn itertext(&self, py: Python<'_>) -> Vec<String> {
         let doc = self.doc.borrow(py);
@@ -346,6 +421,7 @@ impl Element {
         Ok(Document::make_elements(
             py,
             &self.doc,
+            &doc,
             nodes.into_iter().filter_map(|n| match n {
                 XPathNode::Element(idx) => Some(idx),
                 _ => None,
@@ -386,46 +462,55 @@ impl Element {
 
     /// Parent element, or None for root.
     fn getparent(&self, py: Python<'_>) -> Option<Element> {
-        self.with_index(py, |_, parents| {
-            let parent = parents[self.tag_idx];
-            if parent == u32::MAX {
-                None
-            } else {
-                Some(Document::make_element(py, &self.doc, parent as usize))
-            }
-        })
+        let doc = self.doc.borrow(py);
+        let parent = doc.meta().parents[self.tag_idx];
+        if parent == u32::MAX {
+            None
+        } else {
+            Some(Document::make_element_borrowed(
+                py,
+                &self.doc,
+                &doc,
+                parent as usize,
+            ))
+        }
     }
 
     /// Next sibling element, or None.
     fn getnext(&self, py: Python<'_>) -> Option<Element> {
-        self.with_index(py, |index, parents| {
-            let parent = parents[self.tag_idx];
-            if parent == u32::MAX {
-                return None;
-            }
-            let siblings = index.children(parent as usize);
-            let pos = siblings.iter().position(|&s| s == self.tag_idx)?;
-            siblings
-                .get(pos + 1)
-                .map(|&idx| Document::make_element(py, &self.doc, idx))
-        })
+        let doc = self.doc.borrow(py);
+        let meta = doc.meta();
+        let parent = meta.parents[self.tag_idx];
+        if parent == u32::MAX {
+            return None;
+        }
+        let siblings = meta.index.children(parent as usize);
+        let pos = siblings.iter().position(|&s| s == self.tag_idx)?;
+        siblings
+            .get(pos + 1)
+            .map(|&idx| Document::make_element_borrowed(py, &self.doc, &doc, idx))
     }
 
     /// Previous sibling element, or None.
     fn getprevious(&self, py: Python<'_>) -> Option<Element> {
-        self.with_index(py, |index, parents| {
-            let parent = parents[self.tag_idx];
-            if parent == u32::MAX {
-                return None;
-            }
-            let siblings = index.children(parent as usize);
-            let pos = siblings.iter().position(|&s| s == self.tag_idx)?;
-            if pos > 0 {
-                Some(Document::make_element(py, &self.doc, siblings[pos - 1]))
-            } else {
-                None
-            }
-        })
+        let doc = self.doc.borrow(py);
+        let meta = doc.meta();
+        let parent = meta.parents[self.tag_idx];
+        if parent == u32::MAX {
+            return None;
+        }
+        let siblings = meta.index.children(parent as usize);
+        let pos = siblings.iter().position(|&s| s == self.tag_idx)?;
+        if pos > 0 {
+            Some(Document::make_element_borrowed(
+                py,
+                &self.doc,
+                &doc,
+                siblings[pos - 1],
+            ))
+        } else {
+            None
+        }
     }
 
     /// Raw XML for this element (opening through closing tag).
@@ -473,12 +558,12 @@ impl Element {
     }
 
     fn __repr__(&self, py: Python<'_>) -> String {
-        let tag = self.with_index(py, |index, _| index.tag_name(self.tag_idx).to_string());
-        format!("Element('{tag}')")
+        let tag_str = self.cached_tag.bind(py).to_cow().unwrap_or_default();
+        format!("Element('{tag_str}')")
     }
 
-    fn __str__(&self, py: Python<'_>) -> String {
-        self.with_index(py, |index, _| index.tag_name(self.tag_idx).to_string())
+    fn __str__(&self, py: Python<'_>) -> Py<PyString> {
+        self.cached_tag.clone_ref(py)
     }
 
     fn __bool__(&self) -> bool {
@@ -563,6 +648,7 @@ impl CompiledXPath {
         Ok(Document::make_elements(
             py,
             &doc_py,
+            &doc_ref,
             nodes.into_iter().filter_map(|n| match n {
                 XPathNode::Element(idx) => Some(idx),
                 _ => None,
@@ -613,9 +699,11 @@ fn get_first_attribute(index: &XmlIndex<'_>, tag_idx: usize) -> Option<String> {
         .map(|s| s.to_string())
 }
 
-/// Build a parent map from the public children() API.
-fn build_parent_map(index: &XmlIndex<'_>) -> Vec<u32> {
+/// Build a parent map + name_id map from the public API.
+fn build_meta(index: &XmlIndex<'_>) -> (Vec<u32>, Vec<usize>) {
     let n = index.tag_count();
+
+    // Parent map
     let mut parents = vec![u32::MAX; n];
     for i in 0..n {
         let tt = index.tag_type(i);
@@ -627,7 +715,62 @@ fn build_parent_map(index: &XmlIndex<'_>) -> Vec<u32> {
             }
         }
     }
-    parents
+
+    // Name interning: map tag_name -> sequential ID
+    let mut unique_names: Vec<String> = Vec::new();
+    let mut name_map: std::collections::HashMap<String, usize> = std::collections::HashMap::new();
+    let mut name_ids = vec![usize::MAX; n];
+
+    for i in 0..n {
+        let tt = index.tag_type(i);
+        if tt == simdxml::index::TagType::Open || tt == simdxml::index::TagType::SelfClose {
+            let name = index.tag_name(i);
+            if !name.is_empty() {
+                let id = match name_map.get(name) {
+                    Some(&id) => id,
+                    None => {
+                        let id = unique_names.len();
+                        unique_names.push(name.to_string());
+                        name_map.insert(name.to_string(), id);
+                        id
+                    }
+                };
+                name_ids[i] = id;
+            }
+        }
+    }
+
+    (parents, name_ids)
+}
+
+/// Build interned Python strings for all unique tag names.
+fn build_interned_names(py: Python<'_>, index: &XmlIndex<'_>, name_ids: &[usize]) -> Vec<Py<PyString>> {
+    // Find max name_id to size the vec
+    let max_id = name_ids.iter().copied().filter(|&id| id != usize::MAX).max();
+    let n_unique = match max_id {
+        Some(m) => m + 1,
+        None => return Vec::new(),
+    };
+
+    // Collect the name string for each ID
+    let mut names: Vec<Py<PyString>> = Vec::with_capacity(n_unique);
+    // Initialize with empty strings
+    let empty = PyString::new(py, "").unbind();
+    for _ in 0..n_unique {
+        names.push(empty.clone_ref(py));
+    }
+
+    for (i, &id) in name_ids.iter().enumerate() {
+        if id != usize::MAX && id < n_unique {
+            // Only set the first time we see this ID
+            let name = index.tag_name(i);
+            if !name.is_empty() {
+                names[id] = PyString::new(py, name).unbind();
+            }
+        }
+    }
+
+    names
 }
 
 /// Recursively collect text content depth-first (for itertext).
@@ -648,7 +791,7 @@ fn collect_text(index: &XmlIndex<'_>, tag_idx: usize, out: &mut Vec<String>) {
 
 /// Parse XML bytes or string into a Document.
 #[pyfunction]
-fn parse(data: &Bound<'_, PyAny>) -> PyResult<Document> {
+fn parse(py: Python<'_>, data: &Bound<'_, PyAny>) -> PyResult<Document> {
     let bytes: Vec<u8> = if let Ok(b) = data.cast_exact::<PyBytes>() {
         b.as_bytes().to_vec()
     } else if let Ok(s) = data.extract::<String>() {
@@ -662,11 +805,24 @@ fn parse(data: &Bound<'_, PyAny>) -> PyResult<Document> {
             simdxml::parse(owner).map_err(|e| PyValueError::new_err(e.to_string()))?;
         index.ensure_indices();
         index.build_name_index();
-        let parents = build_parent_map(&index);
-        Ok::<_, PyErr>(IndexWithMeta { index, parents })
+        let (parents, name_ids) = build_meta(&index);
+        Ok::<_, PyErr>(IndexWithMeta {
+            index,
+            parents,
+            name_ids,
+        })
     })?;
 
-    Ok(Document { inner })
+    // Build interned Python strings (one copy per unique name)
+    let interned_names = {
+        let meta = inner.borrow_dependent();
+        build_interned_names(py, &meta.index, &meta.name_ids)
+    };
+
+    Ok(Document {
+        inner,
+        interned_names,
+    })
 }
 
 /// Compile an XPath expression for repeated evaluation.

From 30b07028caae87fee74dbbf14c34a4592b2e7a39 Mon Sep 17 00:00:00 2001
From: Christopher Grainger <chris@amplified.ai>
Date: Fri, 27 Mar 2026 21:25:35 +1100
Subject: [PATCH 2/7] Address PR review: simplify build_meta, improve
 docstrings

Review fixes:
- build_meta returns unique_names directly (was built then dropped)
- build_interned_names reduced from 27 lines to 3 (takes &[String])
- name_map borrows &str from index instead of cloning per entry
- Mutation methods documented as raising TypeError
- All user-facing docstrings cleaned: no implementation details,
  no FFI/Rust/interning/refcount language
- Rich .pyi stubs with docstrings for IDE hover

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 python/simdxml/_core.pyi | 201 +++++++++++++++++++++++++++++++--------
 src/lib.rs               | 157 +++++++++++++++---------------
 2 files changed, 242 insertions(+), 116 deletions(-)

diff --git a/python/simdxml/_core.pyi b/python/simdxml/_core.pyi
index 6dc5433..ff298f4 100644
--- a/python/simdxml/_core.pyi
+++ b/python/simdxml/_core.pyi
@@ -1,61 +1,180 @@
 from collections.abc import Iterator
 
 class Document:
-    """A parsed XML document backed by a SIMD-accelerated structural index."""
+    """A parsed XML document.
+
+    Created by `parse()`. Use `root` to get the root element,
+    or query directly with `xpath_text()` and `xpath()`.
+    """
 
     @property
-    def root(self) -> Element | None: ...
+    def root(self) -> Element | None:
+        """The root element of the document, or None if empty."""
+        ...
     @property
-    def tag_count(self) -> int: ...
-    def xpath_text(self, expr: str) -> list[str]: ...
-    def xpath_string(self, expr: str) -> list[str]: ...
-    def xpath(self, expr: str) -> list[Element | str]: ...
+    def tag_count(self) -> int:
+        """Total number of XML tags in the document."""
+        ...
+    def xpath_text(self, expr: str) -> list[str]:
+        """Evaluate an XPath expression and return text content of matches.
+
+        Returns the direct child text of each matching element.
+        """
+        ...
+    def xpath_string(self, expr: str) -> list[str]:
+        """Evaluate an XPath expression and return string-values of matches.
+
+        Returns all descendant text for each match (XPath ``string()`` semantics).
+        """
+        ...
+    def xpath(self, expr: str) -> list[Element | str]:
+        """Evaluate an XPath expression.
+
+        Returns Element objects for element nodes, strings for text/attribute nodes.
+        """
+        ...
 
 class Element:
-    """A read-only element in a parsed XML document."""
+    """A read-only XML element.
+
+    Supports the ElementTree API (``.tag``, ``.text``, ``.attrib``, ``.get()``,
+    ``len()``, indexing, iteration) plus lxml extensions (``.xpath()``,
+    ``.getparent()``, ``.getnext()``, ``.getprevious()``).
+    """
 
     @property
-    def tag(self) -> str: ...
+    def tag(self) -> str:
+        """The element's tag name (e.g., ``'book'``, ``'title'``)."""
+        ...
     @property
-    def text(self) -> str | None: ...
+    def text(self) -> str | None:
+        """Text content before the first child element, or None.
+
+        For ``<p>Hello <b>world</b></p>``, ``p.text`` is ``'Hello '``.
+        """
+        ...
     @property
-    def tail(self) -> str | None: ...
+    def tail(self) -> str | None:
+        """Text content after this element's closing tag, or None.
+
+        For ``<p>Hello <b>world</b> more</p>``, ``b.tail`` is ``' more'``.
+        """
+        ...
     @property
-    def attrib(self) -> dict[str, str]: ...
-    def get(self, key: str, default: str | None = None) -> str | None: ...
-    def keys(self) -> list[str]: ...
-    def items(self) -> list[tuple[str, str]]: ...
-    def iter(self, tag: str | None = None) -> Iterator[Element]: ...
-    def child_tags(self) -> list[str]: ...
-    def descendant_tags(self, tag: str | None = None) -> list[str]: ...
-    def itertext(self) -> list[str]: ...
-    def text_content(self) -> str: ...
-    def xpath(self, expr: str) -> list[Element]: ...
-    def xpath_text(self, expr: str) -> list[str]: ...
-    def getparent(self) -> Element | None: ...
-    def getnext(self) -> Element | None: ...
-    def getprevious(self) -> Element | None: ...
-    def tostring(self) -> str: ...
-    # Read-only enforcement: these raise TypeError
-    def set(self, key: str, value: str) -> None: ...
-    def append(self, element: Element) -> None: ...
-    def remove(self, element: Element) -> None: ...
-    def insert(self, index: int, element: Element) -> None: ...
-    def clear(self) -> None: ...
-    def __len__(self) -> int: ...
-    def __getitem__(self, index: int) -> Element: ...
-    def __iter__(self) -> Iterator[Element]: ...
+    def attrib(self) -> dict[str, str]:
+        """Dictionary of this element's attributes."""
+        ...
+    def get(self, key: str, default: str | None = None) -> str | None:
+        """Get an attribute value by name, with optional default."""
+        ...
+    def keys(self) -> list[str]:
+        """List of attribute names."""
+        ...
+    def items(self) -> list[tuple[str, str]]:
+        """List of ``(name, value)`` attribute pairs."""
+        ...
+    def iter(self, tag: str | None = None) -> Iterator[Element]:
+        """Iterate over descendant elements, optionally filtered by tag name."""
+        ...
+    def child_tags(self) -> list[str]:
+        """All direct child tag names as a list.
+
+        More efficient than ``[e.tag for e in element]`` for bulk access.
+        """
+        ...
+    def descendant_tags(self, tag: str | None = None) -> list[str]:
+        """All descendant tag names, optionally filtered.
+
+        More efficient than ``[e.tag for e in element.iter(tag)]`` for bulk access.
+        """
+        ...
+    def itertext(self) -> list[str]:
+        """All text content within this element, depth-first."""
+        ...
+    def text_content(self) -> str:
+        """All descendant text concatenated into a single string."""
+        ...
+    def xpath(self, expr: str) -> list[Element]:
+        """Evaluate an XPath 1.0 expression with this element as context.
+
+        Returns a list of matching Element objects.
+        """
+        ...
+    def xpath_text(self, expr: str) -> list[str]:
+        """Evaluate an XPath expression and return text content of matches."""
+        ...
+    def getparent(self) -> Element | None:
+        """Parent element, or None if this is the root."""
+        ...
+    def getnext(self) -> Element | None:
+        """Next sibling element, or None if this is the last child."""
+        ...
+    def getprevious(self) -> Element | None:
+        """Previous sibling element, or None if this is the first child."""
+        ...
+    def tostring(self) -> str:
+        """Serialize this element to an XML string."""
+        ...
+    def set(self, key: str, value: str) -> None:
+        """Not supported. Raises TypeError (elements are read-only)."""
+        ...
+    def append(self, element: Element) -> None:
+        """Not supported. Raises TypeError (elements are read-only)."""
+        ...
+    def remove(self, element: Element) -> None:
+        """Not supported. Raises TypeError (elements are read-only)."""
+        ...
+    def insert(self, index: int, element: Element) -> None:
+        """Not supported. Raises TypeError (elements are read-only)."""
+        ...
+    def clear(self) -> None:
+        """Not supported. Raises TypeError (elements are read-only)."""
+        ...
+    def __len__(self) -> int:
+        """Number of direct child elements."""
+        ...
+    def __getitem__(self, index: int) -> Element:
+        """Get a child element by index. Supports negative indexing."""
+        ...
+    def __iter__(self) -> Iterator[Element]:
+        """Iterate over direct child elements."""
+        ...
     def __bool__(self) -> bool: ...
     def __eq__(self, other: object) -> bool: ...
     def __hash__(self) -> int: ...
 
 class CompiledXPath:
-    """A compiled XPath expression for repeated evaluation."""
+    """A compiled XPath expression for repeated use.
+
+    Like ``re.compile()`` — parse the expression once, evaluate many times
+    across different documents.
+    """
+
+    def eval_text(self, doc: Document) -> list[str]:
+        """Evaluate and return text content of matching nodes."""
+        ...
+    def eval(self, doc: Document) -> list[Element]:
+        """Evaluate and return matching Element objects."""
+        ...
+    def eval_exists(self, doc: Document) -> bool:
+        """Check whether any nodes match the expression."""
+        ...
+    def eval_count(self, doc: Document) -> int:
+        """Count the number of matching nodes."""
+        ...
+
+def parse(data: bytes | str) -> Document:
+    """Parse XML into a Document.
+
+    Accepts ``bytes`` or ``str``. Returns a Document that can be queried
+    with XPath or traversed element-by-element.
+    """
+    ...
 
-    def eval_text(self, doc: Document) -> list[str]: ...
-    def eval(self, doc: Document) -> list[Element]: ...
-    def eval_exists(self, doc: Document) -> bool: ...
-    def eval_count(self, doc: Document) -> int: ...
+def compile(expr: str) -> CompiledXPath:
+    """Compile an XPath expression for repeated use.
 
-def parse(data: bytes | str) -> Document: ...
-def compile(expr: str) -> CompiledXPath: ...
+    Like ``re.compile()`` — parse the expression once, evaluate many times
+    across different documents.
+    """
+    ...
diff --git a/src/lib.rs b/src/lib.rs
index d28bb64..bf8bdb0 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -15,6 +15,8 @@ struct IndexWithMeta<'a> {
     parents: Vec<u32>,
     /// name_id[i] = index into interned names for tag i. usize::MAX = none.
     name_ids: Vec<usize>,
+    /// Unique tag name strings (one per unique name, used to build Python interned strings).
+    unique_names: Vec<String>,
 }
 
 self_cell!(
@@ -25,12 +27,13 @@ self_cell!(
     }
 );
 
-/// A parsed XML document backed by a SIMD-accelerated structural index.
+/// A parsed XML document.
+///
+/// Created by :func:`parse`. Use :attr:`root` to get the root element,
+/// or query directly with :meth:`xpath_text` and :meth:`xpath`.
 #[pyclass]
 struct Document {
     inner: DocumentInner,
-    /// Interned tag names: unique tag name -> Python str (created once at parse).
-    /// Avoids per-access string copies across FFI.
     interned_names: Vec<Py<PyString>>,
 }
 
@@ -91,6 +94,8 @@ impl Document {
 #[pymethods]
 impl Document {
     /// Evaluate an XPath expression and return text content of matches.
+    ///
+    /// Returns the direct child text of each matching element.
     fn xpath_text(&self, expr: &str) -> PyResult<Vec<String>> {
         let index = self.index();
         let results = index
@@ -99,7 +104,9 @@ impl Document {
         Ok(results.into_iter().map(|s| s.to_string()).collect())
     }
 
-    /// Evaluate an XPath expression and return the XPath string-value of matches.
+    /// Evaluate an XPath expression and return string-values of matches.
+    ///
+    /// Returns all descendant text for each match (XPath string() semantics).
     fn xpath_string(&self, expr: &str) -> PyResult<Vec<String>> {
         let index = self.index();
         index
@@ -107,8 +114,9 @@ impl Document {
             .map_err(|e| PyValueError::new_err(e.to_string()))
     }
 
-    /// Evaluate an XPath expression. Returns Element list for node-sets,
-    /// strings for text/attribute nodes.
+    /// Evaluate an XPath expression.
+    ///
+    /// Returns Element objects for element nodes, strings for text/attribute nodes.
     fn xpath(slf: &Bound<'_, Self>, expr: &str) -> PyResult<Py<pyo3::types::PyList>> {
         let py = slf.py();
         let doc_py: Py<Document> = slf.clone().unbind();
@@ -140,7 +148,7 @@ impl Document {
         Ok(result.unbind())
     }
 
-    /// The root element of the document.
+    /// The root element of the document, or None if empty.
     #[getter]
     fn root(slf: &Bound<'_, Self>) -> Option<Element> {
         let py = slf.py();
@@ -158,7 +166,7 @@ impl Document {
         None
     }
 
-    /// Number of tags in the structural index.
+    /// Total number of XML tags in the document.
     #[getter]
     fn tag_count(&self) -> usize {
         self.index().tag_count()
@@ -175,18 +183,18 @@ impl Document {
 }
 
 // ---------------------------------------------------------------------------
-// Element — lightweight flyweight handle into a Document
+// Element
 // ---------------------------------------------------------------------------
 
-/// A read-only element in a parsed XML document.
+/// A read-only XML element.
 ///
-/// Holds a Python reference to the Document (preventing GC) plus a tag index.
-/// The tag name is eagerly cached as a Python string to avoid FFI on every access.
+/// Supports the ElementTree API (.tag, .text, .attrib, .get(), len(),
+/// indexing, iteration) plus lxml extensions (.xpath(), .getparent(),
+/// .getnext(), .getprevious()).
 #[pyclass(skip_from_py_object)]
 struct Element {
     doc: Py<Document>,
     tag_idx: usize,
-    /// Cached tag name (interned Python string). Avoids FFI on .tag access.
     cached_tag: Py<PyString>,
 }
 
@@ -204,13 +212,15 @@ impl Element {
 
 #[pymethods]
 impl Element {
-    /// The tag name (interned, eagerly cached — zero FFI on access).
+    /// The element's tag name (e.g., 'book', 'title').
     #[getter]
     fn tag(&self, py: Python<'_>) -> Py<PyString> {
         self.cached_tag.clone_ref(py)
     }
 
-    /// Direct text content, or None.
+    /// Text content before the first child element, or None.
+    ///
+    /// For '<p>Hello <b>world</b></p>', p.text is 'Hello '.
     #[getter]
     fn text(&self, py: Python<'_>) -> Option<String> {
         self.with_index(py, |index, _| {
@@ -227,7 +237,9 @@ impl Element {
         })
     }
 
-    /// Text after this element's closing tag (before next sibling).
+    /// Text content after this element's closing tag, or None.
+    ///
+    /// For '<p>Hello <b>world</b> more</p>', b.tail is ' more'.
     #[getter]
     fn tail(&self, py: Python<'_>) -> Option<String> {
         self.with_index(py, |index, meta| {
@@ -252,7 +264,7 @@ impl Element {
         })
     }
 
-    /// Dictionary of attributes.
+    /// Dictionary of this element's attributes.
     #[getter]
     fn attrib(&self, py: Python<'_>) -> PyResult<Py<pyo3::types::PyDict>> {
         let doc = self.doc.borrow(py);
@@ -304,11 +316,14 @@ impl Element {
     }
 
     /// Number of direct child elements.
+    ///
+    /// >>> len(element)
+    /// 3
     fn __len__(&self, py: Python<'_>) -> usize {
         self.with_index(py, |index, _| index.children(self.tag_idx).len())
     }
 
-    /// Get the i-th child element.
+    /// Get a child element by index. Supports negative indexing.
     fn __getitem__(&self, py: Python<'_>, index: isize) -> PyResult<Element> {
         let doc = self.doc.borrow(py);
         let children = doc.index().children(self.tag_idx);
@@ -362,9 +377,9 @@ impl Element {
         }
     }
 
-    // -- Batch APIs (single FFI crossing for N results) --
-
-    /// All child tag names as a list of strings (one FFI call, interned).
+    /// All direct child tag names as a list.
+    ///
+    /// More efficient than [e.tag for e in element] for bulk access.
     fn child_tags(&self, py: Python<'_>) -> Vec<Py<PyString>> {
         let doc = self.doc.borrow(py);
         let index = doc.index();
@@ -375,7 +390,9 @@ impl Element {
             .collect()
     }
 
-    /// All descendant tag names matching optional filter.
+    /// All descendant tag names, optionally filtered.
+    ///
+    /// More efficient than [e.tag for e in element.iter(tag)] for bulk access.
     #[pyo3(signature = (tag=None))]
     fn descendant_tags(&self, py: Python<'_>, tag: Option<&str>) -> Vec<Py<PyString>> {
         let doc = self.doc.borrow(py);
@@ -396,7 +413,7 @@ impl Element {
         result
     }
 
-    /// All text content (depth-first) as a list of strings.
+    /// All text content within this element, depth-first.
     fn itertext(&self, py: Python<'_>) -> Vec<String> {
         let doc = self.doc.borrow(py);
         let index = doc.index();
@@ -405,12 +422,14 @@ impl Element {
         texts
     }
 
-    /// Concatenation of all descendant text.
+    /// All descendant text concatenated into a single string.
     fn text_content(&self, py: Python<'_>) -> String {
         self.with_index(py, |index, _| index.all_text(self.tag_idx))
     }
 
-    /// Evaluate full XPath 1.0 from this element as context node.
+    /// Evaluate an XPath 1.0 expression with this element as context.
+    ///
+    /// Returns a list of matching Element objects.
     fn xpath(&self, py: Python<'_>, expr: &str) -> PyResult<Vec<Element>> {
         let doc = self.doc.borrow(py);
         let index = doc.index();
@@ -429,7 +448,7 @@ impl Element {
         ))
     }
 
-    /// XPath text extraction from this element as context.
+    /// Evaluate an XPath expression and return text content of matches.
     fn xpath_text(&self, py: Python<'_>, expr: &str) -> PyResult<Vec<String>> {
         let doc = self.doc.borrow(py);
         let index = doc.index();
@@ -535,24 +554,29 @@ impl Element {
         Err(readonly_error())
     }
 
+    /// Not supported. Raises TypeError (simdxml elements are read-only).
     #[pyo3(name = "set")]
     fn set_attr(&self, _key: &str, _value: &str) -> PyResult<()> {
         Err(readonly_error())
     }
 
+    /// Not supported. Raises TypeError (simdxml elements are read-only).
     fn append(&self, _element: &Element) -> PyResult<()> {
         Err(readonly_error())
     }
 
+    /// Not supported. Raises TypeError (simdxml elements are read-only).
     fn remove(&self, _element: &Element) -> PyResult<()> {
         Err(readonly_error())
     }
 
+    /// Not supported. Raises TypeError (simdxml elements are read-only).
     #[pyo3(signature = (_index, _element))]
     fn insert(&self, _index: isize, _element: &Element) -> PyResult<()> {
         Err(readonly_error())
     }
 
+    /// Not supported. Raises TypeError (simdxml elements are read-only).
     fn clear(&self) -> PyResult<()> {
         Err(readonly_error())
     }
@@ -615,7 +639,10 @@ impl ElementIterator {
 // CompiledXPath
 // ---------------------------------------------------------------------------
 
-/// A compiled XPath expression for repeated evaluation.
+/// A compiled XPath expression for repeated use.
+///
+/// Like re.compile() -- parse the expression once, evaluate many times
+/// across different documents.
 #[pyclass]
 struct CompiledXPath {
     inner: simdxml::CompiledXPath,
@@ -699,15 +726,14 @@ fn get_first_attribute(index: &XmlIndex<'_>, tag_idx: usize) -> Option<String> {
         .map(|s| s.to_string())
 }
 
-/// Build a parent map + name_id map from the public API.
-fn build_meta(index: &XmlIndex<'_>) -> (Vec<u32>, Vec<usize>) {
+/// Build parent map, name-id map, and unique name list from the public API.
+fn build_meta(index: &XmlIndex<'_>) -> (Vec<u32>, Vec<usize>, Vec<String>) {
     let n = index.tag_count();
 
     // Parent map
     let mut parents = vec![u32::MAX; n];
     for i in 0..n {
-        let tt = index.tag_type(i);
-        if tt == simdxml::index::TagType::Open {
+        if index.tag_type(i) == simdxml::index::TagType::Open {
             for child in index.children(i) {
                 if child < n {
                     parents[child] = i as u32;
@@ -716,9 +742,9 @@ fn build_meta(index: &XmlIndex<'_>) -> (Vec<u32>, Vec<usize>) {
         }
     }
 
-    // Name interning: map tag_name -> sequential ID
+    // Name interning: borrow tag names from the index to avoid extra clones.
     let mut unique_names: Vec<String> = Vec::new();
-    let mut name_map: std::collections::HashMap<String, usize> = std::collections::HashMap::new();
+    let mut name_map: std::collections::HashMap<&str, usize> = std::collections::HashMap::new();
     let mut name_ids = vec![usize::MAX; n];
 
     for i in 0..n {
@@ -726,51 +752,25 @@ fn build_meta(index: &XmlIndex<'_>) -> (Vec<u32>, Vec<usize>) {
         if tt == simdxml::index::TagType::Open || tt == simdxml::index::TagType::SelfClose {
             let name = index.tag_name(i);
             if !name.is_empty() {
-                let id = match name_map.get(name) {
-                    Some(&id) => id,
-                    None => {
-                        let id = unique_names.len();
-                        unique_names.push(name.to_string());
-                        name_map.insert(name.to_string(), id);
-                        id
-                    }
-                };
+                let id = *name_map.entry(name).or_insert_with(|| {
+                    let id = unique_names.len();
+                    unique_names.push(name.to_string());
+                    id
+                });
                 name_ids[i] = id;
             }
         }
     }
 
-    (parents, name_ids)
+    (parents, name_ids, unique_names)
 }
 
-/// Build interned Python strings for all unique tag names.
-fn build_interned_names(py: Python<'_>, index: &XmlIndex<'_>, name_ids: &[usize]) -> Vec<Py<PyString>> {
-    // Find max name_id to size the vec
-    let max_id = name_ids.iter().copied().filter(|&id| id != usize::MAX).max();
-    let n_unique = match max_id {
-        Some(m) => m + 1,
-        None => return Vec::new(),
-    };
-
-    // Collect the name string for each ID
-    let mut names: Vec<Py<PyString>> = Vec::with_capacity(n_unique);
-    // Initialize with empty strings
-    let empty = PyString::new(py, "").unbind();
-    for _ in 0..n_unique {
-        names.push(empty.clone_ref(py));
-    }
-
-    for (i, &id) in name_ids.iter().enumerate() {
-        if id != usize::MAX && id < n_unique {
-            // Only set the first time we see this ID
-            let name = index.tag_name(i);
-            if !name.is_empty() {
-                names[id] = PyString::new(py, name).unbind();
-            }
-        }
-    }
-
-    names
+/// Build interned Python strings from the unique name list.
+fn build_interned_names(py: Python<'_>, unique_names: &[String]) -> Vec<Py<PyString>> {
+    unique_names
+        .iter()
+        .map(|s| PyString::new(py, s).unbind())
+        .collect()
 }
 
 /// Recursively collect text content depth-first (for itertext).
@@ -789,7 +789,10 @@ fn collect_text(index: &XmlIndex<'_>, tag_idx: usize, out: &mut Vec<String>) {
 // Module-level functions
 // ---------------------------------------------------------------------------
 
-/// Parse XML bytes or string into a Document.
+/// Parse XML into a Document.
+///
+/// Accepts bytes or str. Returns a Document that can be queried
+/// with XPath or traversed element-by-element.
 #[pyfunction]
 fn parse(py: Python<'_>, data: &Bound<'_, PyAny>) -> PyResult<Document> {
     let bytes: Vec<u8> = if let Ok(b) = data.cast_exact::<PyBytes>() {
@@ -805,18 +808,19 @@ fn parse(py: Python<'_>, data: &Bound<'_, PyAny>) -> PyResult<Document> {
             simdxml::parse(owner).map_err(|e| PyValueError::new_err(e.to_string()))?;
         index.ensure_indices();
         index.build_name_index();
-        let (parents, name_ids) = build_meta(&index);
+        let (parents, name_ids, unique_names) = build_meta(&index);
         Ok::<_, PyErr>(IndexWithMeta {
             index,
             parents,
             name_ids,
+            unique_names,
         })
     })?;
 
     // Build interned Python strings (one copy per unique name)
     let interned_names = {
         let meta = inner.borrow_dependent();
-        build_interned_names(py, &meta.index, &meta.name_ids)
+        build_interned_names(py, &meta.unique_names)
     };
 
     Ok(Document {
@@ -825,7 +829,10 @@ fn parse(py: Python<'_>, data: &Bound<'_, PyAny>) -> PyResult<Document> {
     })
 }
 
-/// Compile an XPath expression for repeated evaluation.
+/// Compile an XPath expression for repeated use.
+///
+/// Like re.compile() -- parse the expression once, evaluate many times
+/// across different documents.
 #[pyfunction]
 fn compile(expr: &str) -> PyResult<CompiledXPath> {
     let inner =

From d79d8bea29d810d86480d1bfafd5be54a6b02578 Mon Sep 17 00:00:00 2001
From: Christopher Grainger <chris@amplified.ai>
Date: Fri, 27 Mar 2026 21:34:19 +1100
Subject: [PATCH 3/7] Zero-copy parse, eliminate string copies, optimize hot
 paths

Performance optimizations addressing PyO3 overhead analysis:

1. Zero-copy parse for bytes input (#6): DocumentOwner enum uses
   PyBackedBytes to borrow directly from Python bytes object's
   internal buffer, avoiding a full memcpy of the XML document.
   str input still copies (Python str -> UTF-8 encoding required).

2. Eliminate String intermediaries (#4): All text-returning methods
   (xpath_text, xpath_string, .text, .tail, .get, .keys, .items,
   itertext, text_content, tostring) now return Py<PyString> built
   directly from &str slices. Skips Rust String allocation that
   PyO3 would then copy again into Python.

3. interned_tag_fast (#3): Hot paths (child_tags, descendant_tags,
   make_element_borrowed, make_elements) now accept &IndexWithMeta
   directly, avoiding redundant borrow_dependent() calls in tight
   loops.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 python/simdxml/_core.pyi |   4 +-
 src/lib.rs               | 311 +++++++++++++++++++++++----------------
 2 files changed, 186 insertions(+), 129 deletions(-)

diff --git a/python/simdxml/_core.pyi b/python/simdxml/_core.pyi
index ff298f4..c32837b 100644
--- a/python/simdxml/_core.pyi
+++ b/python/simdxml/_core.pyi
@@ -166,8 +166,8 @@ class CompiledXPath:
 def parse(data: bytes | str) -> Document:
     """Parse XML into a Document.
 
-    Accepts ``bytes`` or ``str``. Returns a Document that can be queried
-    with XPath or traversed element-by-element.
+    Accepts ``bytes`` or ``str``. For bytes input, the buffer is used
+    directly (zero-copy). For str input, the string is encoded to UTF-8.
     """
     ...
 
diff --git a/src/lib.rs b/src/lib.rs
index bf8bdb0..74f2b9f 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,4 +1,5 @@
 use pyo3::exceptions::{PyTypeError, PyValueError};
+use pyo3::pybacked::PyBackedBytes;
 use pyo3::prelude::*;
 use pyo3::types::{PyBytes, PyString};
 use self_cell::self_cell;
@@ -9,19 +10,37 @@ use simdxml::XmlIndex;
 // Self-referential Document: owns bytes + XmlIndex + derived data
 // ---------------------------------------------------------------------------
 
+/// Owner type: either zero-copy from Python bytes or owned from str input.
+enum DocumentOwner {
+    /// Zero-copy: borrows directly from Python bytes object's internal buffer.
+    ZeroCopy(PyBackedBytes),
+    /// Owned: copied from str input (Python str -> UTF-8 bytes).
+    Owned(Vec<u8>),
+}
+
+impl std::ops::Deref for DocumentOwner {
+    type Target = [u8];
+    fn deref(&self) -> &[u8] {
+        match self {
+            DocumentOwner::ZeroCopy(b) => b,
+            DocumentOwner::Owned(v) => v,
+        }
+    }
+}
+
 struct IndexWithMeta<'a> {
     index: XmlIndex<'a>,
     /// parent[i] = tag index of parent element. u32::MAX = root.
     parents: Vec<u32>,
     /// name_id[i] = index into interned names for tag i. usize::MAX = none.
     name_ids: Vec<usize>,
-    /// Unique tag name strings (one per unique name, used to build Python interned strings).
+    /// Unique tag name strings (used to build Python interned strings at parse time).
     unique_names: Vec<String>,
 }
 
 self_cell!(
     struct DocumentInner {
-        owner: Vec<u8>,
+        owner: DocumentOwner,
         #[covariant]
         dependent: IndexWithMeta,
     }
@@ -29,11 +48,12 @@ self_cell!(
 
 /// A parsed XML document.
 ///
-/// Created by :func:`parse`. Use :attr:`root` to get the root element,
-/// or query directly with :meth:`xpath_text` and :meth:`xpath`.
+/// Created by `parse()`. Use `root` to get the root element,
+/// or query directly with `xpath_text()` and `xpath()`.
 #[pyclass]
 struct Document {
     inner: DocumentInner,
+    /// Interned tag names: unique tag name -> Python str (created once at parse).
     interned_names: Vec<Py<PyString>>,
 }
 
@@ -46,15 +66,18 @@ impl Document {
         self.inner.borrow_dependent()
     }
 
-    fn interned_tag(&self, py: Python<'_>, tag_idx: usize) -> Py<PyString> {
-        let meta = self.meta();
+    /// Look up interned tag when you already have a meta borrow (hot path).
+    fn interned_tag_fast(
+        &self,
+        py: Python<'_>,
+        meta: &IndexWithMeta<'_>,
+        tag_idx: usize,
+    ) -> Py<PyString> {
         let name_id = meta.name_ids[tag_idx];
         if name_id < self.interned_names.len() {
             self.interned_names[name_id].clone_ref(py)
         } else {
-            // Fallback for tags without interned names (comments, PIs, etc.)
-            let name = meta.index.tag_name(tag_idx);
-            PyString::new(py, name).unbind()
+            PyString::new(py, meta.index.tag_name(tag_idx)).unbind()
         }
     }
 
@@ -71,7 +94,8 @@ impl Document {
         doc_ref: &Document,
         tag_idx: usize,
     ) -> Element {
-        let cached_tag = doc_ref.interned_tag(py, tag_idx);
+        let meta = doc_ref.meta();
+        let cached_tag = doc_ref.interned_tag_fast(py, meta, tag_idx);
         Element {
             doc: doc.clone_ref(py),
             tag_idx,
@@ -85,8 +109,16 @@ impl Document {
         doc_ref: &Document,
         tag_indices: impl Iterator<Item = usize>,
     ) -> Vec<Element> {
+        let meta = doc_ref.meta();
         tag_indices
-            .map(|idx| Self::make_element_borrowed(py, doc, doc_ref, idx))
+            .map(|idx| {
+                let cached_tag = doc_ref.interned_tag_fast(py, meta, idx);
+                Element {
+                    doc: doc.clone_ref(py),
+                    tag_idx: idx,
+                    cached_tag,
+                }
+            })
             .collect()
     }
 }
@@ -96,22 +128,30 @@ impl Document {
     /// Evaluate an XPath expression and return text content of matches.
     ///
     /// Returns the direct child text of each matching element.
-    fn xpath_text(&self, expr: &str) -> PyResult<Vec<String>> {
+    fn xpath_text(&self, py: Python<'_>, expr: &str) -> PyResult<Vec<Py<PyString>>> {
         let index = self.index();
         let results = index
             .xpath_text(expr)
             .map_err(|e| PyValueError::new_err(e.to_string()))?;
-        Ok(results.into_iter().map(|s| s.to_string()).collect())
+        // Return Py<PyString> directly from &str — avoids Rust String intermediary
+        Ok(results
+            .into_iter()
+            .map(|s| PyString::new(py, s).unbind())
+            .collect())
     }
 
     /// Evaluate an XPath expression and return string-values of matches.
     ///
-    /// Returns all descendant text for each match (XPath string() semantics).
-    fn xpath_string(&self, expr: &str) -> PyResult<Vec<String>> {
+    /// Returns all descendant text for each match (XPath `string()` semantics).
+    fn xpath_string(&self, py: Python<'_>, expr: &str) -> PyResult<Vec<Py<PyString>>> {
         let index = self.index();
-        index
+        let results = index
             .xpath_string(expr)
-            .map_err(|e| PyValueError::new_err(e.to_string()))
+            .map_err(|e| PyValueError::new_err(e.to_string()))?;
+        Ok(results
+            .into_iter()
+            .map(|s| PyString::new(py, &s).unbind())
+            .collect())
     }
 
     /// Evaluate an XPath expression.
@@ -195,21 +235,10 @@ impl Document {
 struct Element {
     doc: Py<Document>,
     tag_idx: usize,
+    /// Cached tag name (interned Python string).
     cached_tag: Py<PyString>,
 }
 
-impl Element {
-    fn with_index<'py, R>(
-        &self,
-        py: Python<'py>,
-        f: impl FnOnce(&XmlIndex<'_>, &IndexWithMeta<'_>) -> R,
-    ) -> R {
-        let doc = self.doc.borrow(py);
-        let meta = doc.meta();
-        f(&meta.index, meta)
-    }
-}
-
 #[pymethods]
 impl Element {
     /// The element's tag name (e.g., 'book', 'title').
@@ -220,48 +249,51 @@ impl Element {
 
     /// Text content before the first child element, or None.
     ///
-    /// For '<p>Hello <b>world</b></p>', p.text is 'Hello '.
+    /// For `<p>Hello <b>world</b></p>`, `p.text` is `'Hello '`.
     #[getter]
-    fn text(&self, py: Python<'_>) -> Option<String> {
-        self.with_index(py, |index, _| {
-            let texts = index.direct_text(self.tag_idx);
-            if texts.is_empty() {
-                return None;
-            }
-            let first = texts[0];
-            if first.is_empty() {
-                None
-            } else {
-                Some(XmlIndex::decode_entities(first).into_owned())
-            }
-        })
+    fn text(&self, py: Python<'_>) -> Option<Py<PyString>> {
+        let doc = self.doc.borrow(py);
+        let index = doc.index();
+        let texts = index.direct_text(self.tag_idx);
+        if texts.is_empty() {
+            return None;
+        }
+        let first = texts[0];
+        if first.is_empty() {
+            None
+        } else {
+            let decoded = XmlIndex::decode_entities(first);
+            Some(PyString::new(py, &decoded).unbind())
+        }
     }
 
     /// Text content after this element's closing tag, or None.
     ///
-    /// For '<p>Hello <b>world</b> more</p>', b.tail is ' more'.
+    /// For `<p>Hello <b>world</b> more</p>`, `b.tail` is `' more'`.
     #[getter]
-    fn tail(&self, py: Python<'_>) -> Option<String> {
-        self.with_index(py, |index, meta| {
-            let parent = meta.parents[self.tag_idx];
-            if parent == u32::MAX {
-                return None;
-            }
-
-            let parent_raw = index.raw_xml(parent as usize);
-            let my_raw = index.raw_xml(self.tag_idx);
+    fn tail(&self, py: Python<'_>) -> Option<Py<PyString>> {
+        let doc = self.doc.borrow(py);
+        let meta = doc.meta();
+        let parent = meta.parents[self.tag_idx];
+        if parent == u32::MAX {
+            return None;
+        }
 
-            if let Some(pos) = parent_raw.find(my_raw) {
-                let after = &parent_raw[pos + my_raw.len()..];
-                if let Some(lt) = after.find('<') {
-                    let text = &after[..lt];
-                    if !text.is_empty() {
-                        return Some(XmlIndex::decode_entities(text).into_owned());
-                    }
+        let index = &meta.index;
+        let parent_raw = index.raw_xml(parent as usize);
+        let my_raw = index.raw_xml(self.tag_idx);
+
+        if let Some(pos) = parent_raw.find(my_raw) {
+            let after = &parent_raw[pos + my_raw.len()..];
+            if let Some(lt) = after.find('<') {
+                let text = &after[..lt];
+                if !text.is_empty() {
+                    let decoded = XmlIndex::decode_entities(text);
+                    return Some(PyString::new(py, &decoded).unbind());
                 }
             }
-            None
-        })
+        }
+        None
     }
 
     /// Dictionary of this element's attributes.
@@ -280,47 +312,48 @@ impl Element {
 
     /// Get an attribute value by name, with optional default.
     #[pyo3(signature = (key, default=None))]
-    fn get(&self, py: Python<'_>, key: &str, default: Option<&str>) -> Option<String> {
-        self.with_index(py, |index, _| {
-            index
-                .get_attribute(self.tag_idx, key)
-                .map(|s| s.to_string())
-                .or_else(|| default.map(|s| s.to_string()))
-        })
+    fn get(&self, py: Python<'_>, key: &str, default: Option<&str>) -> Option<Py<PyString>> {
+        let doc = self.doc.borrow(py);
+        let index = doc.index();
+        index
+            .get_attribute(self.tag_idx, key)
+            .map(|s| PyString::new(py, s).unbind())
+            .or_else(|| default.map(|s| PyString::new(py, s).unbind()))
     }
 
     /// Attribute names.
-    fn keys(&self, py: Python<'_>) -> Vec<String> {
-        self.with_index(py, |index, _| {
-            index
-                .get_all_attribute_names(self.tag_idx)
-                .into_iter()
-                .map(|s| s.to_string())
-                .collect()
-        })
+    fn keys(&self, py: Python<'_>) -> Vec<Py<PyString>> {
+        let doc = self.doc.borrow(py);
+        let index = doc.index();
+        index
+            .get_all_attribute_names(self.tag_idx)
+            .into_iter()
+            .map(|s| PyString::new(py, s).unbind())
+            .collect()
     }
 
     /// (name, value) attribute pairs.
-    fn items(&self, py: Python<'_>) -> Vec<(String, String)> {
-        self.with_index(py, |index, _| {
-            index
-                .get_all_attribute_names(self.tag_idx)
-                .into_iter()
-                .filter_map(|name| {
-                    index
-                        .get_attribute(self.tag_idx, name)
-                        .map(|val| (name.to_string(), val.to_string()))
+    fn items(&self, py: Python<'_>) -> Vec<(Py<PyString>, Py<PyString>)> {
+        let doc = self.doc.borrow(py);
+        let index = doc.index();
+        index
+            .get_all_attribute_names(self.tag_idx)
+            .into_iter()
+            .filter_map(|name| {
+                index.get_attribute(self.tag_idx, name).map(|val| {
+                    (
+                        PyString::new(py, name).unbind(),
+                        PyString::new(py, val).unbind(),
+                    )
                 })
-                .collect()
-        })
+            })
+            .collect()
     }
 
     /// Number of direct child elements.
-    ///
-    /// >>> len(element)
-    /// 3
     fn __len__(&self, py: Python<'_>) -> usize {
-        self.with_index(py, |index, _| index.children(self.tag_idx).len())
+        let doc = self.doc.borrow(py);
+        doc.index().children(self.tag_idx).len()
     }
 
     /// Get a child element by index. Supports negative indexing.
@@ -360,6 +393,7 @@ impl Element {
         let start = self.tag_idx;
         let close = index.matching_close(start).unwrap_or(start);
 
+        // Linear scan over tag range (not index-accelerated).
         let mut descendants = Vec::new();
         for i in (start + 1)..=close {
             let tt = index.tag_type(i);
@@ -379,24 +413,25 @@ impl Element {
 
     /// All direct child tag names as a list.
     ///
-    /// More efficient than [e.tag for e in element] for bulk access.
+    /// More efficient than `[e.tag for e in element]` for bulk access.
     fn child_tags(&self, py: Python<'_>) -> Vec<Py<PyString>> {
         let doc = self.doc.borrow(py);
-        let index = doc.index();
-        index
+        let meta = doc.meta();
+        meta.index
             .children(self.tag_idx)
             .iter()
-            .map(|&child| doc.interned_tag(py, child))
+            .map(|&child| doc.interned_tag_fast(py, meta, child))
             .collect()
     }
 
     /// All descendant tag names, optionally filtered.
     ///
-    /// More efficient than [e.tag for e in element.iter(tag)] for bulk access.
+    /// More efficient than `[e.tag for e in element.iter(tag)]` for bulk access.
     #[pyo3(signature = (tag=None))]
     fn descendant_tags(&self, py: Python<'_>, tag: Option<&str>) -> Vec<Py<PyString>> {
         let doc = self.doc.borrow(py);
-        let index = doc.index();
+        let meta = doc.meta();
+        let index = &meta.index;
         let start = self.tag_idx;
         let close = index.matching_close(start).unwrap_or(start);
 
@@ -406,7 +441,7 @@ impl Element {
             if tt == simdxml::index::TagType::Open || tt == simdxml::index::TagType::SelfClose {
                 match tag {
                     Some(filter) if index.tag_name(i) != filter => {}
-                    _ => result.push(doc.interned_tag(py, i)),
+                    _ => result.push(doc.interned_tag_fast(py, meta, i)),
                 }
             }
         }
@@ -414,17 +449,19 @@ impl Element {
     }
 
     /// All text content within this element, depth-first.
-    fn itertext(&self, py: Python<'_>) -> Vec<String> {
+    fn itertext(&self, py: Python<'_>) -> Vec<Py<PyString>> {
         let doc = self.doc.borrow(py);
         let index = doc.index();
         let mut texts = Vec::new();
-        collect_text(index, self.tag_idx, &mut texts);
+        collect_text_py(py, index, self.tag_idx, &mut texts);
         texts
     }
 
     /// All descendant text concatenated into a single string.
-    fn text_content(&self, py: Python<'_>) -> String {
-        self.with_index(py, |index, _| index.all_text(self.tag_idx))
+    fn text_content(&self, py: Python<'_>) -> Py<PyString> {
+        let doc = self.doc.borrow(py);
+        let text = doc.index().all_text(self.tag_idx);
+        PyString::new(py, &text).unbind()
     }
 
     /// Evaluate an XPath 1.0 expression with this element as context.
@@ -449,7 +486,7 @@ impl Element {
     }
 
     /// Evaluate an XPath expression and return text content of matches.
-    fn xpath_text(&self, py: Python<'_>, expr: &str) -> PyResult<Vec<String>> {
+    fn xpath_text(&self, py: Python<'_>, expr: &str) -> PyResult<Vec<Py<PyString>>> {
         let doc = self.doc.borrow(py);
         let index = doc.index();
         let results = index
@@ -462,15 +499,17 @@ impl Element {
                 XPathNode::Element(idx) => {
                     let dt = index.direct_text(*idx);
                     if !dt.is_empty() {
-                        texts.push(dt.join(""));
+                        // Build PyString directly from &str slices
+                        let joined: String = dt.iter().copied().collect();
+                        texts.push(PyString::new(py, &joined).unbind());
                     }
                 }
                 XPathNode::Text(idx) => {
-                    texts.push(index.text_by_index(*idx).to_string());
+                    texts.push(PyString::new(py, index.text_by_index(*idx)).unbind());
                 }
                 XPathNode::Attribute(tag_idx, _) => {
-                    if let Some(val) = get_first_attribute(index, *tag_idx) {
-                        texts.push(val);
+                    if let Some(s) = get_first_attribute_str(index, *tag_idx) {
+                        texts.push(PyString::new(py, s).unbind());
                     }
                 }
                 _ => {}
@@ -532,9 +571,11 @@ impl Element {
         }
     }
 
-    /// Raw XML for this element (opening through closing tag).
-    fn tostring(&self, py: Python<'_>) -> String {
-        self.with_index(py, |index, _| index.raw_xml(self.tag_idx).to_string())
+    /// Serialize this element to an XML string.
+    fn tostring(&self, py: Python<'_>) -> Py<PyString> {
+        let doc = self.doc.borrow(py);
+        let raw = doc.index().raw_xml(self.tag_idx);
+        PyString::new(py, raw).unbind()
     }
 
     // -- Read-only enforcement --
@@ -641,7 +682,7 @@ impl ElementIterator {
 
 /// A compiled XPath expression for repeated use.
 ///
-/// Like re.compile() -- parse the expression once, evaluate many times
+/// Like `re.compile()` -- parse the expression once, evaluate many times
 /// across different documents.
 #[pyclass]
 struct CompiledXPath {
@@ -650,17 +691,20 @@ struct CompiledXPath {
 
 #[pymethods]
 impl CompiledXPath {
-    /// Evaluate and return text content of matches.
-    fn eval_text(&self, doc: &Document) -> PyResult<Vec<String>> {
+    /// Evaluate and return text content of matching nodes.
+    fn eval_text(&self, py: Python<'_>, doc: &Document) -> PyResult<Vec<Py<PyString>>> {
         let index = doc.index();
         let results = self
             .inner
             .eval_text(index)
             .map_err(|e| PyValueError::new_err(e.to_string()))?;
-        Ok(results.into_iter().map(|s| s.to_string()).collect())
+        Ok(results
+            .into_iter()
+            .map(|s| PyString::new(py, s).unbind())
+            .collect())
     }
 
-    /// Evaluate and return matching Element nodes.
+    /// Evaluate and return matching Element objects.
     fn eval(slf: &Bound<'_, Self>, doc: &Bound<'_, Document>) -> PyResult<Vec<Element>> {
         let this = slf.borrow();
         let doc_ref = doc.borrow();
@@ -683,7 +727,7 @@ impl CompiledXPath {
         ))
     }
 
-    /// Check if any nodes match.
+    /// Check whether any nodes match.
     fn eval_exists(&self, doc: &Document) -> PyResult<bool> {
         let index = doc.index();
         let nodes = self
@@ -693,7 +737,7 @@ impl CompiledXPath {
         Ok(!nodes.is_empty())
     }
 
-    /// Count matching nodes.
+    /// Count the number of matching nodes.
     fn eval_count(&self, doc: &Document) -> PyResult<usize> {
         let index = doc.index();
         let nodes = self
@@ -726,6 +770,13 @@ fn get_first_attribute(index: &XmlIndex<'_>, tag_idx: usize) -> Option<String> {
         .map(|s| s.to_string())
 }
 
+fn get_first_attribute_str<'a>(index: &'a XmlIndex<'_>, tag_idx: usize) -> Option<&'a str> {
+    let names = index.get_all_attribute_names(tag_idx);
+    names
+        .first()
+        .and_then(|name| index.get_attribute(tag_idx, name))
+}
+
 /// Build parent map, name-id map, and unique name list from the public API.
 fn build_meta(index: &XmlIndex<'_>) -> (Vec<u32>, Vec<usize>, Vec<String>) {
     let n = index.tag_count();
@@ -773,15 +824,16 @@ fn build_interned_names(py: Python<'_>, unique_names: &[String]) -> Vec<Py<PyStr
         .collect()
 }
 
-/// Recursively collect text content depth-first (for itertext).
-fn collect_text(index: &XmlIndex<'_>, tag_idx: usize, out: &mut Vec<String>) {
+/// Recursively collect text content depth-first, building PyStrings directly.
+fn collect_text_py(py: Python<'_>, index: &XmlIndex<'_>, tag_idx: usize, out: &mut Vec<Py<PyString>>) {
     for text in index.direct_text(tag_idx) {
         if !text.is_empty() {
-            out.push(XmlIndex::decode_entities(text).into_owned());
+            let decoded = XmlIndex::decode_entities(text);
+            out.push(PyString::new(py, &decoded).unbind());
         }
     }
     for child in index.children(tag_idx) {
-        collect_text(index, child, out);
+        collect_text_py(py, index, child, out);
     }
 }
 
@@ -793,17 +845,22 @@ fn collect_text(index: &XmlIndex<'_>, tag_idx: usize, out: &mut Vec<String>) {
 ///
 /// Accepts bytes or str. Returns a Document that can be queried
 /// with XPath or traversed element-by-element.
+///
+/// For bytes input, the buffer is used directly (zero-copy).
+/// For str input, the string is encoded to UTF-8 bytes.
 #[pyfunction]
 fn parse(py: Python<'_>, data: &Bound<'_, PyAny>) -> PyResult<Document> {
-    let bytes: Vec<u8> = if let Ok(b) = data.cast_exact::<PyBytes>() {
-        b.as_bytes().to_vec()
+    let owner = if data.is_instance_of::<PyBytes>() {
+        // Zero-copy: PyBackedBytes borrows from the Python bytes object
+        let backed: PyBackedBytes = data.extract()?;
+        DocumentOwner::ZeroCopy(backed)
     } else if let Ok(s) = data.extract::<String>() {
-        s.into_bytes()
+        DocumentOwner::Owned(s.into_bytes())
     } else {
         return Err(PyTypeError::new_err("parse() requires bytes or str"));
     };
 
-    let inner = DocumentInner::try_new(bytes, |owner| {
+    let inner = DocumentInner::try_new(owner, |owner| {
         let mut index =
             simdxml::parse(owner).map_err(|e| PyValueError::new_err(e.to_string()))?;
         index.ensure_indices();
@@ -831,7 +888,7 @@ fn parse(py: Python<'_>, data: &Bound<'_, PyAny>) -> PyResult<Document> {
 
 /// Compile an XPath expression for repeated use.
 ///
-/// Like re.compile() -- parse the expression once, evaluate many times
+/// Like `re.compile()` -- parse the expression once, evaluate many times
 /// across different documents.
 #[pyfunction]
 fn compile(expr: &str) -> PyResult<CompiledXPath> {

From 8597d871dc541c174ec483a38cab81481540bbfa Mon Sep 17 00:00:00 2001
From: Christopher Grainger <chris@amplified.ai>
Date: Fri, 27 Mar 2026 21:41:48 +1100
Subject: [PATCH 4/7] Pre-cached iterator, lazy ElementList, O(1) sibling
 lookup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

#1 Iterator pre-caching: ElementIterator pre-builds all (tag_idx,
   cached_tag) pairs in a single Document borrow at creation time.
   __next__ no longer borrows Document — just clone_ref on pre-cached
   values.

#2 Lazy ElementList: Element.xpath() and CompiledXPath.eval() now
   return ElementList — holds one Py<Document> + Vec<usize> of tag
   indices. Elements created on demand via __getitem__/__iter__.
   compiled.eval() for 100K results: 4ms -> 0.07ms (57x faster).
   Supports __len__, __getitem__, __iter__, __bool__, __eq__ (with
   list comparison).

#7 O(1) sibling lookup: child_positions[i] stored in IndexWithMeta
   at parse time. getnext/getprevious use direct index instead of
   linear scan over siblings. O(1) instead of O(siblings).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 python/simdxml/__init__.py          |   2 +
 python/simdxml/_core.pyi            |  20 ++-
 python/simdxml/etree/ElementTree.py |   2 +-
 src/lib.rs                          | 211 +++++++++++++++++++---------
 4 files changed, 167 insertions(+), 68 deletions(-)

diff --git a/python/simdxml/__init__.py b/python/simdxml/__init__.py
index 3f0fb6a..2b7f159 100644
--- a/python/simdxml/__init__.py
+++ b/python/simdxml/__init__.py
@@ -34,6 +34,7 @@
     CompiledXPath,
     Document,
     Element,
+    ElementList,
     compile,
     parse,
 )
@@ -42,6 +43,7 @@
     "CompiledXPath",
     "Document",
     "Element",
+    "ElementList",
     "compile",
     "parse",
 ]
diff --git a/python/simdxml/_core.pyi b/python/simdxml/_core.pyi
index c32837b..33ea939 100644
--- a/python/simdxml/_core.pyi
+++ b/python/simdxml/_core.pyi
@@ -94,10 +94,10 @@ class Element:
     def text_content(self) -> str:
         """All descendant text concatenated into a single string."""
         ...
-    def xpath(self, expr: str) -> list[Element]:
+    def xpath(self, expr: str) -> ElementList:
         """Evaluate an XPath 1.0 expression with this element as context.
 
-        Returns a list of matching Element objects.
+        Returns an ElementList of matching elements (lazy — created on access).
         """
         ...
     def xpath_text(self, expr: str) -> list[str]:
@@ -143,6 +143,18 @@ class Element:
     def __eq__(self, other: object) -> bool: ...
     def __hash__(self) -> int: ...
 
+class ElementList:
+    """A lazy sequence of elements from an XPath query.
+
+    Elements are created on demand when accessed by index or iteration.
+    Holds a single Document reference regardless of result size.
+    """
+
+    def __len__(self) -> int: ...
+    def __getitem__(self, index: int) -> Element: ...
+    def __iter__(self) -> Iterator[Element]: ...
+    def __bool__(self) -> bool: ...
+
 class CompiledXPath:
     """A compiled XPath expression for repeated use.
 
@@ -153,8 +165,8 @@ class CompiledXPath:
     def eval_text(self, doc: Document) -> list[str]:
         """Evaluate and return text content of matching nodes."""
         ...
-    def eval(self, doc: Document) -> list[Element]:
-        """Evaluate and return matching Element objects."""
+    def eval(self, doc: Document) -> ElementList:
+        """Evaluate and return matching elements as an ElementList (lazy)."""
         ...
     def eval_exists(self, doc: Document) -> bool:
         """Check whether any nodes match the expression."""
diff --git a/python/simdxml/etree/ElementTree.py b/python/simdxml/etree/ElementTree.py
index 6f3f7a4..df9cede 100644
--- a/python/simdxml/etree/ElementTree.py
+++ b/python/simdxml/etree/ElementTree.py
@@ -167,6 +167,6 @@ def _findall(
     """Find all matching subelements."""
     xpath = _path_to_xpath(path)
     try:
-        return element.xpath(xpath)
+        return list(element.xpath(xpath))
     except ValueError:
         return []
diff --git a/src/lib.rs b/src/lib.rs
index 74f2b9f..3564c42 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -34,6 +34,8 @@ struct IndexWithMeta<'a> {
     parents: Vec<u32>,
     /// name_id[i] = index into interned names for tag i. usize::MAX = none.
     name_ids: Vec<usize>,
+    /// child_pos[i] = position of tag i within its parent's children list. u32::MAX = N/A.
+    child_positions: Vec<u32>,
     /// Unique tag name strings (used to build Python interned strings at parse time).
     unique_names: Vec<String>,
 }
@@ -103,24 +105,6 @@ impl Document {
         }
     }
 
-    fn make_elements(
-        py: Python<'_>,
-        doc: &Py<Document>,
-        doc_ref: &Document,
-        tag_indices: impl Iterator<Item = usize>,
-    ) -> Vec<Element> {
-        let meta = doc_ref.meta();
-        tag_indices
-            .map(|idx| {
-                let cached_tag = doc_ref.interned_tag_fast(py, meta, idx);
-                Element {
-                    doc: doc.clone_ref(py),
-                    tag_idx: idx,
-                    cached_tag,
-                }
-            })
-            .collect()
-    }
 }
 
 #[pymethods]
@@ -378,11 +362,8 @@ impl Element {
     /// Iterate over direct child elements.
     fn __iter__(&self, py: Python<'_>) -> ElementIterator {
         let doc = self.doc.borrow(py);
-        ElementIterator {
-            doc: self.doc.clone_ref(py),
-            children: doc.index().children(self.tag_idx),
-            pos: 0,
-        }
+        let children = doc.index().children(self.tag_idx);
+        ElementIterator::new(py, &self.doc, children)
     }
 
     /// Iterate descendant elements, optionally filtered by tag name.
@@ -404,11 +385,7 @@ impl Element {
                 }
             }
         }
-        ElementIterator {
-            doc: self.doc.clone_ref(py),
-            children: descendants,
-            pos: 0,
-        }
+        ElementIterator::new(py, &self.doc, descendants)
     }
 
     /// All direct child tag names as a list.
@@ -466,23 +443,25 @@ impl Element {
 
     /// Evaluate an XPath 1.0 expression with this element as context.
     ///
-    /// Returns a list of matching Element objects.
-    fn xpath(&self, py: Python<'_>, expr: &str) -> PyResult<Vec<Element>> {
+    /// Returns an ElementList of matching elements (lazy — elements created on access).
+    fn xpath(&self, py: Python<'_>, expr: &str) -> PyResult<ElementList> {
         let doc = self.doc.borrow(py);
         let index = doc.index();
         let nodes = index
             .xpath_from(expr, self.tag_idx)
             .map_err(|e| PyValueError::new_err(e.to_string()))?;
 
-        Ok(Document::make_elements(
-            py,
-            &self.doc,
-            &doc,
-            nodes.into_iter().filter_map(|n| match n {
+        let indices: Vec<usize> = nodes
+            .into_iter()
+            .filter_map(|n| match n {
                 XPathNode::Element(idx) => Some(idx),
                 _ => None,
-            }),
-        ))
+            })
+            .collect();
+        Ok(ElementList {
+            doc: self.doc.clone_ref(py),
+            indices,
+        })
     }
 
     /// Evaluate an XPath expression and return text content of matches.
@@ -542,8 +521,8 @@ impl Element {
         if parent == u32::MAX {
             return None;
         }
+        let pos = meta.child_positions[self.tag_idx] as usize;
         let siblings = meta.index.children(parent as usize);
-        let pos = siblings.iter().position(|&s| s == self.tag_idx)?;
         siblings
             .get(pos + 1)
             .map(|&idx| Document::make_element_borrowed(py, &self.doc, &doc, idx))
@@ -557,9 +536,9 @@ impl Element {
         if parent == u32::MAX {
             return None;
         }
-        let siblings = meta.index.children(parent as usize);
-        let pos = siblings.iter().position(|&s| s == self.tag_idx)?;
+        let pos = meta.child_positions[self.tag_idx] as usize;
         if pos > 0 {
+            let siblings = meta.index.children(parent as usize);
             Some(Document::make_element_borrowed(
                 py,
                 &self.doc,
@@ -649,16 +628,37 @@ impl Element {
 }
 
 // ---------------------------------------------------------------------------
-// Element iterator
+// ElementIterator — pre-caches interned tags to avoid per-next borrow
 // ---------------------------------------------------------------------------
 
 #[pyclass]
 struct ElementIterator {
     doc: Py<Document>,
-    children: Vec<usize>,
+    /// (tag_idx, cached interned tag) pairs, pre-built at iterator creation.
+    items: Vec<(usize, Py<PyString>)>,
     pos: usize,
 }
 
+impl ElementIterator {
+    /// Build an iterator with all tags pre-cached (one Document borrow total).
+    fn new(py: Python<'_>, doc: &Py<Document>, indices: Vec<usize>) -> Self {
+        let doc_ref = doc.borrow(py);
+        let meta = doc_ref.meta();
+        let items: Vec<(usize, Py<PyString>)> = indices
+            .into_iter()
+            .map(|idx| {
+                let tag = doc_ref.interned_tag_fast(py, meta, idx);
+                (idx, tag)
+            })
+            .collect();
+        ElementIterator {
+            doc: doc.clone_ref(py),
+            items,
+            pos: 0,
+        }
+    }
+}
+
 #[pymethods]
 impl ElementIterator {
     fn __iter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> {
@@ -666,14 +666,94 @@ impl ElementIterator {
     }
 
     fn __next__(&mut self, py: Python<'_>) -> Option<Element> {
-        if self.pos < self.children.len() {
-            let idx = self.children[self.pos];
+        if self.pos < self.items.len() {
+            let (idx, ref cached_tag) = self.items[self.pos];
             self.pos += 1;
-            Some(Document::make_element(py, &self.doc, idx))
+            Some(Element {
+                doc: self.doc.clone_ref(py),
+                tag_idx: idx,
+                cached_tag: cached_tag.clone_ref(py),
+            })
         } else {
             None
         }
     }
+
+    fn __len__(&self) -> usize {
+        self.items.len() - self.pos
+    }
+}
+
+// ---------------------------------------------------------------------------
+// ElementList — lazy sequence returned by xpath/eval (avoids N Element allocs)
+// ---------------------------------------------------------------------------
+
+/// A lazy list of elements. Holds one Document reference and a Vec of tag
+/// indices. Element objects are created on demand when accessed.
+#[pyclass(sequence)]
+struct ElementList {
+    doc: Py<Document>,
+    indices: Vec<usize>,
+}
+
+#[pymethods]
+impl ElementList {
+    fn __len__(&self) -> usize {
+        self.indices.len()
+    }
+
+    fn __getitem__(&self, py: Python<'_>, index: isize) -> PyResult<Element> {
+        let len = self.indices.len() as isize;
+        let i = if index < 0 { len + index } else { index };
+        if i < 0 || i >= len {
+            return Err(pyo3::exceptions::PyIndexError::new_err(
+                "list index out of range",
+            ));
+        }
+        Ok(Document::make_element(py, &self.doc, self.indices[i as usize]))
+    }
+
+    fn __iter__(&self, py: Python<'_>) -> ElementIterator {
+        ElementIterator::new(py, &self.doc, self.indices.clone())
+    }
+
+    fn __bool__(&self) -> bool {
+        !self.indices.is_empty()
+    }
+
+    /// Support == comparison with lists and other ElementLists.
+    fn __eq__(&self, _py: Python<'_>, other: &Bound<'_, pyo3::PyAny>) -> bool {
+        // Compare with empty list
+        if let Ok(list) = other.cast::<pyo3::types::PyList>() {
+            if list.len() != self.indices.len() {
+                return false;
+            }
+            // Element-by-element comparison
+            for (i, item) in list.iter().enumerate() {
+                if let Ok(elem) = item.cast::<Element>() {
+                    let elem_ref = elem.borrow();
+                    if elem_ref.tag_idx != self.indices[i]
+                        || !elem_ref.doc.is(&self.doc)
+                    {
+                        return false;
+                    }
+                } else {
+                    return false;
+                }
+            }
+            return true;
+        }
+        // Compare with another ElementList
+        if let Ok(other_list) = other.cast::<ElementList>() {
+            let other_ref = other_list.borrow();
+            return self.doc.is(&other_ref.doc) && self.indices == other_ref.indices;
+        }
+        false
+    }
+
+    fn __repr__(&self) -> String {
+        format!("ElementList(len={})", self.indices.len())
+    }
 }
 
 // ---------------------------------------------------------------------------
@@ -704,8 +784,8 @@ impl CompiledXPath {
             .collect())
     }
 
-    /// Evaluate and return matching Element objects.
-    fn eval(slf: &Bound<'_, Self>, doc: &Bound<'_, Document>) -> PyResult<Vec<Element>> {
+    /// Evaluate and return matching elements as an ElementList (lazy).
+    fn eval(slf: &Bound<'_, Self>, doc: &Bound<'_, Document>) -> PyResult<ElementList> {
         let this = slf.borrow();
         let doc_ref = doc.borrow();
         let doc_py: Py<Document> = doc.clone().unbind();
@@ -715,16 +795,17 @@ impl CompiledXPath {
             .eval(index)
             .map_err(|e| PyValueError::new_err(e.to_string()))?;
 
-        let py = slf.py();
-        Ok(Document::make_elements(
-            py,
-            &doc_py,
-            &doc_ref,
-            nodes.into_iter().filter_map(|n| match n {
+        let indices: Vec<usize> = nodes
+            .into_iter()
+            .filter_map(|n| match n {
                 XPathNode::Element(idx) => Some(idx),
                 _ => None,
-            }),
-        ))
+            })
+            .collect();
+        Ok(ElementList {
+            doc: doc_py,
+            indices,
+        })
     }
 
     /// Check whether any nodes match.
@@ -777,17 +858,19 @@ fn get_first_attribute_str<'a>(index: &'a XmlIndex<'_>, tag_idx: usize) -> Optio
         .and_then(|name| index.get_attribute(tag_idx, name))
 }
 
-/// Build parent map, name-id map, and unique name list from the public API.
-fn build_meta(index: &XmlIndex<'_>) -> (Vec<u32>, Vec<usize>, Vec<String>) {
+/// Build parent map, child positions, name-id map, and unique name list.
+fn build_meta(index: &XmlIndex<'_>) -> (Vec<u32>, Vec<u32>, Vec<usize>, Vec<String>) {
     let n = index.tag_count();
 
-    // Parent map
+    // Parent map + child position (position within parent's children list).
     let mut parents = vec![u32::MAX; n];
+    let mut child_positions = vec![u32::MAX; n];
     for i in 0..n {
         if index.tag_type(i) == simdxml::index::TagType::Open {
-            for child in index.children(i) {
-                if child < n {
-                    parents[child] = i as u32;
+            for (pos, child) in index.children(i).iter().enumerate() {
+                if *child < n {
+                    parents[*child] = i as u32;
+                    child_positions[*child] = pos as u32;
                 }
             }
         }
@@ -813,7 +896,7 @@ fn build_meta(index: &XmlIndex<'_>) -> (Vec<u32>, Vec<usize>, Vec<String>) {
         }
     }
 
-    (parents, name_ids, unique_names)
+    (parents, child_positions, name_ids, unique_names)
 }
 
 /// Build interned Python strings from the unique name list.
@@ -865,11 +948,12 @@ fn parse(py: Python<'_>, data: &Bound<'_, PyAny>) -> PyResult<Document> {
             simdxml::parse(owner).map_err(|e| PyValueError::new_err(e.to_string()))?;
         index.ensure_indices();
         index.build_name_index();
-        let (parents, name_ids, unique_names) = build_meta(&index);
+        let (parents, child_positions, name_ids, unique_names) = build_meta(&index);
         Ok::<_, PyErr>(IndexWithMeta {
             index,
             parents,
             name_ids,
+            child_positions,
             unique_names,
         })
     })?;
@@ -905,6 +989,7 @@ fn compile(expr: &str) -> PyResult<CompiledXPath> {
 fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::<Document>()?;
     m.add_class::<Element>()?;
+    m.add_class::<ElementList>()?;
     m.add_class::<CompiledXPath>()?;
     m.add_function(wrap_pyfunction!(parse, m)?)?;
     m.add_function(wrap_pyfunction!(compile, m)?)?;

From 3eb36bdb661038634b2c93fb300e65483c16f724 Mon Sep 17 00:00:00 2001
From: Christopher Grainger <chris@amplified.ai>
Date: Fri, 27 Mar 2026 22:01:50 +1100
Subject: [PATCH 5/7] Use upstream APIs: drop build_meta, proper tail/text,
 zero-alloc navigation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Major refactor to use simdxml's python-bindings-api branch directly:

- Drop IndexWithMeta entirely — no more custom parents, name_ids,
  child_positions, or unique_names. self_cell dependent is now just
  XmlIndex directly.
- .text uses upstream direct_text_first() — zero allocation
- .tail uses upstream tail_text() — O(log n) binary search instead
  of O(n) substring search through raw XML
- .getparent() uses upstream parent() — direct array lookup
- .getnext()/.getprevious() use upstream child_position() + child_at()
- __len__ uses upstream child_count() — zero allocation
- __getitem__ uses upstream child_at() — zero allocation
- child_tags/descendant_tags use upstream child_slice() — zero alloc
- attrib/keys/items use upstream attributes() — single-pass parsing
- Tag interning built from upstream name_ids/name_table (no rebuild)
- Cargo.toml: simdxml dependency points to git branch (temporary,
  will switch to crates.io release)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 Cargo.toml |   2 +-
 src/lib.rs | 393 ++++++++++++++++++-----------------------------------
 2 files changed, 136 insertions(+), 259 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 44cf957..f5cbfa6 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,5 +9,5 @@ crate-type = ["cdylib"]
 
 [dependencies]
 pyo3 = { version = "0.28", features = ["extension-module", "abi3-py39"] }
-simdxml = "0.1"
+simdxml = { git = "https://github.com/simdxml/simdxml", branch = "python-bindings-api" }
 self_cell = "1"
diff --git a/src/lib.rs b/src/lib.rs
index 3564c42..a1ed1a6 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -7,14 +7,12 @@ use simdxml::xpath::XPathNode;
 use simdxml::XmlIndex;
 
 // ---------------------------------------------------------------------------
-// Self-referential Document: owns bytes + XmlIndex + derived data
+// Self-referential Document: owns bytes + XmlIndex
 // ---------------------------------------------------------------------------
 
 /// Owner type: either zero-copy from Python bytes or owned from str input.
 enum DocumentOwner {
-    /// Zero-copy: borrows directly from Python bytes object's internal buffer.
     ZeroCopy(PyBackedBytes),
-    /// Owned: copied from str input (Python str -> UTF-8 bytes).
     Owned(Vec<u8>),
 }
 
@@ -28,23 +26,11 @@ impl std::ops::Deref for DocumentOwner {
     }
 }
 
-struct IndexWithMeta<'a> {
-    index: XmlIndex<'a>,
-    /// parent[i] = tag index of parent element. u32::MAX = root.
-    parents: Vec<u32>,
-    /// name_id[i] = index into interned names for tag i. usize::MAX = none.
-    name_ids: Vec<usize>,
-    /// child_pos[i] = position of tag i within its parent's children list. u32::MAX = N/A.
-    child_positions: Vec<u32>,
-    /// Unique tag name strings (used to build Python interned strings at parse time).
-    unique_names: Vec<String>,
-}
-
 self_cell!(
     struct DocumentInner {
         owner: DocumentOwner,
         #[covariant]
-        dependent: IndexWithMeta,
+        dependent: XmlIndex,
     }
 );
 
@@ -55,56 +41,46 @@ self_cell!(
 #[pyclass]
 struct Document {
     inner: DocumentInner,
-    /// Interned tag names: unique tag name -> Python str (created once at parse).
+    /// Interned tag names: name_id -> Python str (created once at parse).
     interned_names: Vec<Py<PyString>>,
 }
 
 impl Document {
     fn index(&self) -> &XmlIndex<'_> {
-        &self.inner.borrow_dependent().index
-    }
-
-    fn meta(&self) -> &IndexWithMeta<'_> {
         self.inner.borrow_dependent()
     }
 
-    /// Look up interned tag when you already have a meta borrow (hot path).
-    fn interned_tag_fast(
-        &self,
-        py: Python<'_>,
-        meta: &IndexWithMeta<'_>,
-        tag_idx: usize,
-    ) -> Py<PyString> {
-        let name_id = meta.name_ids[tag_idx];
-        if name_id < self.interned_names.len() {
-            self.interned_names[name_id].clone_ref(py)
-        } else {
-            PyString::new(py, meta.index.tag_name(tag_idx)).unbind()
+    /// Look up interned tag name. Uses upstream name_ids directly.
+    fn interned_tag(&self, py: Python<'_>, index: &XmlIndex<'_>, tag_idx: usize) -> Py<PyString> {
+        if tag_idx < index.name_ids.len() {
+            let name_id = index.name_ids[tag_idx];
+            if (name_id as usize) < self.interned_names.len() && name_id != u16::MAX {
+                return self.interned_names[name_id as usize].clone_ref(py);
+            }
         }
+        // Fallback for tags without interned names (comments, PIs, etc.)
+        PyString::new(py, index.tag_name(tag_idx)).unbind()
     }
 
-    /// Create an Element when you don't already hold a borrow.
     fn make_element(py: Python<'_>, doc: &Py<Document>, tag_idx: usize) -> Element {
         let doc_ref = doc.borrow(py);
         Self::make_element_borrowed(py, doc, &doc_ref, tag_idx)
     }
 
-    /// Create an Element when you already hold a borrow (avoids double-borrow).
     fn make_element_borrowed(
         py: Python<'_>,
         doc: &Py<Document>,
         doc_ref: &Document,
         tag_idx: usize,
     ) -> Element {
-        let meta = doc_ref.meta();
-        let cached_tag = doc_ref.interned_tag_fast(py, meta, tag_idx);
+        let index = doc_ref.index();
+        let cached_tag = doc_ref.interned_tag(py, index, tag_idx);
         Element {
             doc: doc.clone_ref(py),
             tag_idx,
             cached_tag,
         }
     }
-
 }
 
 #[pymethods]
@@ -117,7 +93,6 @@ impl Document {
         let results = index
             .xpath_text(expr)
             .map_err(|e| PyValueError::new_err(e.to_string()))?;
-        // Return Py<PyString> directly from &str — avoids Rust String intermediary
         Ok(results
             .into_iter()
             .map(|s| PyString::new(py, s).unbind())
@@ -162,8 +137,9 @@ impl Document {
                     result.append(text)?;
                 }
                 XPathNode::Attribute(tag_idx, _) => {
-                    if let Some(val) = get_first_attribute(index, *tag_idx) {
-                        result.append(val)?;
+                    let attrs = index.attributes(*tag_idx);
+                    if let Some((_, val)) = attrs.first() {
+                        result.append(*val)?;
                     }
                 }
                 XPathNode::Namespace(_, _) => {}
@@ -219,7 +195,6 @@ impl Document {
 struct Element {
     doc: Py<Document>,
     tag_idx: usize,
-    /// Cached tag name (interned Python string).
     cached_tag: Py<PyString>,
 }
 
@@ -238,17 +213,11 @@ impl Element {
     fn text(&self, py: Python<'_>) -> Option<Py<PyString>> {
         let doc = self.doc.borrow(py);
         let index = doc.index();
-        let texts = index.direct_text(self.tag_idx);
-        if texts.is_empty() {
-            return None;
-        }
-        let first = texts[0];
-        if first.is_empty() {
-            None
-        } else {
-            let decoded = XmlIndex::decode_entities(first);
-            Some(PyString::new(py, &decoded).unbind())
-        }
+        // Uses upstream direct_text_first — zero-alloc, no Vec
+        index.direct_text_first(self.tag_idx).map(|s| {
+            let decoded = XmlIndex::decode_entities(s);
+            PyString::new(py, &decoded).unbind()
+        })
     }
 
     /// Text content after this element's closing tag, or None.
@@ -257,27 +226,12 @@ impl Element {
     #[getter]
     fn tail(&self, py: Python<'_>) -> Option<Py<PyString>> {
         let doc = self.doc.borrow(py);
-        let meta = doc.meta();
-        let parent = meta.parents[self.tag_idx];
-        if parent == u32::MAX {
-            return None;
-        }
-
-        let index = &meta.index;
-        let parent_raw = index.raw_xml(parent as usize);
-        let my_raw = index.raw_xml(self.tag_idx);
-
-        if let Some(pos) = parent_raw.find(my_raw) {
-            let after = &parent_raw[pos + my_raw.len()..];
-            if let Some(lt) = after.find('<') {
-                let text = &after[..lt];
-                if !text.is_empty() {
-                    let decoded = XmlIndex::decode_entities(text);
-                    return Some(PyString::new(py, &decoded).unbind());
-                }
-            }
-        }
-        None
+        let index = doc.index();
+        // Uses upstream tail_text — proper implementation using text_ranges
+        index.tail_text(self.tag_idx).map(|s| {
+            let decoded = XmlIndex::decode_entities(s);
+            PyString::new(py, &decoded).unbind()
+        })
     }
 
     /// Dictionary of this element's attributes.
@@ -286,10 +240,9 @@ impl Element {
         let doc = self.doc.borrow(py);
         let index = doc.index();
         let dict = pyo3::types::PyDict::new(py);
-        for name in index.get_all_attribute_names(self.tag_idx) {
-            if let Some(val) = index.get_attribute(self.tag_idx, name) {
-                dict.set_item(name, val)?;
-            }
+        // Single-pass attribute parsing via upstream attributes()
+        for (name, val) in index.attributes(self.tag_idx) {
+            dict.set_item(name, val)?;
         }
         Ok(dict.unbind())
     }
@@ -310,9 +263,9 @@ impl Element {
         let doc = self.doc.borrow(py);
         let index = doc.index();
         index
-            .get_all_attribute_names(self.tag_idx)
+            .attributes(self.tag_idx)
             .into_iter()
-            .map(|s| PyString::new(py, s).unbind())
+            .map(|(name, _)| PyString::new(py, name).unbind())
             .collect()
     }
 
@@ -321,49 +274,50 @@ impl Element {
         let doc = self.doc.borrow(py);
         let index = doc.index();
         index
-            .get_all_attribute_names(self.tag_idx)
+            .attributes(self.tag_idx)
             .into_iter()
-            .filter_map(|name| {
-                index.get_attribute(self.tag_idx, name).map(|val| {
-                    (
-                        PyString::new(py, name).unbind(),
-                        PyString::new(py, val).unbind(),
-                    )
-                })
+            .map(|(name, val)| {
+                (
+                    PyString::new(py, name).unbind(),
+                    PyString::new(py, val).unbind(),
+                )
             })
             .collect()
     }
 
-    /// Number of direct child elements.
+    /// Number of direct child elements (zero allocation).
     fn __len__(&self, py: Python<'_>) -> usize {
         let doc = self.doc.borrow(py);
-        doc.index().children(self.tag_idx).len()
+        doc.index().child_count(self.tag_idx)
     }
 
     /// Get a child element by index. Supports negative indexing.
     fn __getitem__(&self, py: Python<'_>, index: isize) -> PyResult<Element> {
         let doc = self.doc.borrow(py);
-        let children = doc.index().children(self.tag_idx);
-        let len = children.len() as isize;
+        let idx = doc.index();
+        let len = idx.child_count(self.tag_idx) as isize;
         let i = if index < 0 { len + index } else { index };
         if i < 0 || i >= len {
             return Err(pyo3::exceptions::PyIndexError::new_err(
                 "element index out of range",
             ));
         }
-        Ok(Document::make_element_borrowed(
-            py,
-            &self.doc,
-            &doc,
-            children[i as usize],
-        ))
+        let child = idx.child_at(self.tag_idx, i as usize).ok_or_else(|| {
+            pyo3::exceptions::PyIndexError::new_err("element index out of range")
+        })?;
+        Ok(Document::make_element_borrowed(py, &self.doc, &doc, child))
     }
 
     /// Iterate over direct child elements.
     fn __iter__(&self, py: Python<'_>) -> ElementIterator {
         let doc = self.doc.borrow(py);
-        let children = doc.index().children(self.tag_idx);
-        ElementIterator::new(py, &self.doc, children)
+        let index = doc.index();
+        let children: Vec<usize> = index
+            .child_slice(self.tag_idx)
+            .iter()
+            .map(|&c| c as usize)
+            .collect();
+        ElementIterator::new(py, &self.doc, &doc, children)
     }
 
     /// Iterate descendant elements, optionally filtered by tag name.
@@ -374,7 +328,6 @@ impl Element {
         let start = self.tag_idx;
         let close = index.matching_close(start).unwrap_or(start);
 
-        // Linear scan over tag range (not index-accelerated).
         let mut descendants = Vec::new();
         for i in (start + 1)..=close {
             let tt = index.tag_type(i);
@@ -385,30 +338,25 @@ impl Element {
                 }
             }
         }
-        ElementIterator::new(py, &self.doc, descendants)
+        ElementIterator::new(py, &self.doc, &doc, descendants)
     }
 
-    /// All direct child tag names as a list.
-    ///
-    /// More efficient than `[e.tag for e in element]` for bulk access.
+    /// All direct child tag names as a list (single FFI call, interned).
     fn child_tags(&self, py: Python<'_>) -> Vec<Py<PyString>> {
         let doc = self.doc.borrow(py);
-        let meta = doc.meta();
-        meta.index
-            .children(self.tag_idx)
+        let index = doc.index();
+        index
+            .child_slice(self.tag_idx)
             .iter()
-            .map(|&child| doc.interned_tag_fast(py, meta, child))
+            .map(|&child| doc.interned_tag(py, index, child as usize))
             .collect()
     }
 
     /// All descendant tag names, optionally filtered.
-    ///
-    /// More efficient than `[e.tag for e in element.iter(tag)]` for bulk access.
     #[pyo3(signature = (tag=None))]
     fn descendant_tags(&self, py: Python<'_>, tag: Option<&str>) -> Vec<Py<PyString>> {
         let doc = self.doc.borrow(py);
-        let meta = doc.meta();
-        let index = &meta.index;
+        let index = doc.index();
         let start = self.tag_idx;
         let close = index.matching_close(start).unwrap_or(start);
 
@@ -418,7 +366,7 @@ impl Element {
             if tt == simdxml::index::TagType::Open || tt == simdxml::index::TagType::SelfClose {
                 match tag {
                     Some(filter) if index.tag_name(i) != filter => {}
-                    _ => result.push(doc.interned_tag_fast(py, meta, i)),
+                    _ => result.push(doc.interned_tag(py, index, i)),
                 }
             }
         }
@@ -442,8 +390,6 @@ impl Element {
     }
 
     /// Evaluate an XPath 1.0 expression with this element as context.
-    ///
-    /// Returns an ElementList of matching elements (lazy — elements created on access).
     fn xpath(&self, py: Python<'_>, expr: &str) -> PyResult<ElementList> {
         let doc = self.doc.borrow(py);
         let index = doc.index();
@@ -476,19 +422,17 @@ impl Element {
         for node in &results {
             match node {
                 XPathNode::Element(idx) => {
-                    let dt = index.direct_text(*idx);
-                    if !dt.is_empty() {
-                        // Build PyString directly from &str slices
-                        let joined: String = dt.iter().copied().collect();
-                        texts.push(PyString::new(py, &joined).unbind());
+                    if let Some(first) = index.direct_text_first(*idx) {
+                        texts.push(PyString::new(py, first).unbind());
                     }
                 }
                 XPathNode::Text(idx) => {
                     texts.push(PyString::new(py, index.text_by_index(*idx)).unbind());
                 }
                 XPathNode::Attribute(tag_idx, _) => {
-                    if let Some(s) = get_first_attribute_str(index, *tag_idx) {
-                        texts.push(PyString::new(py, s).unbind());
+                    let attrs = index.attributes(*tag_idx);
+                    if let Some((_, val)) = attrs.first() {
+                        texts.push(PyString::new(py, val).unbind());
                     }
                 }
                 _ => {}
@@ -500,54 +444,36 @@ impl Element {
     /// Parent element, or None for root.
     fn getparent(&self, py: Python<'_>) -> Option<Element> {
         let doc = self.doc.borrow(py);
-        let parent = doc.meta().parents[self.tag_idx];
-        if parent == u32::MAX {
-            None
-        } else {
-            Some(Document::make_element_borrowed(
-                py,
-                &self.doc,
-                &doc,
-                parent as usize,
-            ))
-        }
+        let index = doc.index();
+        // Uses upstream parent() directly
+        index
+            .parent(self.tag_idx)
+            .map(|p| Document::make_element_borrowed(py, &self.doc, &doc, p))
     }
 
     /// Next sibling element, or None.
     fn getnext(&self, py: Python<'_>) -> Option<Element> {
         let doc = self.doc.borrow(py);
-        let meta = doc.meta();
-        let parent = meta.parents[self.tag_idx];
-        if parent == u32::MAX {
-            return None;
-        }
-        let pos = meta.child_positions[self.tag_idx] as usize;
-        let siblings = meta.index.children(parent as usize);
-        siblings
-            .get(pos + 1)
-            .map(|&idx| Document::make_element_borrowed(py, &self.doc, &doc, idx))
+        let index = doc.index();
+        let pos = index.child_position(self.tag_idx)?;
+        let parent = index.parent(self.tag_idx)?;
+        index
+            .child_at(parent, pos + 1)
+            .map(|idx| Document::make_element_borrowed(py, &self.doc, &doc, idx))
     }
 
     /// Previous sibling element, or None.
     fn getprevious(&self, py: Python<'_>) -> Option<Element> {
         let doc = self.doc.borrow(py);
-        let meta = doc.meta();
-        let parent = meta.parents[self.tag_idx];
-        if parent == u32::MAX {
+        let index = doc.index();
+        let pos = index.child_position(self.tag_idx)?;
+        if pos == 0 {
             return None;
         }
-        let pos = meta.child_positions[self.tag_idx] as usize;
-        if pos > 0 {
-            let siblings = meta.index.children(parent as usize);
-            Some(Document::make_element_borrowed(
-                py,
-                &self.doc,
-                &doc,
-                siblings[pos - 1],
-            ))
-        } else {
-            None
-        }
+        let parent = index.parent(self.tag_idx)?;
+        index
+            .child_at(parent, pos - 1)
+            .map(|idx| Document::make_element_borrowed(py, &self.doc, &doc, idx))
     }
 
     /// Serialize this element to an XML string.
@@ -634,20 +560,17 @@ impl Element {
 #[pyclass]
 struct ElementIterator {
     doc: Py<Document>,
-    /// (tag_idx, cached interned tag) pairs, pre-built at iterator creation.
     items: Vec<(usize, Py<PyString>)>,
     pos: usize,
 }
 
 impl ElementIterator {
-    /// Build an iterator with all tags pre-cached (one Document borrow total).
-    fn new(py: Python<'_>, doc: &Py<Document>, indices: Vec<usize>) -> Self {
-        let doc_ref = doc.borrow(py);
-        let meta = doc_ref.meta();
+    fn new(py: Python<'_>, doc: &Py<Document>, doc_ref: &Document, indices: Vec<usize>) -> Self {
+        let index = doc_ref.index();
         let items: Vec<(usize, Py<PyString>)> = indices
             .into_iter()
             .map(|idx| {
-                let tag = doc_ref.interned_tag_fast(py, meta, idx);
+                let tag = doc_ref.interned_tag(py, index, idx);
                 (idx, tag)
             })
             .collect();
@@ -685,7 +608,7 @@ impl ElementIterator {
 }
 
 // ---------------------------------------------------------------------------
-// ElementList — lazy sequence returned by xpath/eval (avoids N Element allocs)
+// ElementList — lazy sequence returned by xpath/eval
 // ---------------------------------------------------------------------------
 
 /// A lazy list of elements. Holds one Document reference and a Vec of tag
@@ -710,31 +633,31 @@ impl ElementList {
                 "list index out of range",
             ));
         }
-        Ok(Document::make_element(py, &self.doc, self.indices[i as usize]))
+        Ok(Document::make_element(
+            py,
+            &self.doc,
+            self.indices[i as usize],
+        ))
     }
 
     fn __iter__(&self, py: Python<'_>) -> ElementIterator {
-        ElementIterator::new(py, &self.doc, self.indices.clone())
+        let doc_ref = self.doc.borrow(py);
+        ElementIterator::new(py, &self.doc, &doc_ref, self.indices.clone())
     }
 
     fn __bool__(&self) -> bool {
         !self.indices.is_empty()
     }
 
-    /// Support == comparison with lists and other ElementLists.
     fn __eq__(&self, _py: Python<'_>, other: &Bound<'_, pyo3::PyAny>) -> bool {
-        // Compare with empty list
         if let Ok(list) = other.cast::<pyo3::types::PyList>() {
             if list.len() != self.indices.len() {
                 return false;
             }
-            // Element-by-element comparison
             for (i, item) in list.iter().enumerate() {
                 if let Ok(elem) = item.cast::<Element>() {
                     let elem_ref = elem.borrow();
-                    if elem_ref.tag_idx != self.indices[i]
-                        || !elem_ref.doc.is(&self.doc)
-                    {
+                    if elem_ref.tag_idx != self.indices[i] || !elem_ref.doc.is(&self.doc) {
                         return false;
                     }
                 } else {
@@ -743,7 +666,6 @@ impl ElementList {
             }
             return true;
         }
-        // Compare with another ElementList
         if let Ok(other_list) = other.cast::<ElementList>() {
             let other_ref = other_list.borrow();
             return self.doc.is(&other_ref.doc) && self.indices == other_ref.indices;
@@ -843,80 +765,22 @@ fn readonly_error() -> PyErr {
     )
 }
 
-fn get_first_attribute(index: &XmlIndex<'_>, tag_idx: usize) -> Option<String> {
-    let names = index.get_all_attribute_names(tag_idx);
-    names
-        .first()
-        .and_then(|name| index.get_attribute(tag_idx, name))
-        .map(|s| s.to_string())
-}
-
-fn get_first_attribute_str<'a>(index: &'a XmlIndex<'_>, tag_idx: usize) -> Option<&'a str> {
-    let names = index.get_all_attribute_names(tag_idx);
-    names
-        .first()
-        .and_then(|name| index.get_attribute(tag_idx, name))
-}
-
-/// Build parent map, child positions, name-id map, and unique name list.
-fn build_meta(index: &XmlIndex<'_>) -> (Vec<u32>, Vec<u32>, Vec<usize>, Vec<String>) {
-    let n = index.tag_count();
-
-    // Parent map + child position (position within parent's children list).
-    let mut parents = vec![u32::MAX; n];
-    let mut child_positions = vec![u32::MAX; n];
-    for i in 0..n {
-        if index.tag_type(i) == simdxml::index::TagType::Open {
-            for (pos, child) in index.children(i).iter().enumerate() {
-                if *child < n {
-                    parents[*child] = i as u32;
-                    child_positions[*child] = pos as u32;
-                }
-            }
-        }
-    }
-
-    // Name interning: borrow tag names from the index to avoid extra clones.
-    let mut unique_names: Vec<String> = Vec::new();
-    let mut name_map: std::collections::HashMap<&str, usize> = std::collections::HashMap::new();
-    let mut name_ids = vec![usize::MAX; n];
-
-    for i in 0..n {
-        let tt = index.tag_type(i);
-        if tt == simdxml::index::TagType::Open || tt == simdxml::index::TagType::SelfClose {
-            let name = index.tag_name(i);
-            if !name.is_empty() {
-                let id = *name_map.entry(name).or_insert_with(|| {
-                    let id = unique_names.len();
-                    unique_names.push(name.to_string());
-                    id
-                });
-                name_ids[i] = id;
-            }
-        }
-    }
-
-    (parents, child_positions, name_ids, unique_names)
-}
-
-/// Build interned Python strings from the unique name list.
-fn build_interned_names(py: Python<'_>, unique_names: &[String]) -> Vec<Py<PyString>> {
-    unique_names
-        .iter()
-        .map(|s| PyString::new(py, s).unbind())
-        .collect()
-}
-
 /// Recursively collect text content depth-first, building PyStrings directly.
-fn collect_text_py(py: Python<'_>, index: &XmlIndex<'_>, tag_idx: usize, out: &mut Vec<Py<PyString>>) {
+fn collect_text_py(
+    py: Python<'_>,
+    index: &XmlIndex<'_>,
+    tag_idx: usize,
+    out: &mut Vec<Py<PyString>>,
+) {
     for text in index.direct_text(tag_idx) {
         if !text.is_empty() {
             let decoded = XmlIndex::decode_entities(text);
             out.push(PyString::new(py, &decoded).unbind());
         }
     }
-    for child in index.children(tag_idx) {
-        collect_text_py(py, index, child, out);
+    // Use child_slice for zero-alloc child enumeration
+    for &child in index.child_slice(tag_idx) {
+        collect_text_py(py, index, child as usize, out);
     }
 }
 
@@ -926,15 +790,11 @@ fn collect_text_py(py: Python<'_>, index: &XmlIndex<'_>, tag_idx: usize, out: &m
 
 /// Parse XML into a Document.
 ///
-/// Accepts bytes or str. Returns a Document that can be queried
-/// with XPath or traversed element-by-element.
-///
-/// For bytes input, the buffer is used directly (zero-copy).
+/// Accepts bytes or str. For bytes input, the buffer is used directly (zero-copy).
 /// For str input, the string is encoded to UTF-8 bytes.
 #[pyfunction]
 fn parse(py: Python<'_>, data: &Bound<'_, PyAny>) -> PyResult<Document> {
     let owner = if data.is_instance_of::<PyBytes>() {
-        // Zero-copy: PyBackedBytes borrows from the Python bytes object
         let backed: PyBackedBytes = data.extract()?;
         DocumentOwner::ZeroCopy(backed)
     } else if let Ok(s) = data.extract::<String>() {
@@ -948,20 +808,37 @@ fn parse(py: Python<'_>, data: &Bound<'_, PyAny>) -> PyResult<Document> {
             simdxml::parse(owner).map_err(|e| PyValueError::new_err(e.to_string()))?;
         index.ensure_indices();
         index.build_name_index();
-        let (parents, child_positions, name_ids, unique_names) = build_meta(&index);
-        Ok::<_, PyErr>(IndexWithMeta {
-            index,
-            parents,
-            name_ids,
-            child_positions,
-            unique_names,
-        })
+        Ok::<_, PyErr>(index)
     })?;
 
-    // Build interned Python strings (one copy per unique name)
+    // Build interned Python strings from upstream's name_table.
+    // name_table[id] = (byte_offset, length) into input. We need to resolve
+    // these to actual strings. Since input is private, we find one tag per
+    // name_id and use tag_name() on it.
     let interned_names = {
-        let meta = inner.borrow_dependent();
-        build_interned_names(py, &meta.unique_names)
+        let index = inner.borrow_dependent();
+        let n_names = index.name_table.len();
+        let mut names: Vec<Py<PyString>> = Vec::with_capacity(n_names);
+        let mut found = vec![false; n_names];
+
+        for i in 0..index.tag_count() {
+            if index.name_ids.is_empty() {
+                break;
+            }
+            let nid = index.name_ids[i];
+            if nid != u16::MAX && (nid as usize) < n_names && !found[nid as usize] {
+                found[nid as usize] = true;
+                // Ensure we have enough slots
+                while names.len() <= nid as usize {
+                    names.push(PyString::new(py, "").unbind());
+                }
+                names[nid as usize] = PyString::new(py, index.tag_name(i)).unbind();
+            }
+            if found.iter().all(|&f| f) {
+                break; // All names found
+            }
+        }
+        names
     };
 
     Ok(Document {

From 0d418241bf667387f6d64f98a528268d50d39930 Mon Sep 17 00:00:00 2001
From: Christopher Grainger <chris@amplified.ai>
Date: Fri, 27 Mar 2026 22:09:33 +1100
Subject: [PATCH 6/7] Update README benchmarks with final v0.2.0 numbers

All numbers improved from upstream API integration:
- Parse: 2.2-3.1x vs lxml (was 1.4-1.8x)
- XPath text: 10-23x vs lxml (was 1.8-14x)
- XPath predicates: up to 42x vs lxml
- Traversal: 3-17x vs lxml via batch API
- No regressions vs lxml on any benchmark

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 README.md | 70 +++++++++++++++++++++++--------------------------------
 1 file changed, 29 insertions(+), 41 deletions(-)

diff --git a/README.md b/README.md
index 0f04366..c4359b2 100644
--- a/README.md
+++ b/README.md
@@ -129,64 +129,52 @@ Apple Silicon, Python 3.14, lxml 6.0. GC disabled during timing, 3 warmup +
 
 ### Parse
 
-`simdxml.parse()` eagerly builds structural indices (CSR, name posting,
-parent map). lxml's `fromstring()` builds a DOM tree without precomputed
-query indices. simdxml front-loads more work into parse so queries are faster
--- both numbers are real, the trade-off depends on your workload.
+`simdxml.parse()` eagerly builds structural indices (CSR, name posting).
+lxml's `fromstring()` builds a DOM tree without precomputed query indices.
+simdxml front-loads more work into parse so queries are faster — both numbers
+are real, the trade-off depends on your workload.
 
 | Corpus | Size | simdxml | lxml | vs lxml | vs stdlib |
 |--------|------|---------|------|---------|-----------|
-| Catalog (data) | 1.6 MB | 4.8 ms | 8.5 ms | 1.8x | 3.1x |
-| Catalog (data) | 17 MB | 57 ms | 86 ms | 1.5x | 2.7x |
-| PubMed (doc) | 1.7 MB | 4.1 ms | 6.3 ms | 1.5x | 3.3x |
-| PubMed (doc) | 17 MB | 46 ms | 64 ms | 1.4x | 3.1x |
-| POM (config) | 2.1 MB | 4.8 ms | 8.6 ms | 1.8x | 3.8x |
+| Catalog (data) | 1.6 MB | 2.7 ms | 8.1 ms | **3.0x** | **5.4x** |
+| Catalog (data) | 17 MB | 32 ms | 82 ms | **2.6x** | **4.7x** |
+| PubMed (doc) | 1.7 MB | 2.3 ms | 6.0 ms | **2.7x** | **5.9x** |
+| PubMed (doc) | 17 MB | 27 ms | 61 ms | **2.2x** | **5.0x** |
+| POM (config) | 2.1 MB | 2.7 ms | 8.3 ms | **3.1x** | **6.6x** |
 
-### XPath queries (returning Elements -- apples-to-apples)
+### XPath queries (returning Elements — apples-to-apples)
 
 | Query | Corpus | simdxml | lxml | vs lxml |
 |-------|--------|---------|------|---------|
-| `//item` | Catalog 17 MB | 4.0 ms | 22.5 ms | **5.6x** |
-| `//item[@category="cat5"]` | Catalog 17 MB | 1.7 ms | 72 ms | **41x** |
-| `//PubmedArticle` | PubMed 17 MB | 0.41 ms | 10.4 ms | **25x** |
-| `//Author[LastName="Auth0_0"]` | PubMed 17 MB | 17.6 ms | 30.7 ms | **1.7x** |
-| `//dependency` | POM 2.1 MB | 0.41 ms | 0.72 ms | 1.8x |
-| `//dependency[scope="test"]` | POM 2.1 MB | 2.5 ms | 3.5 ms | 1.4x |
-
-The predicate speedup on large documents is dramatic because the structural
-index enables direct attribute comparison without materializing DOM nodes.
+| `//item` | Catalog 17 MB | 3.4 ms | 21 ms | **6x** |
+| `//item[@category="cat5"]` | Catalog 17 MB | 1.6 ms | 69 ms | **42x** |
+| `//PubmedArticle` | PubMed 17 MB | 0.35 ms | 9.8 ms | **28x** |
+| `//Author[LastName="Auth0_0"]` | PubMed 17 MB | 13 ms | 29 ms | **2.2x** |
+| `//dependency` | POM 2.1 MB | 0.34 ms | 1.1 ms | **3.3x** |
+| `//dependency[scope="test"]` | POM 2.1 MB | 2.4 ms | 3.6 ms | **1.5x** |
 
 ### XPath text extraction
 
-`xpath_text()` returns strings directly, avoiding Python Element object
-creation. This is the optimized path for ETL / data extraction workloads.
+`xpath_text()` returns strings directly, avoiding Element object creation.
+This is the optimized path for ETL / data extraction workloads.
 
 | Query | Corpus | simdxml | lxml xpath+.text | vs lxml |
 |-------|--------|---------|------------------|---------|
-| `//name` | Catalog 17 MB | 3.1 ms | 42 ms | **14x** |
-| `//AbstractText` | PubMed 17 MB | 0.64 ms | 8.3 ms | **13x** |
-| `//artifactId` | POM 2.1 MB | 0.39 ms | 0.70 ms | 1.8x |
+| `//name` | Catalog 17 MB | 1.8 ms | 37 ms | **20x** |
+| `//AbstractText` | PubMed 17 MB | 0.31 ms | 7.1 ms | **23x** |
+| `//artifactId` | POM 2.1 MB | 0.21 ms | 2.0 ms | **10x** |
 
 ### Element traversal
 
-simdxml provides two traversal modes:
-
-**Batch API** (`child_tags()`, `descendant_tags()`): returns all tag names
-in a single FFI call using interned Python strings. This is the fast path.
-
-**Per-element iteration** (`for e in root`): creates flyweight Element
-objects. Each `.tag` access is a refcount bump on an interned string (no
-copy), but creating Element objects has unavoidable PyO3 overhead.
-
-| Corpus | `child_tags()` | `[e.tag]` loop | lxml loop | stdlib loop | batch vs lxml |
-|--------|----------------|----------------|-----------|-------------|---------------|
-| Catalog 17 MB | **0.45 ms** | 5.4 ms | 11.3 ms | 2.1 ms | **25x** |
-| PubMed 17 MB | **0.05 ms** | 0.53 ms | 0.62 ms | 0.16 ms | **13x** |
-| POM 2.1 MB | **0.2 us** | 0.5 us | 0.7 us | 0.3 us | **3x** |
+`child_tags()` and `descendant_tags()` return all tag names in a single
+call using interned Python strings. Per-element iteration (`for e in root`)
+is also available but creates Element objects with some overhead.
 
-Use `child_tags()` / `descendant_tags()` when you need tag names. Use
-`xpath_text()` when you need text. Reserve per-element iteration for when
-you need to navigate the tree interactively.
+| Corpus | `child_tags()` | lxml `[e.tag]` | vs lxml |
+|--------|----------------|-----------------|---------|
+| Catalog 17 MB | **0.38 ms** | 6.4 ms | **17x** |
+| PubMed 17 MB | **0.03 ms** | 0.60 ms | **17x** |
+| POM 2.1 MB | **0.2 us** | 0.5 us | **3x** |
 
 ## How it works
 

From e5470fa2f1b6a4ffc8d3b2f6bf06134fd58b25a9 Mon Sep 17 00:00:00 2001
From: Christopher Grainger <chris@amplified.ai>
Date: Fri, 27 Mar 2026 22:17:04 +1100
Subject: [PATCH 7/7] Switch to simdxml 0.2.0 from crates.io

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cargo.toml b/Cargo.toml
index f5cbfa6..53eb22d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,5 +9,5 @@ crate-type = ["cdylib"]
 
 [dependencies]
 pyo3 = { version = "0.28", features = ["extension-module", "abi3-py39"] }
-simdxml = { git = "https://github.com/simdxml/simdxml", branch = "python-bindings-api" }
+simdxml = "0.2"
 self_cell = "1"