diff --git a/Cargo.toml b/Cargo.toml
index 2d6b234..53eb22d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "simdxml-python"
-version = "0.1.0"
+version = "0.2.0"
 edition = "2021"
 
 [lib]
@@ -9,5 +9,5 @@ crate-type = ["cdylib"]
 
 [dependencies]
 pyo3 = { version = "0.28", features = ["extension-module", "abi3-py39"] }
-simdxml = "0.1"
+simdxml = "0.2"
 self_cell = "1"
diff --git a/README.md b/README.md
index cc347b3..c4359b2 100644
--- a/README.md
+++ b/README.md
@@ -62,6 +62,10 @@ elem.getprevious()                 # previous sibling or None
 elem.xpath(".//title")             # context-node evaluation
 elem.xpath_text("author")         # text extraction from context
 
+# Batch APIs (single FFI call, interned strings)
+root.child_tags()                  # -> list[str] of child tag names
+root.descendant_tags("item")       # -> list[str] filtered by tag
+
 # Compiled XPath (like re.compile)
 expr = simdxml.compile("//title")
 expr.eval_text(doc)                # -> list[str]
@@ -118,33 +122,59 @@ Full conformance with XPath 1.0:
 
 ## Benchmarks
 
-Measured on Apple Silicon (M-series), Python 3.14, comparing against
-lxml 6.0 and stdlib `xml.etree.ElementTree`. Run with `uv run python bench/bench_parse.py`.
+Measured on Apple Silicon with Python 3.14 and lxml 6.0. GC is disabled during
+timing; each figure is the median of 20 timed iterations after 3 warmup runs.
+Three corpus types: data-oriented (product catalog), document-oriented (PubMed
+abstracts), and config-oriented (Maven POM). Run it yourself: `uv run python bench/bench_parse.py`
+
+### Parse
+
+`simdxml.parse()` eagerly builds structural indices (CSR, name posting,
+parent map). lxml's `fromstring()` builds a DOM tree without precomputed
+query indices. simdxml front-loads more work into parse so that queries are
+faster. Both numbers are real; the trade-off depends on your workload.
+
+| Corpus | Size | simdxml | lxml | vs lxml | vs stdlib |
+|--------|------|---------|------|---------|-----------|
+| Catalog (data) | 1.6 MB | 2.7 ms | 8.1 ms | **3.0x** | **5.4x** |
+| Catalog (data) | 17 MB | 32 ms | 82 ms | **2.6x** | **4.7x** |
+| PubMed (doc) | 1.7 MB | 2.3 ms | 6.0 ms | **2.7x** | **5.9x** |
+| PubMed (doc) | 17 MB | 27 ms | 61 ms | **2.2x** | **5.0x** |
+| POM (config) | 2.1 MB | 2.7 ms | 8.3 ms | **3.1x** | **6.6x** |
+
+### XPath queries (returning Elements — apples-to-apples)
 
-### Parse throughput
+| Query | Corpus | simdxml | lxml | vs lxml |
+|-------|--------|---------|------|---------|
+| `//item` | Catalog 17 MB | 3.4 ms | 21 ms | **6x** |
+| `//item[@category="cat5"]` | Catalog 17 MB | 1.6 ms | 69 ms | **42x** |
+| `//PubmedArticle` | PubMed 17 MB | 0.35 ms | 9.8 ms | **28x** |
+| `//Author[LastName="Auth0_0"]` | PubMed 17 MB | 13 ms | 29 ms | **2.2x** |
+| `//dependency` | POM 2.1 MB | 0.34 ms | 1.1 ms | **3.3x** |
+| `//dependency[scope="test"]` | POM 2.1 MB | 2.4 ms | 3.6 ms | **1.5x** |
 
-| Document | simdxml | lxml | stdlib ET | vs lxml | vs stdlib |
-|----------|---------|------|-----------|---------|-----------|
-| 20 KB (100 items) | 0.05 ms | 0.09 ms | 0.15 ms | 1.8x | 3.0x |
-| 2 MB (10K items) | 3.3 ms | 8.5 ms | 16.7 ms | 2.6x | 5.0x |
-| 20 MB (100K items) | 40 ms | 87 ms | 181 ms | **2.2x** | **4.5x** |
+### XPath text extraction
 
-### XPath query: `//name`
+`xpath_text()` returns strings directly, avoiding Element object creation.
+This is the optimized path for ETL / data extraction workloads.
 
-| Document | simdxml | lxml | stdlib findall | vs lxml | vs stdlib |
-|----------|---------|------|----------------|---------|-----------|
-| 2 MB | 0.3 ms | 1.0 ms | 0.7 ms | 3.1x | 2.1x |
-| 20 MB | 3.8 ms | 19.7 ms | 7.3 ms | **5.2x** | **1.9x** |
+| Query | Corpus | simdxml | lxml xpath+.text | vs lxml |
+|-------|--------|---------|------------------|---------|
+| `//name` | Catalog 17 MB | 1.8 ms | 37 ms | **20x** |
+| `//AbstractText` | PubMed 17 MB | 0.31 ms | 7.1 ms | **23x** |
+| `//artifactId` | POM 2.1 MB | 0.21 ms | 2.0 ms | **10x** |
 
-### XPath query with predicate: `//item[@category="cat5"]`
+### Element traversal
 
-| Document | simdxml | lxml | stdlib findall | vs lxml |
-|----------|---------|------|----------------|---------|
-| 2 MB | 0.2 ms | 2.8 ms | 0.8 ms | 16x |
-| 20 MB | 2.0 ms | 46 ms | 9.1 ms | **23x** |
+`child_tags()` and `descendant_tags()` return all tag names in a single
+call using interned Python strings. Per-element iteration (`for e in root`)
+is also available but creates Element objects with some overhead.
 
-The predicate speedup is dramatic because simdxml's structural index enables
-direct attribute comparison without materializing DOM nodes.
+| Corpus | `child_tags()` | lxml `[e.tag]` | vs lxml |
+|--------|----------------|-----------------|---------|
+| Catalog 17 MB | **0.38 ms** | 6.4 ms | **17x** |
+| PubMed 17 MB | **0.03 ms** | 0.60 ms | **17x** |
+| POM 2.1 MB | **0.2 us** | 0.5 us | **3x** |
 
 ## How it works
 
@@ -157,7 +187,8 @@ and parents -- all indexed by the same position.
 - O(1) ancestor/descendant checks via pre/post-order numbering
 - O(1) child enumeration via CSR (Compressed Sparse Row) indices
 - SIMD-accelerated structural parsing (NEON on ARM, AVX2 on x86)
-- Lazy index building: CSR indices built on first query, not at parse time
+- Parse eagerly builds all indices (CSR, name posting, parent map) so
+  subsequent queries pay zero index construction cost
 
 ## Platform support
 
diff --git a/bench/bench_parse.py b/bench/bench_parse.py
index 48ca246..3eb532f 100644
--- a/bench/bench_parse.py
+++ b/bench/bench_parse.py
@@ -1,11 +1,27 @@
 """Benchmark: simdxml vs lxml vs stdlib xml.etree.ElementTree.
 
+Methodology:
+  - GC disabled during timing to avoid collection noise
+  - 3 warmup iterations discarded, then 20 timed iterations
+  - Reports median (robust to outliers from page faults, scheduling)
+  - XPath benchmarks compare like-for-like: elements vs elements, text vs text
+  - Both synthetic and real-world-shaped corpora
+
+Note: simdxml.parse() eagerly builds structural indices (CSR, name
+posting, parent map). lxml.fromstring() builds a DOM tree without
+precomputed indices. This means simdxml front-loads more work into
+parse, then queries are faster. Both numbers are real -- the question
+is which workload you have.
+
 Usage:
     uv run python bench/bench_parse.py
 """
 
 from __future__ import annotations
 
+import gc
+import random
+import sys
 import time
 import xml.etree.ElementTree as StdET
 
@@ -19,133 +35,313 @@
     HAS_LXML = False
 
 
-def generate_xml(n_items: int) -> bytes:
-    """Generate a catalog XML with n_items."""
+# ---------------------------------------------------------------------------
+# Corpus generators
+# ---------------------------------------------------------------------------
+
+
+def gen_catalog(n: int) -> bytes:
+    """Data-oriented: uniform structure, many attributes."""
     items = "\n".join(
-        f'  <item id="{i}" category="cat{i % 10}">'
+        "  "
+        f'<item id="{i}" category="cat{i % 10}">'
         f"<name>Item {i}</name>"
-        f"<description>Description for item {i} with some text content</description>"
+        f"<description>Desc for item {i}</description>"
         f"<price>{i * 1.5:.2f}</price>"
-        f"<tags><tag>tag{i % 5}</tag><tag>tag{i % 3}</tag></tags>"
+        f"<tags><tag>t{i % 5}</tag><tag>t{i % 3}</tag></tags>"
         f"</item>"
-        for i in range(n_items)
+        for i in range(n)
     )
     return f"<catalog>\n{items}\n</catalog>".encode()
 
 
-def bench(label: str, fn, iterations: int = 10) -> float:
-    """Run fn `iterations` times, return median time in ms."""
-    times = []
-    for _ in range(iterations):
-        start = time.perf_counter()
+def gen_pubmed(n: int) -> bytes:
+    """Document-oriented corpus of n PubMed-style articles; seeded, so reproducible."""
+    rng = random.Random(42)  # fixed seed; output depends on the order of the draws below
+    articles = []
+    for i in range(n):
+        n_auth = rng.randint(1, 8)  # 1-8 authors per article
+        auths = "\n".join(
+            "        <Author>"
+            f"<LastName>Auth{j}_{i}</LastName>"
+            f"<ForeName>F{j}</ForeName>"
+            f"<Affiliation>Univ {rng.randint(1, 20)}"
+            "</Affiliation>"
+            "</Author>"
+            for j in range(n_auth)
+        )
+        n_mesh = rng.randint(2, 12)  # 2-12 MeSH headings per article
+        mesh = "\n".join(
+            "        <MeshHeading>"
+            f'<DescriptorName UI="D{rng.randint(100000, 999999)}">'
+            f"Term{k}_{i}</DescriptorName>"
+            "</MeshHeading>"
+            for k in range(n_mesh)
+        )
+        kind = "randomized" if i % 2 else "retrospective"  # alternates by article parity
+        sents = " ".join(  # abstract body: 3-8 short sentences
+            f"Sentence {s} about topic {i}." for s in range(rng.randint(3, 8))
+        )
+        issn = f"{rng.randint(1000, 9999)}-{rng.randint(1000, 9999)}"
+        articles.append(
+            "  <PubmedArticle>\n"
+            '    <MedlineCitation Status="MEDLINE">\n'
+            f"      <PMID>{10000000 + i}</PMID>\n"
+            "      <Article>\n"
+            "        <Journal>"
+            f'<ISSN IssnType="Print">{issn}</ISSN>'
+            f"<Title>J Example {i % 50}</Title>"
+            "</Journal>\n"
+            f"        <ArticleTitle>Topic {i}: "
+            f"a {kind} study</ArticleTitle>\n"
+            "        <Abstract>"
+            f"<AbstractText>{sents}</AbstractText>"
+            "</Abstract>\n"
+            f"        <AuthorList>\n{auths}\n"
+            "        </AuthorList>\n"
+            "        <Language>eng</Language>\n"
+            "      </Article>\n"
+            f"      <MeshHeadingList>\n{mesh}\n"
+            "      </MeshHeadingList>\n"
+            "    </MedlineCitation>\n"
+            "  </PubmedArticle>"
+        )
+    body = "\n".join(articles)
+    return f"<PubmedArticleSet>\n{body}\n</PubmedArticleSet>".encode()  # UTF-8 by default
+
+
+def gen_pom(n: int) -> bytes:
+    """Config-oriented: POM-style <project> with n nested <dependency> entries."""
+    deps = "\n".join(
+        "      <dependency>\n"
+        f"        <groupId>com.example.g{i % 20}</groupId>\n"
+        f"        <artifactId>art-{i}</artifactId>\n"
+        f"        <version>{i % 5}.{i % 10}.{i % 3}</version>\n"
+        "        <scope>"  # scope rotates compile / test / runtime by i % 3
+        + ("compile" if i % 3 == 0 else "test" if i % 3 == 1 else "runtime")
+        + "</scope>\n"
+        + (
+            "        <exclusions>\n"
+            f"          <exclusion>"
+            f"<groupId>com.ex.{i}</groupId>"
+            f"<artifactId>bad-{i}</artifactId>"
+            "</exclusion>\n"
+            "        </exclusions>\n"
+            if i % 4 == 0  # every 4th dependency carries an <exclusions> block
+            else ""
+        )
+        + "      </dependency>"
+        for i in range(n)
+    )
+    return (
+        "<project>\n"  # plain root element: no xmlns declaration is emitted
+        "  <modelVersion>4.0.0</modelVersion>\n"
+        "  <groupId>com.example</groupId>\n"
+        "  <artifactId>benchmark</artifactId>\n"
+        "  <version>1.0.0</version>\n"
+        f"  <dependencies>\n{deps}\n  </dependencies>\n"
+        "</project>"
+    ).encode()
+
+
+# ---------------------------------------------------------------------------
+# Bench harness
+# ---------------------------------------------------------------------------
+
+WARMUP = 3
+ITERATIONS = 20
+
+
+def bench(fn) -> float:
+    """Run fn WARMUP times untimed, then ITERATIONS times with GC off; return median ms."""
+    for _ in range(WARMUP):
         fn()
-        elapsed = (time.perf_counter() - start) * 1000
-        times.append(elapsed)
+
+    gc.disable()
+    try:
+        times = []
+        for _ in range(ITERATIONS):
+            t0 = time.perf_counter()
+            fn()
+            times.append((time.perf_counter() - t0) * 1000)
+    finally:
+        gc.enable()
+
     times.sort()
-    median = times[len(times) // 2]
-    return median
+    return (times[(len(times) - 1) // 2] + times[len(times) // 2]) / 2  # even count: mean of middle pair
 
 
-def print_row(label: str, time_ms: float, baseline_ms: float | None = None) -> None:
-    speedup = ""
-    if baseline_ms is not None and time_ms > 0:
-        ratio = baseline_ms / time_ms
-        lib = "lxml" if HAS_LXML else "stdlib"
-        speedup = f"  ({ratio:.1f}x vs {lib})"
-    print(f"  {label:<30s} {time_ms:8.2f} ms{speedup}")
+def fmt(ms: float) -> str:
+    """Format a duration given in ms: microseconds below 0.01 ms, else milliseconds."""
+    if ms < 0.01:
+        return f"{ms * 1000:6.1f} us"
+    precision = 2 if ms < 1 else 1  # two decimals for sub-millisecond values
+    return f"{ms:6.{precision}f} ms"
 
 
-def run_benchmarks(xml: bytes, label: str) -> None:
-    size_mb = len(xml) / (1024 * 1024)
-    print(f"\n{'=' * 60}")
-    print(f"  {label} ({size_mb:.1f} MB, {len(xml):,} bytes)")
-    print(f"{'=' * 60}")
+def ratio_str(a: float, b: float) -> str:
+    """Colored "Nx faster/slower" label for time a vs. time b; empty if either is <= 0."""
+    if a <= 0 or b <= 0:  # a == 0 previously raised ZeroDivisionError in b / a
+        return ""
+    r = b / a
+    return (f" \033[32m{r:.1f}x faster\033[0m" if r >= 1  # green: a took no longer than b
+            else f" \033[31m{1 / r:.1f}x slower\033[0m")  # red: a took longer than b
+
+
+# ---------------------------------------------------------------------------
+# Benchmark suites
+# ---------------------------------------------------------------------------
+
 
-    # --- Parse ---
-    print("\n  Parse:")
-    simdxml_parse = bench("simdxml", lambda: simdxml.parse(xml))
-    print_row("simdxml.parse()", simdxml_parse)
+def bench_parse(xml: bytes, label: str) -> None:
+    print(f"\n  \033[1mParse\033[0m  ({label})")
+    print("  Note: simdxml.parse() includes index construction (CSR + name posting)")
+
+    t_simd = bench(lambda: simdxml.parse(xml))
+    print(f"    simdxml.parse()         {fmt(t_simd)}")
 
     if HAS_LXML:
-        lxml_parse = bench("lxml", lambda: lxml_etree.fromstring(xml))
-        print_row("lxml.etree.fromstring()", lxml_parse)
+        t_lxml = bench(lambda: lxml_etree.fromstring(xml))
+        print(f"    lxml.fromstring()       {fmt(t_lxml)}{ratio_str(t_simd, t_lxml)}")
+
+    t_std = bench(lambda: StdET.fromstring(xml))
+    print(f"    ET.fromstring()         {fmt(t_std)}{ratio_str(t_simd, t_std)}")
 
-    std_parse = bench("stdlib", lambda: StdET.fromstring(xml))
-    print_row("ET.fromstring()", std_parse)
 
-    baseline = lxml_parse if HAS_LXML else std_parse
-    lib = "lxml" if HAS_LXML else "stdlib"
-    print(f"\n  Parse speedup: {baseline / simdxml_parse:.1f}x vs {lib}")
+def bench_xpath_elements(xml: bytes, expr: str, label: str) -> None:
+    """XPath returning Element objects -- fair comparison."""
+    print(f"\n  \033[1mXPath -> Elements\033[0m  {expr}  ({label})")
 
-    # --- XPath: //name (simple descendant) ---
-    print("\n  XPath: //name")
     doc = simdxml.parse(xml)
-    compiled = simdxml.compile("//name")
+    t_simd = bench(lambda: doc.xpath(expr))
+    n_results = len(doc.xpath(expr))
+    print(f"    simdxml doc.xpath()     {fmt(t_simd)}  ({n_results} results)")
 
-    simdxml_xpath = bench("simdxml.xpath_text", lambda: doc.xpath_text("//name"))
-    print_row("doc.xpath_text()", simdxml_xpath)
+    if HAS_LXML:
+        lroot = lxml_etree.fromstring(xml)
+        t_lxml = bench(lambda: lroot.xpath(expr))
+        print(f"    lxml root.xpath()       {fmt(t_lxml)}{ratio_str(t_simd, t_lxml)}")
+
+    # stdlib findall -- skip complex expressions
+    if not any(c in expr for c in ("()", "::", "|")):
+        std_expr = expr
+        if not expr.startswith("."):
+            std_expr = "." + expr if expr.startswith("/") else "./" + expr
+        sroot = StdET.fromstring(xml)
+        try:
+            t_std = bench(lambda: sroot.findall(std_expr))
+            print(f"    ET.findall()            {fmt(t_std)}{ratio_str(t_simd, t_std)}")
+        except SyntaxError:
+            pass
+
+
+def bench_xpath_text(xml: bytes, expr: str, label: str) -> None:
+    """XPath returning text -- simdxml's optimized path."""
+    print(f"\n  \033[1mXPath -> Text\033[0m  {expr}  ({label})")
+
+    doc = simdxml.parse(xml)
+    compiled = simdxml.compile(expr)
 
-    simdxml_compiled = bench("simdxml.compiled", lambda: compiled.eval_text(doc))
-    print_row("compiled.eval_text()", simdxml_compiled)
+    t_inline = bench(lambda: doc.xpath_text(expr))
+    t_compiled = bench(lambda: compiled.eval_text(doc))
+    n = len(doc.xpath_text(expr))
+    print(f"    simdxml xpath_text()    {fmt(t_inline)}  ({n} results)")
+    print(f"    simdxml compiled        {fmt(t_compiled)}")
 
     if HAS_LXML:
-        lxml_root = lxml_etree.fromstring(xml)
-        lxml_xpath = bench("lxml.xpath", lambda: lxml_root.xpath("//name"))
-        print_row("lxml_root.xpath()", lxml_xpath)
-        baseline_xpath = lxml_xpath
-    else:
-        baseline_xpath = None
+        lroot = lxml_etree.fromstring(xml)
+        t_lxml = bench(lambda: [e.text for e in lroot.xpath(expr)])
+        print(f"    lxml xpath+.text        {fmt(t_lxml)}{ratio_str(t_inline, t_lxml)}")
+
 
-    std_root = StdET.fromstring(xml)
-    std_findall = bench("stdlib.findall", lambda: std_root.findall(".//name"))
-    print_row("std_root.findall()", std_findall)
+def bench_traversal(xml: bytes, label: str) -> None:
+    """Element traversal: per-element loop vs batch API."""
+    print(f"\n  \033[1mTraversal\033[0m  ({label})")
 
-    if baseline_xpath:
-        print(f"\n  XPath speedup: {baseline_xpath / simdxml_xpath:.1f}x vs lxml")
+    doc = simdxml.parse(xml)
 
-    # --- XPath: predicate query ---
-    print('\n  XPath: //item[@category="cat5"]')
-    pred_expr = '//item[@category="cat5"]'
-    simdxml_pred = bench("simdxml", lambda: doc.xpath(pred_expr))
-    print_row("doc.xpath()", simdxml_pred)
+    # Batch API (single FFI call, interned strings)
+    t_batch = bench(lambda: doc.root.child_tags())
+    print(f"    simdxml child_tags()    {fmt(t_batch)}  [batch, 1 FFI call]")
+
+    # Per-element loop (N FFI calls, but tags are interned)
+    t_loop = bench(lambda: [e.tag for e in doc.root])
+    print(f"    simdxml [e.tag for e]   {fmt(t_loop)}  [per-element FFI]")
 
     if HAS_LXML:
-        lxml_pred = bench("lxml", lambda: lxml_root.xpath(pred_expr))
-        print_row("lxml_root.xpath()", lxml_pred)
+        lroot = lxml_etree.fromstring(xml)
+        t_lxml = bench(lambda: [e.tag for e in lroot])
+        print(f"    lxml [e.tag for e]      {fmt(t_lxml)}{ratio_str(t_batch, t_lxml)}")
 
-    std_pred = bench("stdlib", lambda: std_root.findall('.//item[@category="cat5"]'))
-    print_row("std_root.findall()", std_pred)
+    sroot = StdET.fromstring(xml)
+    t_std = bench(lambda: [e.tag for e in sroot])
+    print(f"    stdlib [e.tag for e]    {fmt(t_std)}{ratio_str(t_batch, t_std)}")
 
-    # --- Element traversal ---
-    print("\n  Traversal: iterate all children of root")
-    simdxml_iter = bench("simdxml", lambda: [e.tag for e in doc.root])
-    print_row("for e in doc.root", simdxml_iter)
 
-    if HAS_LXML:
-        lxml_iter = bench("lxml", lambda: [e.tag for e in lxml_root])
-        print_row("for e in lxml_root", lxml_iter)
+def run_corpus(xml: bytes, name: str) -> None:
+    size_mb = len(xml) / (1024 * 1024)
+    print(f"\n{'=' * 65}")
+    print(f"  {name}  ({size_mb:.1f} MB, {len(xml):,} bytes)")
+    print(f"{'=' * 65}")
 
-    std_iter = bench("stdlib", lambda: [e.tag for e in std_root])
-    print_row("for e in std_root", std_iter)
+    bench_parse(xml, name)
 
+    if b"<item " in xml:
+        bench_xpath_elements(xml, "//item", name)
+        bench_xpath_elements(xml, '//item[@category="cat5"]', name)
+        bench_xpath_text(xml, "//name", name)
+    elif b"<PubmedArticle>" in xml:
+        bench_xpath_elements(xml, "//PubmedArticle", name)
+        bench_xpath_elements(xml, '//Author[LastName="Auth0_0"]', name)
+        bench_xpath_text(xml, "//AbstractText", name)
+    elif b"<dependency>" in xml:
+        bench_xpath_elements(xml, "//dependency", name)
+        bench_xpath_elements(xml, '//dependency[scope="test"]', name)
+        bench_xpath_text(xml, "//artifactId", name)
 
-def main() -> None:
-    print("simdxml benchmark")
-    print(f"  lxml available: {HAS_LXML}")
-    if HAS_LXML:
-        print(f"  lxml version: {lxml_etree.LXML_VERSION}")
+    bench_traversal(xml, name)
 
-    # Small document
-    small = generate_xml(100)
-    run_benchmarks(small, "Small (100 items)")
 
-    # Medium document
-    medium = generate_xml(10_000)
-    run_benchmarks(medium, "Medium (10K items)")
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
 
-    # Large document
-    large = generate_xml(100_000)
-    run_benchmarks(large, "Large (100K items)")
+
+def main() -> None:  # entry point: print the environment header, then run every corpus
+    print("simdxml benchmark suite")
+    print(f"  Python {sys.version.split()[0]}")  # first token of sys.version only
+    print(f"  simdxml {simdxml.__version__}")
+    if HAS_LXML:
+        ver = ".".join(str(x) for x in lxml_etree.LXML_VERSION)  # dot-join the version components
+        print(f"  lxml {ver}")
+    else:
+        print("  lxml: not installed")
+    print(f"  Warmup: {WARMUP}, Timed: {ITERATIONS}, Metric: median")
+
+    run_corpus(  # two sizes per corpus shape; each corpus is generated right before its run
+        gen_catalog(10_000),
+        "Catalog 10K (data-oriented)",
+    )
+    run_corpus(
+        gen_catalog(100_000),
+        "Catalog 100K (data-oriented)",
+    )
+    run_corpus(
+        gen_pubmed(1_000),
+        "PubMed 1K (document-oriented)",
+    )
+    run_corpus(
+        gen_pubmed(10_000),
+        "PubMed 10K (document-oriented)",
+    )
+    run_corpus(
+        gen_pom(1_000),
+        "POM 1K (config-oriented)",
+    )
+    run_corpus(
+        gen_pom(10_000),
+        "POM 10K (config-oriented)",
+    )
 
 
 if __name__ == "__main__":
diff --git a/pyproject.toml b/pyproject.toml
index 4ab2cf3..7e936da 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "simdxml"
-version = "0.1.0"
+version = "0.2.0"
 description = "SIMD-accelerated XML parser with full XPath 1.0 support"
 readme = "README.md"
 authors = [
diff --git a/python/simdxml/__init__.py b/python/simdxml/__init__.py
index 049afc2..2b7f159 100644
--- a/python/simdxml/__init__.py
+++ b/python/simdxml/__init__.py
@@ -34,6 +34,7 @@
     CompiledXPath,
     Document,
     Element,
+    ElementList,
     compile,
     parse,
 )
@@ -42,8 +43,9 @@
     "CompiledXPath",
     "Document",
     "Element",
+    "ElementList",
     "compile",
     "parse",
 ]
 
-__version__ = "0.1.0"
+__version__ = "0.2.0"
diff --git a/python/simdxml/_core.pyi b/python/simdxml/_core.pyi
index 1961186..33ea939 100644
--- a/python/simdxml/_core.pyi
+++ b/python/simdxml/_core.pyi
@@ -1,59 +1,192 @@
 from collections.abc import Iterator
 
 class Document:
-    """A parsed XML document backed by a SIMD-accelerated structural index."""
+    """A parsed XML document.
+
+    Created by `parse()`. Use `root` to get the root element,
+    or query directly with `xpath_text()` and `xpath()`.
+    """
 
     @property
-    def root(self) -> Element | None: ...
+    def root(self) -> Element | None:
+        """The root element of the document, or None if empty."""
+        ...
     @property
-    def tag_count(self) -> int: ...
-    def xpath_text(self, expr: str) -> list[str]: ...
-    def xpath_string(self, expr: str) -> list[str]: ...
-    def xpath(self, expr: str) -> list[Element | str]: ...
+    def tag_count(self) -> int:
+        """Total number of XML tags in the document."""
+        ...
+    def xpath_text(self, expr: str) -> list[str]:
+        """Evaluate an XPath expression and return text content of matches.
+
+        Returns the direct child text of each matching element.
+        """
+        ...
+    def xpath_string(self, expr: str) -> list[str]:
+        """Evaluate an XPath expression and return string-values of matches.
+
+        Returns all descendant text for each match (XPath ``string()`` semantics).
+        """
+        ...
+    def xpath(self, expr: str) -> list[Element | str]:
+        """Evaluate an XPath expression.
+
+        Returns Element objects for element nodes, strings for text/attribute nodes.
+        """
+        ...
 
 class Element:
-    """A read-only element in a parsed XML document."""
+    """A read-only XML element.
+
+    Supports the ElementTree API (``.tag``, ``.text``, ``.attrib``, ``.get()``,
+    ``len()``, indexing, iteration) plus lxml extensions (``.xpath()``,
+    ``.getparent()``, ``.getnext()``, ``.getprevious()``).
+    """
 
     @property
-    def tag(self) -> str: ...
+    def tag(self) -> str:
+        """The element's tag name (e.g., ``'book'``, ``'title'``)."""
+        ...
     @property
-    def text(self) -> str | None: ...
+    def text(self) -> str | None:
+        """Text content before the first child element, or None.
+
+        For ``<p>Hello <b>world</b></p>``, ``p.text`` is ``'Hello '``.
+        """
+        ...
     @property
-    def tail(self) -> str | None: ...
+    def tail(self) -> str | None:
+        """Text content after this element's closing tag, or None.
+
+        For ``<p>Hello <b>world</b> more</p>``, ``b.tail`` is ``' more'``.
+        """
+        ...
     @property
-    def attrib(self) -> dict[str, str]: ...
-    def get(self, key: str, default: str | None = None) -> str | None: ...
-    def keys(self) -> list[str]: ...
-    def items(self) -> list[tuple[str, str]]: ...
-    def iter(self, tag: str | None = None) -> Iterator[Element]: ...
-    def itertext(self) -> list[str]: ...
-    def text_content(self) -> str: ...
-    def xpath(self, expr: str) -> list[Element]: ...
-    def xpath_text(self, expr: str) -> list[str]: ...
-    def getparent(self) -> Element | None: ...
-    def getnext(self) -> Element | None: ...
-    def getprevious(self) -> Element | None: ...
-    def tostring(self) -> str: ...
-    # Read-only enforcement: these raise TypeError
-    def set(self, key: str, value: str) -> None: ...
-    def append(self, element: Element) -> None: ...
-    def remove(self, element: Element) -> None: ...
-    def insert(self, index: int, element: Element) -> None: ...
-    def clear(self) -> None: ...
+    def attrib(self) -> dict[str, str]:
+        """Dictionary of this element's attributes."""
+        ...
+    def get(self, key: str, default: str | None = None) -> str | None:
+        """Get an attribute value by name, with optional default."""
+        ...
+    def keys(self) -> list[str]:
+        """List of attribute names."""
+        ...
+    def items(self) -> list[tuple[str, str]]:
+        """List of ``(name, value)`` attribute pairs."""
+        ...
+    def iter(self, tag: str | None = None) -> Iterator[Element]:
+        """Iterate over descendant elements, optionally filtered by tag name."""
+        ...
+    def child_tags(self) -> list[str]:
+        """All direct child tag names as a list.
+
+        More efficient than ``[e.tag for e in element]`` for bulk access.
+        """
+        ...
+    def descendant_tags(self, tag: str | None = None) -> list[str]:
+        """All descendant tag names, optionally filtered.
+
+        More efficient than ``[e.tag for e in element.iter(tag)]`` for bulk access.
+        """
+        ...
+    def itertext(self) -> list[str]:
+        """All text content within this element, depth-first."""
+        ...
+    def text_content(self) -> str:
+        """All descendant text concatenated into a single string."""
+        ...
+    def xpath(self, expr: str) -> ElementList:
+        """Evaluate an XPath 1.0 expression with this element as context.
+
+        Returns an ElementList of matching elements (lazy — created on access).
+        """
+        ...
+    def xpath_text(self, expr: str) -> list[str]:
+        """Evaluate an XPath expression and return text content of matches."""
+        ...
+    def getparent(self) -> Element | None:
+        """Parent element, or None if this is the root."""
+        ...
+    def getnext(self) -> Element | None:
+        """Next sibling element, or None if this is the last child."""
+        ...
+    def getprevious(self) -> Element | None:
+        """Previous sibling element, or None if this is the first child."""
+        ...
+    def tostring(self) -> str:
+        """Serialize this element to an XML string."""
+        ...
+    def set(self, key: str, value: str) -> None:
+        """Not supported. Raises TypeError (elements are read-only)."""
+        ...
+    def append(self, element: Element) -> None:
+        """Not supported. Raises TypeError (elements are read-only)."""
+        ...
+    def remove(self, element: Element) -> None:
+        """Not supported. Raises TypeError (elements are read-only)."""
+        ...
+    def insert(self, index: int, element: Element) -> None:
+        """Not supported. Raises TypeError (elements are read-only)."""
+        ...
+    def clear(self) -> None:
+        """Not supported. Raises TypeError (elements are read-only)."""
+        ...
+    def __len__(self) -> int:
+        """Number of direct child elements."""
+        ...
+    def __getitem__(self, index: int) -> Element:
+        """Get a child element by index. Supports negative indexing."""
+        ...
+    def __iter__(self) -> Iterator[Element]:
+        """Iterate over direct child elements."""
+        ...
+    def __bool__(self) -> bool: ...
+    def __eq__(self, other: object) -> bool: ...
+    def __hash__(self) -> int: ...
+
+class ElementList:
+    """A lazy sequence of elements from an XPath query.
+
+    Elements are created on demand when accessed by index or iteration.
+    Holds a single Document reference regardless of result size.
+    """
+
     def __len__(self) -> int: ...
     def __getitem__(self, index: int) -> Element: ...
     def __iter__(self) -> Iterator[Element]: ...
     def __bool__(self) -> bool: ...
-    def __eq__(self, other: object) -> bool: ...
-    def __hash__(self) -> int: ...
 
 class CompiledXPath:
-    """A compiled XPath expression for repeated evaluation."""
+    """A compiled XPath expression for repeated use.
+
+    Like ``re.compile()`` — parse the expression once, evaluate many times
+    across different documents.
+    """
+
+    def eval_text(self, doc: Document) -> list[str]:
+        """Evaluate and return text content of matching nodes."""
+        ...
+    def eval(self, doc: Document) -> ElementList:
+        """Evaluate and return matching elements as an ElementList (lazy)."""
+        ...
+    def eval_exists(self, doc: Document) -> bool:
+        """Check whether any nodes match the expression."""
+        ...
+    def eval_count(self, doc: Document) -> int:
+        """Count the number of matching nodes."""
+        ...
+
+def parse(data: bytes | str) -> Document:
+    """Parse XML into a Document.
+
+    Accepts ``bytes`` or ``str``. For bytes input, the buffer is used
+    directly (zero-copy). For str input, the string is encoded to UTF-8.
+    """
+    ...
 
-    def eval_text(self, doc: Document) -> list[str]: ...
-    def eval(self, doc: Document) -> list[Element]: ...
-    def eval_exists(self, doc: Document) -> bool: ...
-    def eval_count(self, doc: Document) -> int: ...
+def compile(expr: str) -> CompiledXPath:
+    """Compile an XPath expression for repeated use.
 
-def parse(data: bytes | str) -> Document: ...
-def compile(expr: str) -> CompiledXPath: ...
+    Like ``re.compile()`` — parse the expression once, evaluate many times
+    across different documents.
+    """
+    ...
diff --git a/python/simdxml/etree/ElementTree.py b/python/simdxml/etree/ElementTree.py
index 6f3f7a4..df9cede 100644
--- a/python/simdxml/etree/ElementTree.py
+++ b/python/simdxml/etree/ElementTree.py
@@ -167,6 +167,6 @@ def _findall(
     """Find all matching subelements."""
     xpath = _path_to_xpath(path)
     try:
-        return element.xpath(xpath)
+        return list(element.xpath(xpath))
     except ValueError:
         return []
diff --git a/src/lib.rs b/src/lib.rs
index 1f48a5c..a1ed1a6 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,85 +1,121 @@
 use pyo3::exceptions::{PyTypeError, PyValueError};
+use pyo3::pybacked::PyBackedBytes;
 use pyo3::prelude::*;
-use pyo3::types::PyBytes;
+use pyo3::types::{PyBytes, PyString};
 use self_cell::self_cell;
 use simdxml::xpath::XPathNode;
 use simdxml::XmlIndex;
 
 // ---------------------------------------------------------------------------
-// Self-referential Document: owns bytes + XmlIndex + derived data
+// Self-referential Document: owns bytes + XmlIndex
 // ---------------------------------------------------------------------------
 
-struct IndexWithMeta<'a> {
-    index: XmlIndex<'a>,
-    /// parent[i] = tag index of parent element. u32::MAX = root.
-    parents: Vec<u32>,
+/// Owner type: either zero-copy from Python bytes or owned from str input.
+enum DocumentOwner {
+    ZeroCopy(PyBackedBytes),
+    Owned(Vec<u8>),
+}
+
+impl std::ops::Deref for DocumentOwner {
+    type Target = [u8];
+    fn deref(&self) -> &[u8] {
+        match self {
+            DocumentOwner::ZeroCopy(b) => b,
+            DocumentOwner::Owned(v) => v,
+        }
+    }
 }
 
 self_cell!(
     struct DocumentInner {
-        owner: Vec<u8>,
+        owner: DocumentOwner,
         #[covariant]
-        dependent: IndexWithMeta,
+        dependent: XmlIndex,
     }
 );
 
-/// A parsed XML document backed by a SIMD-accelerated structural index.
+/// A parsed XML document.
+///
+/// Created by `parse()`. Use `root` to get the root element,
+/// or query directly with `xpath_text()` and `xpath()`.
 #[pyclass]
 struct Document {
     inner: DocumentInner,
+    /// Interned tag names: name_id -> Python str (created once at parse).
+    interned_names: Vec<Py<PyString>>,
 }
 
 impl Document {
     fn index(&self) -> &XmlIndex<'_> {
-        &self.inner.borrow_dependent().index
+        self.inner.borrow_dependent()
     }
 
-    fn parents(&self) -> &[u32] {
-        &self.inner.borrow_dependent().parents
+    /// Look up the interned tag name for `tag_idx` via the index's `name_ids` table.
+    fn interned_tag(&self, py: Python<'_>, index: &XmlIndex<'_>, tag_idx: usize) -> Py<PyString> {
+        if tag_idx < index.name_ids.len() {
+            let name_id = index.name_ids[tag_idx];
+            if (name_id as usize) < self.interned_names.len() && name_id != u16::MAX {
+                return self.interned_names[name_id as usize].clone_ref(py);
+            }
+        }
+        // Fallback for tags without interned names (comments, PIs, etc.)
+        PyString::new(py, index.tag_name(tag_idx)).unbind()
     }
 
     fn make_element(py: Python<'_>, doc: &Py<Document>, tag_idx: usize) -> Element {
-        Element {
-            doc: doc.clone_ref(py),
-            tag_idx,
-        }
+        let doc_ref = doc.borrow(py);
+        Self::make_element_borrowed(py, doc, &doc_ref, tag_idx)
     }
 
-    fn make_elements(
+    fn make_element_borrowed(
         py: Python<'_>,
         doc: &Py<Document>,
-        tag_indices: impl Iterator<Item = usize>,
-    ) -> Vec<Element> {
-        tag_indices
-            .map(|idx| Element {
-                doc: doc.clone_ref(py),
-                tag_idx: idx,
-            })
-            .collect()
+        doc_ref: &Document,
+        tag_idx: usize,
+    ) -> Element {
+        let index = doc_ref.index();
+        let cached_tag = doc_ref.interned_tag(py, index, tag_idx);
+        Element {
+            doc: doc.clone_ref(py),
+            tag_idx,
+            cached_tag,
+        }
     }
 }
 
 #[pymethods]
 impl Document {
     /// Evaluate an XPath expression and return text content of matches.
-    fn xpath_text(&self, expr: &str) -> PyResult<Vec<String>> {
+    ///
+    /// Returns the direct child text of each matching element.
+    fn xpath_text(&self, py: Python<'_>, expr: &str) -> PyResult<Vec<Py<PyString>>> {
         let index = self.index();
         let results = index
             .xpath_text(expr)
             .map_err(|e| PyValueError::new_err(e.to_string()))?;
-        Ok(results.into_iter().map(|s| s.to_string()).collect())
+        Ok(results
+            .into_iter()
+            .map(|s| PyString::new(py, s).unbind())
+            .collect())
     }
 
-    /// Evaluate an XPath expression and return the XPath string-value of matches.
-    fn xpath_string(&self, expr: &str) -> PyResult<Vec<String>> {
+    /// Evaluate an XPath expression and return string-values of matches.
+    ///
+    /// Returns all descendant text for each match (XPath `string()` semantics).
+    fn xpath_string(&self, py: Python<'_>, expr: &str) -> PyResult<Vec<Py<PyString>>> {
         let index = self.index();
-        index
+        let results = index
             .xpath_string(expr)
-            .map_err(|e| PyValueError::new_err(e.to_string()))
+            .map_err(|e| PyValueError::new_err(e.to_string()))?;
+        Ok(results
+            .into_iter()
+            .map(|s| PyString::new(py, &s).unbind())
+            .collect())
     }
 
-    /// Evaluate an XPath expression. Returns Element list for node-sets,
-    /// strings for text/attribute nodes.
+    /// Evaluate an XPath expression.
+    ///
+    /// Returns Element objects for element nodes, strings for text/attribute nodes.
     fn xpath(slf: &Bound<'_, Self>, expr: &str) -> PyResult<Py<pyo3::types::PyList>> {
         let py = slf.py();
         let doc_py: Py<Document> = slf.clone().unbind();
@@ -93,7 +129,7 @@ impl Document {
         for node in &nodes {
             match node {
                 XPathNode::Element(idx) => {
-                    let elem = Document::make_element(py, &doc_py, *idx);
+                    let elem = Document::make_element_borrowed(py, &doc_py, &this, *idx);
                     result.append(elem.into_pyobject(py)?)?;
                 }
                 XPathNode::Text(idx) => {
@@ -101,8 +137,9 @@ impl Document {
                     result.append(text)?;
                 }
                 XPathNode::Attribute(tag_idx, _) => {
-                    if let Some(val) = get_first_attribute(index, *tag_idx) {
-                        result.append(val)?;
+                    let attrs = index.attributes(*tag_idx);
+                    if let Some((_, val)) = attrs.first() {
+                        result.append(*val)?;
                     }
                 }
                 XPathNode::Namespace(_, _) => {}
@@ -111,7 +148,7 @@ impl Document {
         Ok(result.unbind())
     }
 
-    /// The root element of the document.
+    /// The root element of the document, or None if empty.
     #[getter]
     fn root(slf: &Bound<'_, Self>) -> Option<Element> {
         let py = slf.py();
@@ -123,13 +160,13 @@ impl Document {
                 && (index.tag_type(i) == simdxml::index::TagType::Open
                     || index.tag_type(i) == simdxml::index::TagType::SelfClose)
             {
-                return Some(Document::make_element(py, &doc_py, i));
+                return Some(Document::make_element_borrowed(py, &doc_py, &this, i));
             }
         }
         None
     }
 
-    /// Number of tags in the structural index.
+    /// Total number of XML tags in the document.
     #[getter]
     fn tag_count(&self) -> usize {
         self.index().tag_count()
@@ -146,154 +183,141 @@ impl Document {
 }
 
 // ---------------------------------------------------------------------------
-// Element — lightweight flyweight handle into a Document
+// Element
 // ---------------------------------------------------------------------------
 
-/// A read-only element in a parsed XML document.
+/// A read-only XML element.
 ///
-/// Holds a Python reference to the Document (preventing GC) plus a tag index.
+/// Supports the ElementTree API (.tag, .text, .attrib, .get(), len(),
+/// indexing, iteration) plus lxml extensions (.xpath(), .getparent(),
+/// .getnext(), .getprevious()).
 #[pyclass(skip_from_py_object)]
 struct Element {
-    /// Python-ref-counted handle to the owning Document.
     doc: Py<Document>,
     tag_idx: usize,
-}
-
-impl Element {
-    fn with_index<'py, R>(&self, py: Python<'py>, f: impl FnOnce(&XmlIndex<'_>, &[u32]) -> R) -> R {
-        let doc = self.doc.borrow(py);
-        f(doc.index(), doc.parents())
-    }
+    cached_tag: Py<PyString>,
 }
 
 #[pymethods]
 impl Element {
-    /// The tag name.
+    /// The element's tag name (e.g., 'book', 'title').
     #[getter]
-    fn tag(&self, py: Python<'_>) -> String {
-        self.with_index(py, |index, _| index.tag_name(self.tag_idx).to_string())
+    fn tag(&self, py: Python<'_>) -> Py<PyString> {
+        self.cached_tag.clone_ref(py)
     }
 
-    /// Direct text content, or None.
+    /// Text content before the first child element, or None.
+    ///
+    /// For `<p>Hello <b>world</b></p>`, `p.text` is `'Hello '`.
     #[getter]
-    fn text(&self, py: Python<'_>) -> Option<String> {
-        self.with_index(py, |index, _| {
-            let texts = index.direct_text(self.tag_idx);
-            if texts.is_empty() {
-                return None;
-            }
-            let first = texts[0];
-            if first.is_empty() {
-                None
-            } else {
-                Some(XmlIndex::decode_entities(first).into_owned())
-            }
+    fn text(&self, py: Python<'_>) -> Option<Py<PyString>> {
+        let doc = self.doc.borrow(py);
+        let index = doc.index();
+        // Uses upstream direct_text_first — zero-alloc, no Vec
+        index.direct_text_first(self.tag_idx).map(|s| {
+            let decoded = XmlIndex::decode_entities(s);
+            PyString::new(py, &decoded).unbind()
         })
     }
 
-    /// Text after this element's closing tag (before next sibling).
+    /// Text content after this element's closing tag, or None.
+    ///
+    /// For `<p>Hello <b>world</b> more</p>`, `b.tail` is `' more'`.
     #[getter]
-    fn tail(&self, py: Python<'_>) -> Option<String> {
-        self.with_index(py, |index, parents| {
-            let parent = parents[self.tag_idx];
-            if parent == u32::MAX {
-                return None;
-            }
-
-            let parent_raw = index.raw_xml(parent as usize);
-            let my_raw = index.raw_xml(self.tag_idx);
-
-            if let Some(pos) = parent_raw.find(my_raw) {
-                let after = &parent_raw[pos + my_raw.len()..];
-                if let Some(lt) = after.find('<') {
-                    let text = &after[..lt];
-                    if !text.is_empty() {
-                        return Some(XmlIndex::decode_entities(text).into_owned());
-                    }
-                }
-            }
-            None
+    fn tail(&self, py: Python<'_>) -> Option<Py<PyString>> {
+        let doc = self.doc.borrow(py);
+        let index = doc.index();
+        // Uses upstream tail_text — proper implementation using text_ranges
+        index.tail_text(self.tag_idx).map(|s| {
+            let decoded = XmlIndex::decode_entities(s);
+            PyString::new(py, &decoded).unbind()
         })
     }
 
-    /// Dictionary of attributes.
+    /// Dictionary of this element's attributes.
     #[getter]
     fn attrib(&self, py: Python<'_>) -> PyResult<Py<pyo3::types::PyDict>> {
         let doc = self.doc.borrow(py);
         let index = doc.index();
         let dict = pyo3::types::PyDict::new(py);
-        for name in index.get_all_attribute_names(self.tag_idx) {
-            if let Some(val) = index.get_attribute(self.tag_idx, name) {
-                dict.set_item(name, val)?;
-            }
+        // Single-pass attribute parsing via upstream attributes()
+        for (name, val) in index.attributes(self.tag_idx) {
+            dict.set_item(name, val)?;
         }
         Ok(dict.unbind())
     }
 
     /// Get an attribute value by name, with optional default.
     #[pyo3(signature = (key, default=None))]
-    fn get(&self, py: Python<'_>, key: &str, default: Option<&str>) -> Option<String> {
-        self.with_index(py, |index, _| {
-            index
-                .get_attribute(self.tag_idx, key)
-                .map(|s| s.to_string())
-                .or_else(|| default.map(|s| s.to_string()))
-        })
+    fn get(&self, py: Python<'_>, key: &str, default: Option<&str>) -> Option<Py<PyString>> {
+        let doc = self.doc.borrow(py);
+        let index = doc.index();
+        index
+            .get_attribute(self.tag_idx, key)
+            .map(|s| PyString::new(py, s).unbind())
+            .or_else(|| default.map(|s| PyString::new(py, s).unbind()))
     }
 
     /// Attribute names.
-    fn keys(&self, py: Python<'_>) -> Vec<String> {
-        self.with_index(py, |index, _| {
-            index
-                .get_all_attribute_names(self.tag_idx)
-                .into_iter()
-                .map(|s| s.to_string())
-                .collect()
-        })
+    fn keys(&self, py: Python<'_>) -> Vec<Py<PyString>> {
+        let doc = self.doc.borrow(py);
+        let index = doc.index();
+        index
+            .attributes(self.tag_idx)
+            .into_iter()
+            .map(|(name, _)| PyString::new(py, name).unbind())
+            .collect()
     }
 
     /// (name, value) attribute pairs.
-    fn items(&self, py: Python<'_>) -> Vec<(String, String)> {
-        self.with_index(py, |index, _| {
-            index
-                .get_all_attribute_names(self.tag_idx)
-                .into_iter()
-                .filter_map(|name| {
-                    index
-                        .get_attribute(self.tag_idx, name)
-                        .map(|val| (name.to_string(), val.to_string()))
-                })
-                .collect()
-        })
+    fn items(&self, py: Python<'_>) -> Vec<(Py<PyString>, Py<PyString>)> {
+        let doc = self.doc.borrow(py);
+        let index = doc.index();
+        index
+            .attributes(self.tag_idx)
+            .into_iter()
+            .map(|(name, val)| {
+                (
+                    PyString::new(py, name).unbind(),
+                    PyString::new(py, val).unbind(),
+                )
+            })
+            .collect()
     }
 
-    /// Number of direct child elements.
+    /// Number of direct child elements (zero allocation).
     fn __len__(&self, py: Python<'_>) -> usize {
-        self.with_index(py, |index, _| index.children(self.tag_idx).len())
+        let doc = self.doc.borrow(py);
+        doc.index().child_count(self.tag_idx)
     }
 
-    /// Get the i-th child element.
+    /// Get a child element by index. Supports negative indexing.
     fn __getitem__(&self, py: Python<'_>, index: isize) -> PyResult<Element> {
         let doc = self.doc.borrow(py);
-        let children = doc.index().children(self.tag_idx);
-        let len = children.len() as isize;
+        let idx = doc.index();
+        let len = idx.child_count(self.tag_idx) as isize;
         let i = if index < 0 { len + index } else { index };
         if i < 0 || i >= len {
             return Err(pyo3::exceptions::PyIndexError::new_err(
                 "element index out of range",
             ));
         }
-        Ok(Document::make_element(py, &self.doc, children[i as usize]))
+        let child = idx.child_at(self.tag_idx, i as usize).ok_or_else(|| {
+            pyo3::exceptions::PyIndexError::new_err("element index out of range")
+        })?;
+        Ok(Document::make_element_borrowed(py, &self.doc, &doc, child))
     }
 
     /// Iterate over direct child elements.
     fn __iter__(&self, py: Python<'_>) -> ElementIterator {
         let doc = self.doc.borrow(py);
-        ElementIterator {
-            doc: self.doc.clone_ref(py),
-            children: doc.index().children(self.tag_idx),
-            pos: 0,
-        }
+        let index = doc.index();
+        let children: Vec<usize> = index
+            .child_slice(self.tag_idx)
+            .iter()
+            .map(|&c| c as usize)
+            .collect();
+        ElementIterator::new(py, &self.doc, &doc, children)
     }
 
     /// Iterate descendant elements, optionally filtered by tag name.
@@ -314,47 +338,80 @@ impl Element {
                 }
             }
         }
-        ElementIterator {
-            doc: self.doc.clone_ref(py),
-            children: descendants,
-            pos: 0,
+        ElementIterator::new(py, &self.doc, &doc, descendants)
+    }
+
+    /// All direct child tag names as a list (single FFI call, interned).
+    fn child_tags(&self, py: Python<'_>) -> Vec<Py<PyString>> {
+        let doc = self.doc.borrow(py);
+        let index = doc.index();
+        index
+            .child_slice(self.tag_idx)
+            .iter()
+            .map(|&child| doc.interned_tag(py, index, child as usize))
+            .collect()
+    }
+
+    /// Tag names of all descendant elements, optionally restricted to those equal to `tag`.
+    #[pyo3(signature = (tag=None))]
+    fn descendant_tags(&self, py: Python<'_>, tag: Option<&str>) -> Vec<Py<PyString>> {
+        let doc = self.doc.borrow(py);
+        let index = doc.index();
+        let start = self.tag_idx;
+        let close = index.matching_close(start).unwrap_or(start);
+
+        let mut result = Vec::new();
+        for i in (start + 1)..=close {
+            let tt = index.tag_type(i);
+            if tt == simdxml::index::TagType::Open || tt == simdxml::index::TagType::SelfClose {
+                match tag {
+                    Some(filter) if index.tag_name(i) != filter => {}
+                    _ => result.push(doc.interned_tag(py, index, i)),
+                }
+            }
         }
+        result
     }
 
-    /// All text content (depth-first) as a list of strings.
-    fn itertext(&self, py: Python<'_>) -> Vec<String> {
+    /// All text content within this element, depth-first.
+    fn itertext(&self, py: Python<'_>) -> Vec<Py<PyString>> {
         let doc = self.doc.borrow(py);
         let index = doc.index();
         let mut texts = Vec::new();
-        collect_text(index, self.tag_idx, &mut texts);
+        collect_text_py(py, index, self.tag_idx, &mut texts);
         texts
     }
 
-    /// Concatenation of all descendant text.
-    fn text_content(&self, py: Python<'_>) -> String {
-        self.with_index(py, |index, _| index.all_text(self.tag_idx))
+    /// All descendant text concatenated into a single string.
+    fn text_content(&self, py: Python<'_>) -> Py<PyString> {
+        let doc = self.doc.borrow(py);
+        let text = doc.index().all_text(self.tag_idx);
+        PyString::new(py, &text).unbind()
     }
 
-    /// Evaluate full XPath 1.0 from this element as context node.
-    fn xpath(&self, py: Python<'_>, expr: &str) -> PyResult<Vec<Element>> {
+    /// Evaluate an XPath 1.0 expression with this element as context.
+    fn xpath(&self, py: Python<'_>, expr: &str) -> PyResult<ElementList> {
         let doc = self.doc.borrow(py);
         let index = doc.index();
         let nodes = index
             .xpath_from(expr, self.tag_idx)
             .map_err(|e| PyValueError::new_err(e.to_string()))?;
 
-        Ok(Document::make_elements(
-            py,
-            &self.doc,
-            nodes.into_iter().filter_map(|n| match n {
+        let indices: Vec<usize> = nodes
+            .into_iter()
+            .filter_map(|n| match n {
                 XPathNode::Element(idx) => Some(idx),
                 _ => None,
-            }),
-        ))
+            })
+            .collect();
+        Ok(ElementList {
+            doc: self.doc.clone_ref(py),
+            indices,
+        })
     }
 
-    /// XPath text extraction from this element as context.
-    fn xpath_text(&self, py: Python<'_>, expr: &str) -> PyResult<Vec<String>> {
+    /// Evaluate an XPath expression with this element as context and return text content of matches.
+    fn xpath_text(&self, py: Python<'_>, expr: &str) -> PyResult<Vec<Py<PyString>>> {
         let doc = self.doc.borrow(py);
         let index = doc.index();
         let results = index
@@ -365,17 +422,17 @@ impl Element {
         for node in &results {
             match node {
                 XPathNode::Element(idx) => {
-                    let dt = index.direct_text(*idx);
-                    if !dt.is_empty() {
-                        texts.push(dt.join(""));
+                    if let Some(first) = index.direct_text_first(*idx) {
+                        texts.push(PyString::new(py, first).unbind());
                     }
                 }
                 XPathNode::Text(idx) => {
-                    texts.push(index.text_by_index(*idx).to_string());
+                    texts.push(PyString::new(py, index.text_by_index(*idx)).unbind());
                 }
                 XPathNode::Attribute(tag_idx, _) => {
-                    if let Some(val) = get_first_attribute(index, *tag_idx) {
-                        texts.push(val);
+                    let attrs = index.attributes(*tag_idx);
+                    if let Some((_, val)) = attrs.first() {
+                        texts.push(PyString::new(py, val).unbind());
                     }
                 }
                 _ => {}
@@ -386,51 +443,44 @@ impl Element {
 
     /// Parent element, or None for root.
     fn getparent(&self, py: Python<'_>) -> Option<Element> {
-        self.with_index(py, |_, parents| {
-            let parent = parents[self.tag_idx];
-            if parent == u32::MAX {
-                None
-            } else {
-                Some(Document::make_element(py, &self.doc, parent as usize))
-            }
-        })
+        let doc = self.doc.borrow(py);
+        let index = doc.index();
+        // Uses upstream parent() directly
+        index
+            .parent(self.tag_idx)
+            .map(|p| Document::make_element_borrowed(py, &self.doc, &doc, p))
     }
 
     /// Next sibling element, or None.
     fn getnext(&self, py: Python<'_>) -> Option<Element> {
-        self.with_index(py, |index, parents| {
-            let parent = parents[self.tag_idx];
-            if parent == u32::MAX {
-                return None;
-            }
-            let siblings = index.children(parent as usize);
-            let pos = siblings.iter().position(|&s| s == self.tag_idx)?;
-            siblings
-                .get(pos + 1)
-                .map(|&idx| Document::make_element(py, &self.doc, idx))
-        })
+        let doc = self.doc.borrow(py);
+        let index = doc.index();
+        let pos = index.child_position(self.tag_idx)?;
+        let parent = index.parent(self.tag_idx)?;
+        index
+            .child_at(parent, pos + 1)
+            .map(|idx| Document::make_element_borrowed(py, &self.doc, &doc, idx))
     }
 
     /// Previous sibling element, or None.
     fn getprevious(&self, py: Python<'_>) -> Option<Element> {
-        self.with_index(py, |index, parents| {
-            let parent = parents[self.tag_idx];
-            if parent == u32::MAX {
-                return None;
-            }
-            let siblings = index.children(parent as usize);
-            let pos = siblings.iter().position(|&s| s == self.tag_idx)?;
-            if pos > 0 {
-                Some(Document::make_element(py, &self.doc, siblings[pos - 1]))
-            } else {
-                None
-            }
-        })
+        let doc = self.doc.borrow(py);
+        let index = doc.index();
+        let pos = index.child_position(self.tag_idx)?;
+        if pos == 0 {
+            return None;
+        }
+        let parent = index.parent(self.tag_idx)?;
+        index
+            .child_at(parent, pos - 1)
+            .map(|idx| Document::make_element_borrowed(py, &self.doc, &doc, idx))
     }
 
-    /// Raw XML for this element (opening through closing tag).
-    fn tostring(&self, py: Python<'_>) -> String {
-        self.with_index(py, |index, _| index.raw_xml(self.tag_idx).to_string())
+    /// Return this element's raw XML source (opening through closing tag) as a string.
+    fn tostring(&self, py: Python<'_>) -> Py<PyString> {
+        let doc = self.doc.borrow(py);
+        let raw = doc.index().raw_xml(self.tag_idx);
+        PyString::new(py, raw).unbind()
     }
 
     // -- Read-only enforcement --
@@ -450,35 +500,40 @@ impl Element {
         Err(readonly_error())
     }
 
+    /// Not supported. Raises TypeError (simdxml elements are read-only).
     #[pyo3(name = "set")]
     fn set_attr(&self, _key: &str, _value: &str) -> PyResult<()> {
         Err(readonly_error())
     }
 
+    /// Not supported. Raises TypeError (simdxml elements are read-only).
     fn append(&self, _element: &Element) -> PyResult<()> {
         Err(readonly_error())
     }
 
+    /// Not supported. Raises TypeError (simdxml elements are read-only).
     fn remove(&self, _element: &Element) -> PyResult<()> {
         Err(readonly_error())
     }
 
+    /// Not supported. Raises TypeError (simdxml elements are read-only).
     #[pyo3(signature = (_index, _element))]
     fn insert(&self, _index: isize, _element: &Element) -> PyResult<()> {
         Err(readonly_error())
     }
 
+    /// Not supported. Raises TypeError (simdxml elements are read-only).
     fn clear(&self) -> PyResult<()> {
         Err(readonly_error())
     }
 
     fn __repr__(&self, py: Python<'_>) -> String {
-        let tag = self.with_index(py, |index, _| index.tag_name(self.tag_idx).to_string());
-        format!("Element('{tag}')")
+        let tag_str = self.cached_tag.bind(py).to_cow().unwrap_or_default();
+        format!("Element('{tag_str}')")
     }
 
-    fn __str__(&self, py: Python<'_>) -> String {
-        self.with_index(py, |index, _| index.tag_name(self.tag_idx).to_string())
+    fn __str__(&self, py: Python<'_>) -> Py<PyString> {
+        self.cached_tag.clone_ref(py)
     }
 
     fn __bool__(&self) -> bool {
@@ -499,16 +554,34 @@ impl Element {
 }
 
 // ---------------------------------------------------------------------------
-// Element iterator
+// ElementIterator — pre-caches interned tags so __next__ needs no Document borrow
 // ---------------------------------------------------------------------------
 
 #[pyclass]
 struct ElementIterator {
     doc: Py<Document>,
-    children: Vec<usize>,
+    items: Vec<(usize, Py<PyString>)>,
     pos: usize,
 }
 
+impl ElementIterator {
+    fn new(py: Python<'_>, doc: &Py<Document>, doc_ref: &Document, indices: Vec<usize>) -> Self {
+        let index = doc_ref.index();
+        let items: Vec<(usize, Py<PyString>)> = indices
+            .into_iter()
+            .map(|idx| {
+                let tag = doc_ref.interned_tag(py, index, idx);
+                (idx, tag)
+            })
+            .collect();
+        ElementIterator {
+            doc: doc.clone_ref(py),
+            items,
+            pos: 0,
+        }
+    }
+}
+
 #[pymethods]
 impl ElementIterator {
     fn __iter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> {
@@ -516,21 +589,103 @@ impl ElementIterator {
     }
 
     fn __next__(&mut self, py: Python<'_>) -> Option<Element> {
-        if self.pos < self.children.len() {
-            let idx = self.children[self.pos];
+        if self.pos < self.items.len() {
+            let (idx, ref cached_tag) = self.items[self.pos];
             self.pos += 1;
-            Some(Document::make_element(py, &self.doc, idx))
+            Some(Element {
+                doc: self.doc.clone_ref(py),
+                tag_idx: idx,
+                cached_tag: cached_tag.clone_ref(py),
+            })
         } else {
             None
         }
     }
+
+    fn __len__(&self) -> usize {
+        self.items.len() - self.pos
+    }
+}
+
+// ---------------------------------------------------------------------------
+// ElementList — lazy sequence returned by Element.xpath() and CompiledXPath.eval()
+// ---------------------------------------------------------------------------
+
+/// A lazy list of elements. Holds one Document reference and a Vec of tag
+/// indices. Element objects are created on demand when accessed.
+#[pyclass(sequence)]
+struct ElementList {
+    doc: Py<Document>,
+    indices: Vec<usize>,
+}
+
+#[pymethods]
+impl ElementList {
+    fn __len__(&self) -> usize {
+        self.indices.len()
+    }
+
+    fn __getitem__(&self, py: Python<'_>, index: isize) -> PyResult<Element> {
+        let len = self.indices.len() as isize;
+        let i = if index < 0 { len + index } else { index };
+        if i < 0 || i >= len {
+            return Err(pyo3::exceptions::PyIndexError::new_err(
+                "list index out of range",
+            ));
+        }
+        Ok(Document::make_element(
+            py,
+            &self.doc,
+            self.indices[i as usize],
+        ))
+    }
+
+    fn __iter__(&self, py: Python<'_>) -> ElementIterator {
+        let doc_ref = self.doc.borrow(py);
+        ElementIterator::new(py, &self.doc, &doc_ref, self.indices.clone())
+    }
+
+    fn __bool__(&self) -> bool {
+        !self.indices.is_empty()
+    }
+
+    fn __eq__(&self, _py: Python<'_>, other: &Bound<'_, pyo3::PyAny>) -> bool {
+        if let Ok(list) = other.cast::<pyo3::types::PyList>() {
+            if list.len() != self.indices.len() {
+                return false;
+            }
+            for (i, item) in list.iter().enumerate() {
+                if let Ok(elem) = item.cast::<Element>() {
+                    let elem_ref = elem.borrow();
+                    if elem_ref.tag_idx != self.indices[i] || !elem_ref.doc.is(&self.doc) {
+                        return false;
+                    }
+                } else {
+                    return false;
+                }
+            }
+            return true;
+        }
+        if let Ok(other_list) = other.cast::<ElementList>() {
+            let other_ref = other_list.borrow();
+            return self.doc.is(&other_ref.doc) && self.indices == other_ref.indices;
+        }
+        false
+    }
+
+    fn __repr__(&self) -> String {
+        format!("ElementList(len={})", self.indices.len())
+    }
 }
 
 // ---------------------------------------------------------------------------
 // CompiledXPath
 // ---------------------------------------------------------------------------
 
-/// A compiled XPath expression for repeated evaluation.
+/// A compiled XPath expression for repeated use.
+///
+/// Like `re.compile()` -- parse the expression once, evaluate many times
+/// across different documents.
 #[pyclass]
 struct CompiledXPath {
     inner: simdxml::CompiledXPath,
@@ -538,18 +693,21 @@ struct CompiledXPath {
 
 #[pymethods]
 impl CompiledXPath {
-    /// Evaluate and return text content of matches.
-    fn eval_text(&self, doc: &Document) -> PyResult<Vec<String>> {
+    /// Evaluate and return text content of matching nodes.
+    fn eval_text(&self, py: Python<'_>, doc: &Document) -> PyResult<Vec<Py<PyString>>> {
         let index = doc.index();
         let results = self
             .inner
             .eval_text(index)
             .map_err(|e| PyValueError::new_err(e.to_string()))?;
-        Ok(results.into_iter().map(|s| s.to_string()).collect())
+        Ok(results
+            .into_iter()
+            .map(|s| PyString::new(py, s).unbind())
+            .collect())
     }
 
-    /// Evaluate and return matching Element nodes.
-    fn eval(slf: &Bound<'_, Self>, doc: &Bound<'_, Document>) -> PyResult<Vec<Element>> {
+    /// Evaluate and return matching elements as an ElementList (lazy).
+    fn eval(slf: &Bound<'_, Self>, doc: &Bound<'_, Document>) -> PyResult<ElementList> {
         let this = slf.borrow();
         let doc_ref = doc.borrow();
         let doc_py: Py<Document> = doc.clone().unbind();
@@ -559,18 +717,20 @@ impl CompiledXPath {
             .eval(index)
             .map_err(|e| PyValueError::new_err(e.to_string()))?;
 
-        let py = slf.py();
-        Ok(Document::make_elements(
-            py,
-            &doc_py,
-            nodes.into_iter().filter_map(|n| match n {
+        let indices: Vec<usize> = nodes
+            .into_iter()
+            .filter_map(|n| match n {
                 XPathNode::Element(idx) => Some(idx),
                 _ => None,
-            }),
-        ))
+            })
+            .collect();
+        Ok(ElementList {
+            doc: doc_py,
+            indices,
+        })
     }
 
-    /// Check if any nodes match.
+    /// Check whether any nodes match.
     fn eval_exists(&self, doc: &Document) -> PyResult<bool> {
         let index = doc.index();
         let nodes = self
@@ -580,7 +740,7 @@ impl CompiledXPath {
         Ok(!nodes.is_empty())
     }
 
-    /// Count matching nodes.
+    /// Count the number of matching nodes.
     fn eval_count(&self, doc: &Document) -> PyResult<usize> {
         let index = doc.index();
         let nodes = self
@@ -605,40 +765,22 @@ fn readonly_error() -> PyErr {
     )
 }
 
-fn get_first_attribute(index: &XmlIndex<'_>, tag_idx: usize) -> Option<String> {
-    let names = index.get_all_attribute_names(tag_idx);
-    names
-        .first()
-        .and_then(|name| index.get_attribute(tag_idx, name))
-        .map(|s| s.to_string())
-}
-
-/// Build a parent map from the public children() API.
-fn build_parent_map(index: &XmlIndex<'_>) -> Vec<u32> {
-    let n = index.tag_count();
-    let mut parents = vec![u32::MAX; n];
-    for i in 0..n {
-        let tt = index.tag_type(i);
-        if tt == simdxml::index::TagType::Open {
-            for child in index.children(i) {
-                if child < n {
-                    parents[child] = i as u32;
-                }
-            }
-        }
-    }
-    parents
-}
-
-/// Recursively collect text content depth-first (for itertext).
-fn collect_text(index: &XmlIndex<'_>, tag_idx: usize, out: &mut Vec<String>) {
+/// Push non-empty direct text of `tag_idx` (entities decoded) to `out`, then recurse into children.
+fn collect_text_py(
+    py: Python<'_>,
+    index: &XmlIndex<'_>,
+    tag_idx: usize,
+    out: &mut Vec<Py<PyString>>,
+) {
     for text in index.direct_text(tag_idx) {
         if !text.is_empty() {
-            out.push(XmlIndex::decode_entities(text).into_owned());
+            let decoded = XmlIndex::decode_entities(text);
+            out.push(PyString::new(py, &decoded).unbind());
         }
     }
-    for child in index.children(tag_idx) {
-        collect_text(index, child, out);
+    // Recurse over child_slice's borrowed indices (presumably allocation-free; confirm upstream)
+    for &child in index.child_slice(tag_idx) {
+        collect_text_py(py, index, child as usize, out);
     }
 }
 
@@ -646,30 +788,69 @@ fn collect_text(index: &XmlIndex<'_>, tag_idx: usize, out: &mut Vec<String>) {
 // Module-level functions
 // ---------------------------------------------------------------------------
 
-/// Parse XML bytes or string into a Document.
+/// Parse XML into a Document.
+///
+/// Accepts bytes (buffer used directly, zero-copy) or str (encoded to UTF-8).
+/// Raises TypeError for any other input type and ValueError if parsing fails.
 #[pyfunction]
-fn parse(data: &Bound<'_, PyAny>) -> PyResult<Document> {
-    let bytes: Vec<u8> = if let Ok(b) = data.cast_exact::<PyBytes>() {
-        b.as_bytes().to_vec()
+fn parse(py: Python<'_>, data: &Bound<'_, PyAny>) -> PyResult<Document> {
+    let owner = if data.is_instance_of::<PyBytes>() {
+        let backed: PyBackedBytes = data.extract()?;
+        DocumentOwner::ZeroCopy(backed)
     } else if let Ok(s) = data.extract::<String>() {
-        s.into_bytes()
+        DocumentOwner::Owned(s.into_bytes())
     } else {
         return Err(PyTypeError::new_err("parse() requires bytes or str"));
     };
 
-    let inner = DocumentInner::try_new(bytes, |owner| {
+    let inner = DocumentInner::try_new(owner, |owner| {
         let mut index =
             simdxml::parse(owner).map_err(|e| PyValueError::new_err(e.to_string()))?;
         index.ensure_indices();
         index.build_name_index();
-        let parents = build_parent_map(&index);
-        Ok::<_, PyErr>(IndexWithMeta { index, parents })
+        Ok::<_, PyErr>(index)
     })?;
 
-    Ok(Document { inner })
+    // Build interned Python strings from upstream's name_table.
+    // name_table[id] = (byte_offset, length) into input. Since input is
+    // private, we find one tag per name_id and use tag_name() on it.
+    let interned_names = {
+        let index = inner.borrow_dependent();
+        let n_names = index.name_table.len();
+        // Pre-fill every slot so the result always holds n_names entries,
+        // even when a name_id never shows up on a tag.
+        let mut names: Vec<Py<PyString>> = (0..n_names)
+            .map(|_| PyString::new(py, "").unbind())
+            .collect();
+        let mut found = vec![false; n_names];
+        // Unresolved-id counter: replaces a full rescan of `found` per tag.
+        let mut remaining = n_names;
+
+        // take() bounds the scan so a short name_ids cannot index out of range.
+        for (i, &nid) in index.name_ids.iter().enumerate().take(index.tag_count()) {
+            if remaining == 0 {
+                break; // All names found
+            }
+            let slot = nid as usize;
+            if nid != u16::MAX && slot < n_names && !found[slot] {
+                found[slot] = true;
+                remaining -= 1;
+                names[slot] = PyString::new(py, index.tag_name(i)).unbind();
+            }
+        }
+        names
+    };
+
+    Ok(Document {
+        inner,
+        interned_names,
+    })
 }
 
-/// Compile an XPath expression for repeated evaluation.
+/// Compile an XPath expression for repeated use.
+///
+/// Like `re.compile()` -- parse the expression once, evaluate many times
+/// across different documents.
 #[pyfunction]
 fn compile(expr: &str) -> PyResult<CompiledXPath> {
     let inner =
@@ -685,6 +866,7 @@ fn compile(expr: &str) -> PyResult<CompiledXPath> {
 fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::<Document>()?;
     m.add_class::<Element>()?;
+    m.add_class::<ElementList>()?;
     m.add_class::<CompiledXPath>()?;
     m.add_function(wrap_pyfunction!(parse, m)?)?;
     m.add_function(wrap_pyfunction!(compile, m)?)?;