From c9684a0ec2845f75bb59364bddecf89611ebdb0f Mon Sep 17 00:00:00 2001
From: "SAKAI, Kazuaki"
Date: Sun, 10 May 2026 03:28:11 +0900
Subject: [PATCH 1/9] feat(candidates): add `dictool candidates corpus` for
 Wikipedia mining
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Surface-first vocabulary mining from a Wikipedia jawiki dump. Streams
the bz2 directly, skips wikitext templates `{{...}}` and `<ref>` blocks,
filters to article namespace, extracts maximal kanji runs, and diffs
against the build dict's surface set. Outputs `wikipedia.tsv` with
`surface\tfreq` rows.

Reading-assignment is intentionally deferred — the user picks top-N gap
surfaces and looks up readings before promoting to `extras/<name>.tsv`,
mirroring the existing `mine`-then-promote-by-hand workflow.

Pilot run on jawiki-articles1 (80K articles, ~1.5GB raw text) finishes
in ~32s and yields 304K freq>=5 gap surfaces. Most are lattice-
composable (徳川家康, 室町時代, 令和元年 — Mozc handles via segment
composition) but real misses surface in the mix (e.g. 宇宙戦艦 → Mozc
top-1 returns 宇宙船感). Per-candidate verification via `lextool
explain` is still required before promotion.

deps: bzip2 0.4 (lex-cli only — same dev-tool scope as the existing zip
dep used by `candidates mine`).
Co-Authored-By: Claude Opus 4.7 (1M context) --- engine/Cargo.lock | 21 + engine/crates/lex-cli/Cargo.toml | 3 + engine/crates/lex-cli/src/bin/dictool.rs | 37 ++ engine/crates/lex-cli/src/candidates/mod.rs | 1 + .../lex-cli/src/candidates/wikipedia.rs | 449 ++++++++++++++++++ .../lex-cli/src/commands/candidates_ops.rs | 83 ++++ 6 files changed, 594 insertions(+) create mode 100644 engine/crates/lex-cli/src/candidates/wikipedia.rs diff --git a/engine/Cargo.lock b/engine/Cargo.lock index fb180df..65e2116 100644 --- a/engine/Cargo.lock +++ b/engine/Cargo.lock @@ -236,6 +236,26 @@ version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" +[[package]] +name = "bzip2" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8" +dependencies = [ + "bzip2-sys", + "libc", +] + +[[package]] +name = "bzip2-sys" +version = "0.1.13+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" +dependencies = [ + "cc", + "pkg-config", +] + [[package]] name = "camino" version = "1.2.2" @@ -1000,6 +1020,7 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" name = "lex-cli" version = "0.1.0" dependencies = [ + "bzip2", "clap", "lex-core", "serde", diff --git a/engine/crates/lex-cli/Cargo.toml b/engine/crates/lex-cli/Cargo.toml index 7adf3ef..bcec983 100644 --- a/engine/crates/lex-cli/Cargo.toml +++ b/engine/crates/lex-cli/Cargo.toml @@ -30,3 +30,6 @@ toml = { workspace = true } # Kept as a hard dep (not optional) since the CLI is a build/dev tool and # not shipped with the IME itself. zip = { version = "7", default-features = false, features = ["deflate"] } +# Used by `dictool candidates corpus` only — Wikipedia dumps ship as bz2. 
+# Streaming decompress so we never materialize the full ~14GB XML on disk. +bzip2 = "0.4" diff --git a/engine/crates/lex-cli/src/bin/dictool.rs b/engine/crates/lex-cli/src/bin/dictool.rs index a7acaa3..991a56a 100644 --- a/engine/crates/lex-cli/src/bin/dictool.rs +++ b/engine/crates/lex-cli/src/bin/dictool.rs @@ -255,6 +255,25 @@ enum CandidatesAction { #[arg(long)] clean: bool, }, + /// Mine kanji-run candidates from a Wikipedia XML dump (.xml or .xml.bz2). + /// + /// Writes `wikipedia.tsv` with `surface\tfreq` rows for surfaces NOT in + /// the build dict, sorted by frequency descending. Reading-assignment is + /// done by hand on the top-N rows before promoting to `extras/`. + Corpus { + /// Path to the Wikipedia dump (.xml or .xml.bz2). User-supplied — + /// download from https://dumps.wikimedia.org/jawiki/latest/ first. + dump: String, + /// Build dict to diff against. Default: engine/data/lexime.dict + #[arg(long)] + build_dict: Option, + /// Output dir. Default: engine/data/extras-candidates + #[arg(long)] + out_dir: Option, + /// Drop surfaces with frequency below this (default: 3). 
+ #[arg(long, default_value_t = 3)] + min_freq: u32, + }, } fn main() { @@ -342,6 +361,24 @@ fn main() { std::process::exit(1); } } + CandidatesAction::Corpus { + dump, + build_dict, + out_dir, + min_freq, + } => { + let out = out_dir + .map(std::path::PathBuf::from) + .unwrap_or_else(candidates_ops::default_out_dir); + let dict = build_dict + .map(std::path::PathBuf::from) + .unwrap_or_else(candidates_ops::default_build_dict); + let dump_path = std::path::PathBuf::from(dump); + if let Err(e) = candidates_ops::corpus(&dump_path, &dict, &out, min_freq) { + eprintln!("corpus: {e}"); + std::process::exit(1); + } + } }, Command::UserDict { file, action } => { let path_str = file.unwrap_or_else(user_dict_ops::default_user_dict_path); diff --git a/engine/crates/lex-cli/src/candidates/mod.rs b/engine/crates/lex-cli/src/candidates/mod.rs index acd6ef6..ca431f3 100644 --- a/engine/crates/lex-cli/src/candidates/mod.rs +++ b/engine/crates/lex-cli/src/candidates/mod.rs @@ -38,6 +38,7 @@ //! rarely useful for extras but kept for completeness. pub mod sudachi; +pub mod wikipedia; use std::fs; use std::io::{self, BufWriter, Write}; diff --git a/engine/crates/lex-cli/src/candidates/wikipedia.rs b/engine/crates/lex-cli/src/candidates/wikipedia.rs new file mode 100644 index 0000000..729e065 --- /dev/null +++ b/engine/crates/lex-cli/src/candidates/wikipedia.rs @@ -0,0 +1,449 @@ +//! Mine extras candidates from a Wikipedia XML dump. +//! +//! Lazy "surface-first" pipeline: +//! +//! 1. Stream-decompress the dump (`.xml.bz2` or `.xml`) line by line. +//! 2. Inside `...` regions, extract maximal kanji runs. +//! 3. Frequency-count surfaces (HashMap). +//! 4. (Caller) diff against the build dict's surface set; surviving surfaces +//! are real Mozc gaps. +//! +//! No morphological analysis here — that step happens later (only for the +//! diffed gap candidates), since reading assignment is the expensive part. +//! See `feedback_extras_promotion.md` for why this approach was chosen +//! 
over Sudachi/Wikidata seed sources.
+use std::collections::HashMap;
+use std::fs::File;
+use std::io::{BufRead, BufReader, Read};
+use std::path::Path;
+
+use bzip2::read::MultiBzDecoder;
+
+use super::CandidateError;
+
+/// Minimum kanji-run length to count. Single-char surfaces are dominated by
+/// fragments of compounds (e.g. の境内 → 境 + 内 fragments) and add noise.
+pub const MIN_SURFACE_CHARS: usize = 2;
+
+/// Maximum length to count. Long runs (>20 chars) tend to be wiki-markup
+/// artifacts (concatenated table cells, broken templates).
+pub const MAX_SURFACE_CHARS: usize = 20;
+
+/// Frequency floor when emitting candidates. count<3 is heavy long-tail
+/// noise — single article typos, OCR errors in references, etc.
+pub const DEFAULT_MIN_FREQ: u32 = 3;
+
+/// Stream-extract kanji-run frequencies from a Wikipedia dump.
+///
+/// `dump_path` may be `.xml.bz2` (decompressed on the fly) or already-
+/// decompressed `.xml`. Detection is by extension — explicit, no magic-byte
+/// guessing.
+pub fn extract_kanji_freqs(dump_path: &Path) -> Result<HashMap<String, u32>, CandidateError> {
+    let file = File::open(dump_path)?;
+    let reader: Box<dyn Read> = if dump_path.extension().and_then(|s| s.to_str()) == Some("bz2") {
+        // MultiBzDecoder handles concatenated bz2 streams (Wikipedia dumps
+        // are sometimes split into multiple bz2 blocks).
+        Box::new(MultiBzDecoder::new(file))
+    } else {
+        Box::new(file)
+    };
+    let buffered = BufReader::with_capacity(1 << 20, reader);
+
+    let mut freqs: HashMap<String, u32> = HashMap::new();
+    let mut in_text = false;
+    let mut buf = String::new();
+    let mut pages_seen: u64 = 0;
+    let mut pages_scanned: u64 = 0;
+    let mut bytes_seen: u64 = 0;
+    let mut last_progress = std::time::Instant::now();
+    // Wikitext template depth across line boundaries. Templates `{{...}}`
+    // contain field-name boilerplate (`乗車人員`, `駅構造`, `所属路線`...)
+    // that dominates top-frequency noise. Skip everything inside them.
+    // References `<ref>...</ref>` similarly contain citation strings.
+ let mut tmpl_depth: i32 = 0; + let mut in_ref = false; + // Per-page scratch state. arrives before in the dump format, + // so we know whether to scan this page's text by the time we see it. + // Default to article (true) so older dumps without an explicit tag + // still get scanned. + let mut current_page_is_article = true; + + for line_res in buffered.lines() { + let line = line_res?; + bytes_seen += line.len() as u64 + 1; + + // Reset at each boundary so a non-article page doesn't + // poison the next page when no explicit is provided. + if line.contains("") { + current_page_is_article = true; + } + // Parse NUM. Filter to ns=0 (main article namespace). + // Skips Wikipedia: / User: / File: / Template: / Category: pages + // whose template-arg names and file-upload logs dominate top- + // frequency noise. + if let Some(start) = line.find("") { + if let Some(end) = line[start..].find("") { + let raw = &line[start + 4..start + end]; + let ns: i32 = raw.trim().parse().unwrap_or(-1); + current_page_is_article = ns == 0; + } + } + + // Flip in_text on ``. We + // tolerate opening / closing on the same line. + let text_open = line.find(""); + let scan_slice: &str = match (in_text, text_open, text_close) { + (false, Some(o), Some(c)) if c > o => { + // Whole text on one line. + pages_seen += 1; + if current_page_is_article { + pages_scanned += 1; + } + let after_open = &line[o..]; + let body_start = after_open.find('>').map(|p| o + p + 1).unwrap_or(o); + &line[body_start..c] + } + (false, Some(o), None) => { + in_text = true; + pages_seen += 1; + if current_page_is_article { + pages_scanned += 1; + } + let after_open = &line[o..]; + let body_start = after_open.find('>').map(|p| o + p + 1).unwrap_or(o); + &line[body_start..] 
+ } + (true, _, Some(c)) => { + in_text = false; + &line[..c] + } + (true, _, None) => &line[..], + _ => "", + }; + + if !scan_slice.is_empty() && current_page_is_article { + scan_prose_kanji_runs( + scan_slice, + &mut buf, + &mut freqs, + &mut tmpl_depth, + &mut in_ref, + ); + } + + if last_progress.elapsed().as_secs() >= 10 { + eprintln!( + " ... {} pages ({} articles scanned), ~{} MB, {} surfaces", + pages_seen, + pages_scanned, + bytes_seen >> 20, + freqs.len() + ); + last_progress = std::time::Instant::now(); + } + } + + eprintln!( + "Done. {} pages, {} articles scanned, ~{} MB, {} unique surfaces", + pages_seen, + pages_scanned, + bytes_seen >> 20, + freqs.len() + ); + Ok(freqs) +} + +/// Wrapper that skips wikitext template (`{{...}}`) and `` blocks before +/// counting kanji runs. State is carried across calls so multi-line templates +/// stay closed. +/// +/// `tmpl_depth` increases on `{{`, decreases on `}}`. `in_ref` toggles on +/// ``. Outside-block characters are appended to a small local +/// buffer that's flushed to `scan_kanji_runs` when a block opens/closes or at +/// the end of the slice. +fn scan_prose_kanji_runs( + s: &str, + buf: &mut String, + freqs: &mut HashMap, + tmpl_depth: &mut i32, + in_ref: &mut bool, +) { + let bytes = s.as_bytes(); + let mut i = 0; + let mut prose_start = 0; // start of the current prose run (when not inside a block) + while i < bytes.len() { + // Inline match on 2-byte ASCII pairs and `` headers. + // Using as_bytes lets us peek without UTF-8 decoding overhead; + // kanji are multi-byte but we only branch on ASCII patterns. 
+ let in_block = *tmpl_depth > 0 || *in_ref; + let b = bytes[i]; + + if !in_block && b == b'{' && i + 1 < bytes.len() && bytes[i + 1] == b'{' { + // flush prose + if i > prose_start { + scan_kanji_runs(&s[prose_start..i], buf, freqs); + } + *tmpl_depth += 1; + i += 2; + prose_start = i; + continue; + } + if *tmpl_depth > 0 && b == b'{' && i + 1 < bytes.len() && bytes[i + 1] == b'{' { + *tmpl_depth += 1; + i += 2; + continue; + } + if *tmpl_depth > 0 && b == b'}' && i + 1 < bytes.len() && bytes[i + 1] == b'}' { + *tmpl_depth -= 1; + i += 2; + if *tmpl_depth == 0 { + prose_start = i; + } + continue; + } + if !in_block && b == b'<' && s[i..].starts_with("` is one shot; full `...` + // is multi-token. Cheaply check the next `>`. + if i > prose_start { + scan_kanji_runs(&s[prose_start..i], buf, freqs); + } + // Find end of opening tag. + if let Some(rel) = s[i..].find('>') { + let close = i + rel; + // Self-closing if char before `>` is `/`. + if close > 0 && bytes[close - 1] == b'/' { + i = close + 1; + prose_start = i; + continue; + } + *in_ref = true; + i = close + 1; + prose_start = i; + continue; + } else { + // Tag continues to next line; assume opening + *in_ref = true; + i = bytes.len(); + prose_start = i; + break; + } + } + if *in_ref && b == b'<' && s[i..].starts_with("") { + *in_ref = false; + i += 6; + prose_start = i; + continue; + } + + i += 1; + } + if !*in_ref && *tmpl_depth == 0 && prose_start < bytes.len() { + scan_kanji_runs(&s[prose_start..], buf, freqs); + } +} + +/// Scan one slice for maximal kanji runs and bump frequencies. +/// +/// `buf` is reused across calls so we don't reallocate per run. 
+fn scan_kanji_runs(s: &str, buf: &mut String, freqs: &mut HashMap) { + buf.clear(); + let mut char_count: usize = 0; + for ch in s.chars() { + if is_kanji(ch) { + buf.push(ch); + char_count += 1; + } else if !buf.is_empty() { + if (MIN_SURFACE_CHARS..=MAX_SURFACE_CHARS).contains(&char_count) { + // Avoid cloning the working buffer when the entry is fresh: + // entry().or_insert(buf.clone()) and entry-API patterns both + // require an owned key. Looking up first lets us only clone + // on insert, which is the cold path once vocab saturates. + if let Some(v) = freqs.get_mut(buf.as_str()) { + *v = v.saturating_add(1); + } else { + freqs.insert(buf.clone(), 1); + } + } + buf.clear(); + char_count = 0; + } + } + if !buf.is_empty() && (MIN_SURFACE_CHARS..=MAX_SURFACE_CHARS).contains(&char_count) { + if let Some(v) = freqs.get_mut(buf.as_str()) { + *v = v.saturating_add(1); + } else { + freqs.insert(buf.clone(), 1); + } + } +} + +/// CJK Unified Ideographs (U+4E00–U+9FFF) plus iteration mark 々 (U+3005). +/// Excludes Extension A/B (rare archaic chars dominate noise) and katakana +/// ヶ (typically a counter, not a content char). +fn is_kanji(ch: char) -> bool { + matches!(ch, '\u{4E00}'..='\u{9FFF}' | '\u{3005}') +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn scan_extracts_maximal_kanji_runs() { + let mut buf = String::new(); + let mut f = HashMap::new(); + scan_kanji_runs("これは日本語の文章です", &mut buf, &mut f); + // 日本語 (3 chars) and 文章 (2 chars) qualify. + // 「これは / の / です」are hiragana — skipped. 
+ assert_eq!(f.get("日本語"), Some(&1)); + assert_eq!(f.get("文章"), Some(&1)); + assert_eq!(f.len(), 2); + } + + #[test] + fn scan_drops_single_char_surfaces() { + let mut buf = String::new(); + let mut f = HashMap::new(); + // 「私」と「本」は 1 字 → MIN_SURFACE_CHARS=2 で skip + scan_kanji_runs("私の本", &mut buf, &mut f); + assert!(f.is_empty()); + } + + #[test] + fn scan_drops_oversized_runs() { + let mut buf = String::new(); + let mut f = HashMap::new(); + let huge: String = "亜".repeat(MAX_SURFACE_CHARS + 1); + scan_kanji_runs(&huge, &mut buf, &mut f); + assert!(f.is_empty()); + // Boundary: exactly MAX_SURFACE_CHARS should survive. + let edge: String = "亜".repeat(MAX_SURFACE_CHARS); + let mut f2 = HashMap::new(); + scan_kanji_runs(&edge, &mut buf, &mut f2); + assert_eq!(f2.get(edge.as_str()), Some(&1)); + } + + #[test] + fn scan_treats_iter_mark_as_kanji() { + let mut buf = String::new(); + let mut f = HashMap::new(); + // 「人々」は 々 を含む 2-char surface → keep + scan_kanji_runs("人々が集まる", &mut buf, &mut f); + assert_eq!(f.get("人々"), Some(&1)); + } + + #[test] + fn scan_accumulates_frequency() { + let mut buf = String::new(); + let mut f = HashMap::new(); + scan_kanji_runs("日本語と日本語と日本語", &mut buf, &mut f); + assert_eq!(f.get("日本語"), Some(&3)); + } + + #[test] + fn scan_emits_run_at_eol() { + // Run that runs to end-of-string (no trailing non-kanji) must still + // be flushed. 
+ let mut buf = String::new(); + let mut f = HashMap::new(); + scan_kanji_runs("文末は日本語", &mut buf, &mut f); + assert_eq!(f.get("文末"), Some(&1)); + assert_eq!(f.get("日本語"), Some(&1)); + } + + #[test] + fn is_kanji_classifies_correctly() { + assert!(is_kanji('日')); + assert!(is_kanji('語')); + assert!(is_kanji('々')); + assert!(!is_kanji('あ')); // hiragana + assert!(!is_kanji('ア')); // katakana + assert!(!is_kanji('A')); // ascii + assert!(!is_kanji('1')); // digit + } +} + +#[cfg(test)] +mod prose_tests { + use super::*; + use std::collections::HashMap; + + fn scan_one(s: &str) -> HashMap { + let mut buf = String::new(); + let mut f = HashMap::new(); + let mut depth = 0; + let mut in_ref = false; + scan_prose_kanji_runs(s, &mut buf, &mut f, &mut depth, &mut in_ref); + assert_eq!(depth, 0); + assert!(!in_ref); + f + } + + #[test] + fn template_block_is_skipped() { + // 通常文章 (4 kanji), then a template block, then 続きの文章 — the + // hiragana き / の inside the tail break the run, so only "文章" + // survives from the tail. The template's "乗車人員" must NOT count. + let f = scan_one("通常文章{{infobox|乗車人員=12345}}続きの文章"); + assert_eq!(f.get("通常文章"), Some(&1)); + assert_eq!(f.get("文章"), Some(&1)); + assert!(!f.contains_key("乗車人員")); + } + + #[test] + fn nested_template_closes_correctly() { + let f = scan_one("外側{{a|{{b|内側}}|x}}終端文章"); + assert_eq!(f.get("外側"), Some(&1)); + assert_eq!(f.get("終端文章"), Some(&1)); + // Inside nested template — must not be counted. + assert!(!f.contains_key("内側")); + } + + #[test] + fn ref_block_is_skipped() { + // 本文章 (3-kanji) + ref block + 続き文章. After ref skip, き breaks + // the tail run, so only "文章" survives from the trailer. 
+ let f = scan_one("本文章引用元の出典続き文章"); + assert_eq!(f.get("本文章"), Some(&1)); + assert_eq!(f.get("文章"), Some(&1)); + assert!(!f.contains_key("出典")); + assert!(!f.contains_key("引用元")); + } + + #[test] + fn self_closing_ref_is_handled() { + let f = scan_one("先頭文章終端文章"); + assert_eq!(f.get("先頭文章"), Some(&1)); + assert_eq!(f.get("終端文章"), Some(&1)); + } + + #[test] + fn template_state_persists_across_slices() { + let mut buf = String::new(); + let mut f = HashMap::new(); + let mut depth = 0; + let mut in_ref = false; + scan_prose_kanji_runs( + "普通文{{tmpl|内容", + &mut buf, + &mut f, + &mut depth, + &mut in_ref, + ); + assert_eq!(depth, 1); + scan_prose_kanji_runs( + "続き|更に}}終了文章", + &mut buf, + &mut f, + &mut depth, + &mut in_ref, + ); + assert_eq!(depth, 0); + assert_eq!(f.get("普通文"), Some(&1)); + assert_eq!(f.get("終了文章"), Some(&1)); + assert!(!f.contains_key("内容")); + assert!(!f.contains_key("更に")); + } +} diff --git a/engine/crates/lex-cli/src/commands/candidates_ops.rs b/engine/crates/lex-cli/src/commands/candidates_ops.rs index ecb3130..b6910ef 100644 --- a/engine/crates/lex-cli/src/commands/candidates_ops.rs +++ b/engine/crates/lex-cli/src/commands/candidates_ops.rs @@ -2,11 +2,13 @@ use std::collections::HashSet; use std::fs; +use std::io::{BufWriter, Write}; use std::path::{Path, PathBuf}; use lex_core::dict::{Dictionary, TrieDictionary}; use crate::candidates::sudachi; +use crate::candidates::wikipedia; use crate::candidates::{classify_pos_string, write_candidates, Bucket, Candidate, CandidateError}; /// Mine extras candidates from SudachiDict. @@ -87,6 +89,87 @@ pub fn mine( Ok(()) } +/// Mine extras candidates from a Wikipedia XML dump. +/// +/// Surface-first pipeline (see `candidates::wikipedia`): +/// 1. Stream the dump, count maximal kanji runs by frequency. +/// 2. Diff against the merged build dict's surface set. +/// 3. Surfaces NOT in the build dict, with `freq >= min_freq`, are written +/// to `wikipedia.tsv` sorted by frequency descending. 
+///
+/// Reading-assignment is intentionally skipped here. The user reviews top-N
+/// surfaces and assigns readings by hand (or via a separate tool) before
+/// promoting to `extras/<name>.tsv`. This mirrors the existing
+/// `mine`-then-promote-by-hand workflow.
+pub fn corpus(
+    dump_path: &Path,
+    build_dict_path: &Path,
+    out_dir: &Path,
+    min_freq: u32,
+) -> Result<(), CandidateError> {
+    eprintln!("Scanning {} ...", dump_path.display());
+    let freqs = wikipedia::extract_kanji_freqs(dump_path)?;
+
+    let dict = TrieDictionary::open(build_dict_path).map_err(|e| {
+        CandidateError::Parse(format!(
+            "open build dict {}: {e}",
+            build_dict_path.display()
+        ))
+    })?;
+
+    // Build the build-dict surface set once. At ~1.2M entries this is ~10MB
+    // of String storage; trivial vs the freq map (~few hundred MB at
+    // full-corpus scale before frequency filtering).
+    let mut covered: HashSet<String> = HashSet::new();
+    for (_reading, entries) in dict.iter() {
+        for e in entries {
+            covered.insert(e.surface);
+        }
+    }
+    eprintln!("Build dict covers {} unique surfaces.", covered.len());
+
+    // Filter + sort: keep only surfaces NOT in build dict, with freq>=min,
+    // sort by freq desc then surface asc for deterministic output.
+    let mut gaps: Vec<(String, u32)> = freqs
+        .into_iter()
+        .filter(|(s, f)| *f >= min_freq && !covered.contains(s))
+        .collect();
+    gaps.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
+
+    fs::create_dir_all(out_dir)?;
+    let path = out_dir.join("wikipedia.tsv");
+    let file = fs::File::create(&path)?;
+    let mut w = BufWriter::new(file);
+    writeln!(w, "# Candidate pool for the curated `extras/` layer.")?;
+    writeln!(w, "# Source: Wikipedia 日本語 dump (CC-BY-SA)")?;
+    writeln!(
+        w,
+        "# Generated by `dictool candidates corpus` — DO NOT edit manually."
+    )?;
+    writeln!(
+        w,
+        "# Surfaces NOT in the build dict, freq >= {min_freq}, sorted desc."
+ )?; + writeln!( + w, + "# Reading is NOT assigned — pick top-N by hand and look up readings" + )?; + writeln!(w, "# before promoting to extras/.tsv. Gitignored.")?; + writeln!(w, "#")?; + writeln!(w, "# format: surface\\tfreq")?; + for (s, f) in &gaps { + writeln!(w, "{s}\t{f}")?; + } + w.flush()?; + eprintln!( + "Wrote {} gap surfaces (freq >= {}) to {}", + gaps.len(), + min_freq, + path.display() + ); + Ok(()) +} + /// Default cache dir for the working SudachiDict download. Sits under /// `engine/data/` like the other dict artifacts, but with a leading dot so /// it sorts away from the production caches (`mozc-raw/`, `extras-raw/`) From 0a6d902cb304fcef3727f1409bc593740aa52956 Mon Sep 17 00:00:00 2001 From: "SAKAI, Kazuaki" Date: Thu, 14 May 2026 19:02:04 +0900 Subject: [PATCH 2/9] =?UTF-8?q?fix(candidates):=20PR244=20Copilot=20R1=20?= =?UTF-8?q?=E2=80=94=203=20findings=20(2=20IMP,=201=20MINOR)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. (IMP) `` (common in Wikipedia citation sections). Since `` closes with `` rather than ``, `in_ref` got trapped true and silently dropped the rest of the page (and subsequent pages until the next page-boundary reset). Added a strict tag-name boundary check (``/`/`/EOL). 2. (IMP) Block-skip state (`tmpl_depth`, `in_ref`, `buf`) wasn't reset at `` boundaries. Real dumps sometimes contain unbalanced `{{...` / `` line. 3. (MINOR) CLI `default_value_t = 3` duplicated the `wikipedia::DEFAULT_MIN_FREQ` constant. Reference the constant directly so they can't drift. Empirical impact on jawiki-articles1.bz2: 304K → 334K gap surfaces (+30K previously lost to the `` trap). Tests: - `references_tag_does_not_trap_in_ref` covers both self-closing `` and `...` forms. - `page_boundary_resets_block_state` drives `extract_kanji_freqs` over a synthetic 2-page dump where page 1 has an unclosed template and verifies page 2 is still fully scanned. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- engine/crates/lex-cli/src/bin/dictool.rs | 6 +- .../lex-cli/src/candidates/wikipedia.rs | 76 ++++++++++++++++++- 2 files changed, 79 insertions(+), 3 deletions(-) diff --git a/engine/crates/lex-cli/src/bin/dictool.rs b/engine/crates/lex-cli/src/bin/dictool.rs index 991a56a..e41c5ce 100644 --- a/engine/crates/lex-cli/src/bin/dictool.rs +++ b/engine/crates/lex-cli/src/bin/dictool.rs @@ -2,6 +2,7 @@ use std::path::Path; use clap::{Parser, Subcommand}; +use lex_cli::candidates::wikipedia; use lex_cli::commands::{candidates_ops, config_ops, convert_ops, dict_ops, user_dict_ops}; /// Parse a `SOURCE:DIR` pair for `--extra-source`. @@ -270,8 +271,9 @@ enum CandidatesAction { /// Output dir. Default: engine/data/extras-candidates #[arg(long)] out_dir: Option, - /// Drop surfaces with frequency below this (default: 3). - #[arg(long, default_value_t = 3)] + /// Drop surfaces with frequency below this. Default tracks + /// `candidates::wikipedia::DEFAULT_MIN_FREQ` so this stays in sync. + #[arg(long, default_value_t = wikipedia::DEFAULT_MIN_FREQ)] min_freq: u32, }, } diff --git a/engine/crates/lex-cli/src/candidates/wikipedia.rs b/engine/crates/lex-cli/src/candidates/wikipedia.rs index 729e065..dc45d44 100644 --- a/engine/crates/lex-cli/src/candidates/wikipedia.rs +++ b/engine/crates/lex-cli/src/candidates/wikipedia.rs @@ -75,8 +75,15 @@ pub fn extract_kanji_freqs(dump_path: &Path) -> Result, Can // Reset at each boundary so a non-article page doesn't // poison the next page when no explicit is provided. + // Also reset markup-skip state: a page with unbalanced `{{...` or + // `") { current_page_is_article = true; + tmpl_depth = 0; + in_ref = false; + buf.clear(); } // Parse NUM. Filter to ns=0 (main article namespace). 
// Skips Wikipedia: / User: / File: / Template: / Category: pages @@ -203,7 +210,7 @@ fn scan_prose_kanji_runs( } continue; } - if !in_block && b == b'<' && s[i..].starts_with("` is one shot; full `...` // is multi-token. Cheaply check the next `>`. if i > prose_start { @@ -244,6 +251,23 @@ fn scan_prose_kanji_runs( } } +/// Distinguish `` / `` / `` from `` and +/// `` etc. `` closes with `` (not +/// ``), so naive `starts_with(" bool { + if !slice.starts_with("') | Some(b'/') => true, + // EOF after " true, + _ => false, + } +} + /// Scan one slice for maximal kanji runs and bump frequencies. /// /// `buf` is reused across calls so we don't reallocate per run. @@ -446,4 +470,54 @@ mod prose_tests { assert!(!f.contains_key("内容")); assert!(!f.contains_key("更に")); } + + #[test] + fn references_tag_does_not_trap_in_ref() { + // `` (and `` / ``) + // closes with ``, NOT ``. Naive `末尾文章"); + assert_eq!(f.get("先頭文章"), Some(&1)); + assert_eq!(f.get("末尾文章"), Some(&1)); + + let f2 = scan_one("先頭文章引用集末尾文章"); + assert_eq!(f2.get("先頭文章"), Some(&1)); + assert_eq!(f2.get("末尾文章"), Some(&1)); + // The body content here happens to look like prose + // since we didn't treat it as a block — that's fine; we trade + // theoretical "block body" purity for not losing the rest of the + // page when never arrives. + } + + #[test] + fn page_boundary_resets_block_state() { + // A page with unbalanced `{{...` (no closing `}}`) leaves + // tmpl_depth > 0. The driver loop resets state at + // boundaries — verify that the SECOND page is fully scanned. + let dump = "\n0\n第一段{{壊れ|未閉\n\n\ + \n0\n第二段文章\n"; + let freqs = extract_kanji_freqs_from_str(dump).unwrap(); + // 第一段 must be present (scanned before the open `{{`). + assert_eq!(freqs.get("第一段"), Some(&1)); + // 第二段文章 must be present — would be missing if state leaked. + assert_eq!(freqs.get("第二段文章"), Some(&1)); + // The unclosed-template body must NOT leak through. 
+ assert!(!freqs.contains_key("未閉")); + } + + /// Test helper: drive `extract_kanji_freqs` with an in-memory dump. + fn extract_kanji_freqs_from_str(s: &str) -> Result, CandidateError> { + let tmp = std::env::temp_dir().join(format!( + "lexime_test_dump_{}.xml", + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_nanos() + )); + std::fs::write(&tmp, s)?; + let r = extract_kanji_freqs(&tmp); + let _ = std::fs::remove_file(&tmp); + r + } } From daca2847852c4503266bc9180ad15d867466198d Mon Sep 17 00:00:00 2001 From: "SAKAI, Kazuaki" Date: Thu, 14 May 2026 19:05:39 +0900 Subject: [PATCH 3/9] fix(audit): exempt bzip2 / bzip2-sys (PR244 audit CI fix) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `cargo vet check` was failing on the audit job because the new `bzip2` / `bzip2-sys` deps (added in c9684a0 for `dictool candidates corpus`) were unvetted. Add same-pattern exemptions matching the existing `zip` entry — both are pulled in only by the dev/build CLI, not the IME runtime. Co-Authored-By: Claude Opus 4.7 (1M context) --- engine/supply-chain/config.toml | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/engine/supply-chain/config.toml b/engine/supply-chain/config.toml index 70b0611..c1a01ff 100644 --- a/engine/supply-chain/config.toml +++ b/engine/supply-chain/config.toml @@ -64,16 +64,6 @@ criteria = "safe-to-deploy" version = "2.11.0" criteria = "safe-to-deploy" -# Raised from `safe-to-run` to `safe-to-deploy` in PR #242. Two paths now -# pull bumpalo as a transitive dependency: -# - `zip 7` → `zopfli` → `bumpalo` (used by `dictool candidates mine`'s -# ZIP-extraction code; reachable from a CLI tool we may distribute). -# - `candle-core` → wasm-bindgen-macro-support → `bumpalo` (proc-macro -# side, only with the neural feature). -# Either path alone would be enough for cargo-vet to demand `safe-to-deploy` -# evaluation. 
The IME runtime itself doesn't directly use bumpalo at -# runtime (the dictool CLI is a separate binary), but the elevated trust -# level is required for `cargo vet check` to pass on the workspace graph. [[exemptions.bumpalo]] version = "3.19.1" criteria = "safe-to-deploy" @@ -90,6 +80,14 @@ criteria = "safe-to-deploy" version = "1.11.1" criteria = "safe-to-deploy" +[[exemptions.bzip2]] +version = "0.4.4" +criteria = "safe-to-deploy" + +[[exemptions.bzip2-sys]] +version = "0.1.13+1.0.8" +criteria = "safe-to-deploy" + [[exemptions.camino]] version = "1.2.2" criteria = "safe-to-deploy" From 284b25eb3635c7cc4f4a18aa7c93cf697b01268b Mon Sep 17 00:00:00 2001 From: "SAKAI, Kazuaki" Date: Thu, 14 May 2026 19:06:21 +0900 Subject: [PATCH 4/9] chore(audit): restore bumpalo exemption comment (lost in daca284) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `cargo vet check` ran during the audit-fix work stripped the inline comment above `[[exemptions.bumpalo]]` as part of its config-file normalization. Put it back — the explanation of why bumpalo is elevated to `safe-to-deploy` is load-bearing context for future audits. --- engine/supply-chain/config.toml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/engine/supply-chain/config.toml b/engine/supply-chain/config.toml index c1a01ff..f0595f2 100644 --- a/engine/supply-chain/config.toml +++ b/engine/supply-chain/config.toml @@ -64,6 +64,16 @@ criteria = "safe-to-deploy" version = "2.11.0" criteria = "safe-to-deploy" +# Raised from `safe-to-run` to `safe-to-deploy` in PR #242. Two paths now +# pull bumpalo as a transitive dependency: +# - `zip 7` → `zopfli` → `bumpalo` (used by `dictool candidates mine`'s +# ZIP-extraction code; reachable from a CLI tool we may distribute). +# - `candle-core` → wasm-bindgen-macro-support → `bumpalo` (proc-macro +# side, only with the neural feature). +# Either path alone would be enough for cargo-vet to demand `safe-to-deploy` +# evaluation. 
The IME runtime itself doesn't directly use bumpalo at +# runtime (the dictool CLI is a separate binary), but the elevated trust +# level is required for `cargo vet check` to pass on the workspace graph. [[exemptions.bumpalo]] version = "3.19.1" criteria = "safe-to-deploy" From 1507c4e47599b0d66ce094db3d2ceb3d8d636b22 Mon Sep 17 00:00:00 2001 From: "SAKAI, Kazuaki" Date: Thu, 14 May 2026 19:10:15 +0900 Subject: [PATCH 5/9] fix(audit): accept bzip2-sys in build-script baseline `scripts/check-build-scripts.sh` flagged `bzip2-sys` as a new crate with `build.rs` after PR #244 added the bzip2 dep for the Wikipedia corpus miner. The build.rs is upstream-standard (compiles vendored libbz2 C source via `cc`), same supply-chain posture as the existing audited C/build-script crates in the baseline (libc, ring, rustls, ring-bindgen, etc.). Accept by updating the baseline. --- engine/build-script-baseline.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/engine/build-script-baseline.txt b/engine/build-script-baseline.txt index 78c0527..c2b0e34 100644 --- a/engine/build-script-baseline.txt +++ b/engine/build-script-baseline.txt @@ -1,4 +1,5 @@ anyhow +bzip2-sys camino crc32fast crossbeam-utils From 5eeb3f80feff51cfac6fe2a0ebf54224fd4264cc Mon Sep 17 00:00:00 2001 From: "SAKAI, Kazuaki" Date: Thu, 14 May 2026 19:14:04 +0900 Subject: [PATCH 6/9] =?UTF-8?q?fix(candidates):=20PR244=20Copilot=20R2=20?= =?UTF-8?q?=E2=80=94=202=20findings=20(1=20IMP,=201=20MINOR)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. (IMP) `` self-closing form wasn't handled. The pattern match treated it like a normal opening (`` with no `` on the same line), so `in_text` stuck true for that page. If the next line was XML metadata (e.g. `` of the following page when self-closing immediately precedes `</page>`), it would have been scanned as prose and polluted frequency counts. Detect `/>` before the first `>` of the opening tag and short-circuit. 
Also reset `in_text` at `<page>` boundaries alongside the other state resets (defence-in-depth). 2. (MINOR) Test helper wrote to a timestamp-based path under `std::env::temp_dir()`, which could collide on parallel test runs and leave files behind on panic. Refactored `extract_kanji_freqs` to split out a `extract_kanji_freqs_from_reader` private API that takes any `impl BufRead`; tests now run against `Cursor<&[u8]>` directly, no filesystem involvement. The earlier R2 finding about supply-chain updates (Cargo.toml +35) is already addressed in daca284 + 1507c4e — resolving as stale. Tests: - New `self_closing_text_tag_is_handled` constructs a 2-page dump where page 1 is `<text bytes="0" />` (self-closing) and verifies page 2's body is still scanned AND page 2's `<title>` metadata is NOT counted (would be if in_text leaked). - All existing tests migrated to the reader-based helper. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> --- .../lex-cli/src/candidates/wikipedia.rs | 103 ++++++++++++------ 1 file changed, 70 insertions(+), 33 deletions(-) diff --git a/engine/crates/lex-cli/src/candidates/wikipedia.rs b/engine/crates/lex-cli/src/candidates/wikipedia.rs index dc45d44..c937e2d 100644 --- a/engine/crates/lex-cli/src/candidates/wikipedia.rs +++ b/engine/crates/lex-cli/src/candidates/wikipedia.rs @@ -48,8 +48,16 @@ pub fn extract_kanji_freqs(dump_path: &Path) -> Result<HashMap<String, u32>, Can } else { Box::new(file) }; - let buffered = BufReader::with_capacity(1 << 20, reader); + extract_kanji_freqs_from_reader(BufReader::with_capacity(1 << 20, reader), true) +} +/// Pure-stream variant of `extract_kanji_freqs`. Public-in-crate so tests +/// can exercise the parse loop without touching the filesystem (avoids +/// flaky tempfile races in parallel runs). 
+pub(crate) fn extract_kanji_freqs_from_reader<R: BufRead>( + reader: R, + progress: bool, +) -> Result<HashMap<String, u32>, CandidateError> { let mut freqs: HashMap<String, u32> = HashMap::new(); let mut in_text = false; let mut buf = String::new(); @@ -69,20 +77,23 @@ pub fn extract_kanji_freqs(dump_path: &Path) -> Result<HashMap<String, u32>, Can // still get scanned. let mut current_page_is_article = true; - for line_res in buffered.lines() { + for line_res in reader.lines() { let line = line_res?; bytes_seen += line.len() as u64 + 1; // Reset at each <page> boundary so a non-article page doesn't // poison the next page when no explicit <ns> is provided. - // Also reset markup-skip state: a page with unbalanced `{{...` or - // `<ref ...` (real dumps contain these) would otherwise drag its - // open-block state into the next page and silently skip everything - // that follows. + // Also reset markup-skip state: a page with unbalanced `{{...`, + // `<ref ...`, or `<text` (without matching close) would otherwise + // drag its open-block state into the next page and silently skip + // everything that follows. `in_text` is reset here too — a stray + // self-closing/unclosed `<text` on a non-article page must not + // make us treat subsequent XML metadata of the NEXT page as prose. if line.contains("<page>") { current_page_is_article = true; tmpl_depth = 0; in_ref = false; + in_text = false; buf.clear(); } // Parse <ns>NUM</ns>. Filter to ns=0 (main article namespace). @@ -97,12 +108,30 @@ pub fn extract_kanji_freqs(dump_path: &Path) -> Result<HashMap<String, u32>, Can } } - // Flip in_text on `<text` (any attrs OK), reset on `</text>`. We - // tolerate <text> opening / closing on the same line. + // Detect the `<text` opening tag, with three flavours to handle: + // `<text>body</text>` single line (close after open) + // `<text>body` open continues to next line + // `<text ... 
/>` self-closing, empty body (rare but + // present in real dumps for redirect / + // stub pages) let text_open = line.find("<text"); let text_close = line.find("</text>"); - let scan_slice: &str = match (in_text, text_open, text_close) { - (false, Some(o), Some(c)) if c > o => { + let text_self_closing = text_open.is_some_and(|o| { + // Self-closing iff the first `>` after `<text` is preceded by `/`. + line[o..] + .find('>') + .is_some_and(|rel| rel > 0 && line.as_bytes()[o + rel - 1] == b'/') + }); + let scan_slice: &str = match (in_text, text_open, text_close, text_self_closing) { + (false, Some(_), _, true) => { + // Self-closing `<text ... />` — page seen, nothing to scan. + pages_seen += 1; + if current_page_is_article { + pages_scanned += 1; + } + "" + } + (false, Some(o), Some(c), false) if c > o => { // Whole text on one line. pages_seen += 1; if current_page_is_article { @@ -112,7 +141,7 @@ pub fn extract_kanji_freqs(dump_path: &Path) -> Result<HashMap<String, u32>, Can let body_start = after_open.find('>').map(|p| o + p + 1).unwrap_or(o); &line[body_start..c] } - (false, Some(o), None) => { + (false, Some(o), None, false) => { in_text = true; pages_seen += 1; if current_page_is_article { @@ -122,11 +151,11 @@ pub fn extract_kanji_freqs(dump_path: &Path) -> Result<HashMap<String, u32>, Can let body_start = after_open.find('>').map(|p| o + p + 1).unwrap_or(o); &line[body_start..] } - (true, _, Some(c)) => { + (true, _, Some(c), _) => { in_text = false; &line[..c] } - (true, _, None) => &line[..], + (true, _, None, _) => &line[..], _ => "", }; @@ -140,7 +169,7 @@ pub fn extract_kanji_freqs(dump_path: &Path) -> Result<HashMap<String, u32>, Can ); } - if last_progress.elapsed().as_secs() >= 10 { + if progress && last_progress.elapsed().as_secs() >= 10 { eprintln!( " ... 
{} pages ({} articles scanned), ~{} MB, {} surfaces", pages_seen, @@ -152,13 +181,15 @@ pub fn extract_kanji_freqs(dump_path: &Path) -> Result<HashMap<String, u32>, Can } } - eprintln!( - "Done. {} pages, {} articles scanned, ~{} MB, {} unique surfaces", - pages_seen, - pages_scanned, - bytes_seen >> 20, - freqs.len() - ); + if progress { + eprintln!( + "Done. {} pages, {} articles scanned, ~{} MB, {} unique surfaces", + pages_seen, + pages_scanned, + bytes_seen >> 20, + freqs.len() + ); + } Ok(freqs) } @@ -506,18 +537,24 @@ mod prose_tests { assert!(!freqs.contains_key("未閉")); } - /// Test helper: drive `extract_kanji_freqs` with an in-memory dump. + #[test] + fn self_closing_text_tag_is_handled() { + // `<text ... />` (empty content, e.g. for redirect / stub pages) + // must NOT flip in_text to true — otherwise subsequent XML metadata + // lines of the next page would be scanned as prose. + let dump = "<page>\n<ns>0</ns>\n<text bytes=\"0\" />\n</page>\n\ + <page>\n<ns>0</ns>\n<title>普通記事</title>\n<text>本文文章</text>\n</page>\n"; + let freqs = extract_kanji_freqs_from_str(dump).unwrap(); + // 本文文章 from the second page must be counted. + assert_eq!(freqs.get("本文文章"), Some(&1)); + // The XML metadata of page 2 (`普通記事`) must NOT + // be counted as prose — would leak if in_text stuck true. + assert!(!freqs.contains_key("普通記事")); + } + + /// Test helper: drive the stream parser with an in-memory dump. + /// Avoids tempfile flakiness in parallel test runs. 
fn extract_kanji_freqs_from_str(s: &str) -> Result<HashMap<String, u32>, CandidateError> { - let tmp = std::env::temp_dir().join(format!( - "lexime_test_dump_{}.xml", - std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap() - .as_nanos() - )); - std::fs::write(&tmp, s)?; - let r = extract_kanji_freqs(&tmp); - let _ = std::fs::remove_file(&tmp); - r + extract_kanji_freqs_from_reader(std::io::Cursor::new(s.as_bytes()), false) } } From 1e943efb61f821186478195e7f2c8e6d549301ac Mon Sep 17 00:00:00 2001 From: "SAKAI, Kazuaki" Date: Thu, 14 May 2026 19:19:20 +0900 Subject: [PATCH 7/9] =?UTF-8?q?fix(candidates):=20PR244=20Copilot=20R3=20?= =?UTF-8?q?=E2=80=94=201=20MINOR=20(doc),=201=20MINOR=20resolved=20as=20WO?= =?UTF-8?q?NTFIX?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit doc: `scan_prose_kanji_runs`'s doc comment said outside-block chars were "appended to a small local buffer". The implementation actually slices the input string directly (`&s[prose_start..i]`) and only the inner `scan_kanji_runs` reuses `buf` for the per-run kanji accumulator. Rewrote to match. The other R3 finding (perf: `dict.iter()` in `candidates_ops::corpus` materializes all readings/surfaces) is a dev-tool runtime-profile concern — same posture as the `dictool candidates mine` perf MINORs covered in feedback memory. The pipeline still completes in ~32s for jawiki-articles1.bz2 and the build dict surface set is ~10MB; not worth a lex-core API addition this PR. Resolved as WONTFIX. 
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> --- engine/crates/lex-cli/src/candidates/wikipedia.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/engine/crates/lex-cli/src/candidates/wikipedia.rs b/engine/crates/lex-cli/src/candidates/wikipedia.rs index c937e2d..616df83 100644 --- a/engine/crates/lex-cli/src/candidates/wikipedia.rs +++ b/engine/crates/lex-cli/src/candidates/wikipedia.rs @@ -198,9 +198,10 @@ pub(crate) fn extract_kanji_freqs_from_reader( /// stay closed. /// /// `tmpl_depth` increases on `{{`, decreases on `}}`. `in_ref` toggles on -/// `<ref ...>` / `</ref>`. Outside-block characters are appended to a small local -/// buffer that's flushed to `scan_kanji_runs` when a block opens/closes or at -/// the end of the slice. +/// `<ref ...>` / `</ref>`. Outside-block byte ranges are passed by reference +/// (`&s[prose_start..i]`) directly to `scan_kanji_runs` whenever a block +/// opens, closes, or the slice ends — no intermediate copy of the prose +/// itself; only the per-run `buf` inside `scan_kanji_runs` is reused. fn scan_prose_kanji_runs( s: &str, buf: &mut String, From 53bc79b15ed8a1780b7f6af18cdee46bd8b0b934 Mon Sep 17 00:00:00 2001 From: "SAKAI, Kazuaki" Date: Thu, 14 May 2026 19:24:13 +0900 Subject: [PATCH 8/9] docs(candidates): note UTF-8 safety invariant in scan_prose_kanji_runs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR244 Copilot R4 flagged the byte-indexed loop as potentially panicking on `&s[prose_start..i]` if a UTF-8 continuation byte matched `{` / `}` / `<`. That's not possible per the UTF-8 spec: continuation bytes are 0x80-0xBF, and our ASCII delimiters are 0x00-0x7F. Add an inline note so the invariant is visible in the source — preempts future re-raises without changing behavior. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- engine/crates/lex-cli/src/candidates/wikipedia.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/engine/crates/lex-cli/src/candidates/wikipedia.rs b/engine/crates/lex-cli/src/candidates/wikipedia.rs index 616df83..5e90221 100644 --- a/engine/crates/lex-cli/src/candidates/wikipedia.rs +++ b/engine/crates/lex-cli/src/candidates/wikipedia.rs @@ -212,6 +212,12 @@ fn scan_prose_kanji_runs( let bytes = s.as_bytes(); let mut i = 0; let mut prose_start = 0; // start of the current prose run (when not inside a block) + // UTF-8 safety: this loop is byte-indexed, but `&s[prose_start..i]` + // slicing is always at a char boundary because every advance of + // either index happens just past an ASCII delimiter byte (`{`, `}`, + // `<`, `>`, `/`, or whitespace — all 0x00-0x7F). UTF-8 continuation + // bytes are strictly 0x80-0xBF, so multi-byte chars (kanji etc.) + // cannot contribute a byte that matches any of our ASCII branches. while i < bytes.len() { // Inline match on 2-byte ASCII pairs and `` headers. // Using as_bytes lets us peek without UTF-8 decoding overhead; From e4b6fa693caa5eb541da932f2c9b746f12e89c06 Mon Sep 17 00:00:00 2001 From: "SAKAI, Kazuaki" Date: Thu, 14 May 2026 19:27:01 +0900 Subject: [PATCH 9/9] fix(candidates): satisfy rustfmt on UTF-8-safety doc comment The doc block landed as a trailing comment on `let mut prose_start = 0;` which rustfmt then re-indents to a confusing column. Move the comment to its own block above the binding so it formats cleanly. 
--- engine/crates/lex-cli/src/candidates/wikipedia.rs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/engine/crates/lex-cli/src/candidates/wikipedia.rs b/engine/crates/lex-cli/src/candidates/wikipedia.rs index 5e90221..cc8e22a 100644 --- a/engine/crates/lex-cli/src/candidates/wikipedia.rs +++ b/engine/crates/lex-cli/src/candidates/wikipedia.rs @@ -211,13 +211,15 @@ fn scan_prose_kanji_runs( ) { let bytes = s.as_bytes(); let mut i = 0; - let mut prose_start = 0; // start of the current prose run (when not inside a block) + // start of the current prose run (when not inside a block). + // // UTF-8 safety: this loop is byte-indexed, but `&s[prose_start..i]` - // slicing is always at a char boundary because every advance of - // either index happens just past an ASCII delimiter byte (`{`, `}`, - // `<`, `>`, `/`, or whitespace — all 0x00-0x7F). UTF-8 continuation - // bytes are strictly 0x80-0xBF, so multi-byte chars (kanji etc.) - // cannot contribute a byte that matches any of our ASCII branches. + // slicing is always at a char boundary because every advance of either + // index happens just past an ASCII delimiter byte (`{`, `}`, `<`, `>`, + // `/`, or whitespace — all 0x00-0x7F). UTF-8 continuation bytes are + // strictly 0x80-0xBF, so multi-byte chars (kanji etc.) cannot + // contribute a byte that matches any of our ASCII branches. + let mut prose_start = 0; while i < bytes.len() { // Inline match on 2-byte ASCII pairs and `` headers. // Using as_bytes lets us peek without UTF-8 decoding overhead;