From c9684a0ec2845f75bb59364bddecf89611ebdb0f Mon Sep 17 00:00:00 2001
From: "SAKAI, Kazuaki"
Date: Sun, 10 May 2026 03:28:11 +0900
Subject: [PATCH 1/9] feat(candidates): add `dictool candidates corpus` for
 Wikipedia mining
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Surface-first vocabulary mining from a Wikipedia jawiki dump. Streams
the bz2 directly, skips wikitext templates `{{...}}` and `<ref>` blocks,
filters to article namespace, extracts maximal kanji runs, and diffs
against the build dict's surface set. Outputs `wikipedia.tsv` with
`surface\tfreq` rows.

Reading-assignment is intentionally deferred — the user picks top-N gap
surfaces and looks up readings before promoting to `extras/<name>.tsv`,
mirroring the existing `mine`-then-promote-by-hand workflow.

Pilot run on jawiki-articles1 (80K articles, ~1.5GB raw text) finishes
in ~32s and yields 304K freq>=5 gap surfaces. Most are lattice-
composable (徳川家康, 室町時代, 令和元年 — Mozc handles via segment
composition) but real misses surface in the mix (e.g. 宇宙戦艦 → Mozc
top-1 returns 宇宙船感). Per-candidate verification via `lextool
explain` is still required before promotion.

deps: bzip2 0.4 (lex-cli only — same dev-tool scope as the existing zip
dep used by `candidates mine`).
Co-Authored-By: Claude Opus 4.7 (1M context) --- engine/Cargo.lock | 21 + engine/crates/lex-cli/Cargo.toml | 3 + engine/crates/lex-cli/src/bin/dictool.rs | 37 ++ engine/crates/lex-cli/src/candidates/mod.rs | 1 + .../lex-cli/src/candidates/wikipedia.rs | 449 ++++++++++++++++++ .../lex-cli/src/commands/candidates_ops.rs | 83 ++++ 6 files changed, 594 insertions(+) create mode 100644 engine/crates/lex-cli/src/candidates/wikipedia.rs diff --git a/engine/Cargo.lock b/engine/Cargo.lock index fb180df..65e2116 100644 --- a/engine/Cargo.lock +++ b/engine/Cargo.lock @@ -236,6 +236,26 @@ version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" +[[package]] +name = "bzip2" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8" +dependencies = [ + "bzip2-sys", + "libc", +] + +[[package]] +name = "bzip2-sys" +version = "0.1.13+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" +dependencies = [ + "cc", + "pkg-config", +] + [[package]] name = "camino" version = "1.2.2" @@ -1000,6 +1020,7 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" name = "lex-cli" version = "0.1.0" dependencies = [ + "bzip2", "clap", "lex-core", "serde", diff --git a/engine/crates/lex-cli/Cargo.toml b/engine/crates/lex-cli/Cargo.toml index 7adf3ef..bcec983 100644 --- a/engine/crates/lex-cli/Cargo.toml +++ b/engine/crates/lex-cli/Cargo.toml @@ -30,3 +30,6 @@ toml = { workspace = true } # Kept as a hard dep (not optional) since the CLI is a build/dev tool and # not shipped with the IME itself. zip = { version = "7", default-features = false, features = ["deflate"] } +# Used by `dictool candidates corpus` only — Wikipedia dumps ship as bz2. 
+# Streaming decompress so we never materialize the full ~14GB XML on disk. +bzip2 = "0.4" diff --git a/engine/crates/lex-cli/src/bin/dictool.rs b/engine/crates/lex-cli/src/bin/dictool.rs index a7acaa3..991a56a 100644 --- a/engine/crates/lex-cli/src/bin/dictool.rs +++ b/engine/crates/lex-cli/src/bin/dictool.rs @@ -255,6 +255,25 @@ enum CandidatesAction { #[arg(long)] clean: bool, }, + /// Mine kanji-run candidates from a Wikipedia XML dump (.xml or .xml.bz2). + /// + /// Writes `wikipedia.tsv` with `surface\tfreq` rows for surfaces NOT in + /// the build dict, sorted by frequency descending. Reading-assignment is + /// done by hand on the top-N rows before promoting to `extras/`. + Corpus { + /// Path to the Wikipedia dump (.xml or .xml.bz2). User-supplied — + /// download from https://dumps.wikimedia.org/jawiki/latest/ first. + dump: String, + /// Build dict to diff against. Default: engine/data/lexime.dict + #[arg(long)] + build_dict: Option, + /// Output dir. Default: engine/data/extras-candidates + #[arg(long)] + out_dir: Option, + /// Drop surfaces with frequency below this (default: 3). 
+ #[arg(long, default_value_t = 3)] + min_freq: u32, + }, } fn main() { @@ -342,6 +361,24 @@ fn main() { std::process::exit(1); } } + CandidatesAction::Corpus { + dump, + build_dict, + out_dir, + min_freq, + } => { + let out = out_dir + .map(std::path::PathBuf::from) + .unwrap_or_else(candidates_ops::default_out_dir); + let dict = build_dict + .map(std::path::PathBuf::from) + .unwrap_or_else(candidates_ops::default_build_dict); + let dump_path = std::path::PathBuf::from(dump); + if let Err(e) = candidates_ops::corpus(&dump_path, &dict, &out, min_freq) { + eprintln!("corpus: {e}"); + std::process::exit(1); + } + } }, Command::UserDict { file, action } => { let path_str = file.unwrap_or_else(user_dict_ops::default_user_dict_path); diff --git a/engine/crates/lex-cli/src/candidates/mod.rs b/engine/crates/lex-cli/src/candidates/mod.rs index acd6ef6..ca431f3 100644 --- a/engine/crates/lex-cli/src/candidates/mod.rs +++ b/engine/crates/lex-cli/src/candidates/mod.rs @@ -38,6 +38,7 @@ //! rarely useful for extras but kept for completeness. pub mod sudachi; +pub mod wikipedia; use std::fs; use std::io::{self, BufWriter, Write}; diff --git a/engine/crates/lex-cli/src/candidates/wikipedia.rs b/engine/crates/lex-cli/src/candidates/wikipedia.rs new file mode 100644 index 0000000..729e065 --- /dev/null +++ b/engine/crates/lex-cli/src/candidates/wikipedia.rs @@ -0,0 +1,449 @@ +//! Mine extras candidates from a Wikipedia XML dump. +//! +//! Lazy "surface-first" pipeline: +//! +//! 1. Stream-decompress the dump (`.xml.bz2` or `.xml`) line by line. +//! 2. Inside `...` regions, extract maximal kanji runs. +//! 3. Frequency-count surfaces (HashMap). +//! 4. (Caller) diff against the build dict's surface set; surviving surfaces +//! are real Mozc gaps. +//! +//! No morphological analysis here — that step happens later (only for the +//! diffed gap candidates), since reading assignment is the expensive part. +//! See `feedback_extras_promotion.md` for why this approach was chosen +//! 
over Sudachi/Wikidata seed sources.
+use std::collections::HashMap;
+use std::fs::File;
+use std::io::{BufRead, BufReader, Read};
+use std::path::Path;
+
+use bzip2::read::MultiBzDecoder;
+
+use super::CandidateError;
+
+/// Minimum kanji-run length to count. Single-char surfaces are dominated by
+/// fragments of compounds (e.g. の境内 → 境 + 内 fragments) and add noise.
+pub const MIN_SURFACE_CHARS: usize = 2;
+
+/// Maximum length to count. Long runs (>20 chars) tend to be wiki-markup
+/// artifacts (concatenated table cells, broken templates).
+pub const MAX_SURFACE_CHARS: usize = 20;
+
+/// Frequency floor when emitting candidates. count<3 is heavy long-tail
+/// noise — single article typos, OCR errors in references, etc.
+pub const DEFAULT_MIN_FREQ: u32 = 3;
+
+/// Stream-extract kanji-run frequencies from a Wikipedia dump.
+///
+/// `dump_path` may be `.xml.bz2` (decompressed on the fly) or already-
+/// decompressed `.xml`. Detection is by extension — explicit, no magic-byte
+/// guessing.
+pub fn extract_kanji_freqs(dump_path: &Path) -> Result<HashMap<String, u32>, CandidateError> {
+    let file = File::open(dump_path)?;
+    let reader: Box<dyn Read> = if dump_path.extension().and_then(|s| s.to_str()) == Some("bz2") {
+        // MultiBzDecoder handles concatenated bz2 streams (Wikipedia dumps
+        // are sometimes split into multiple bz2 blocks).
+        Box::new(MultiBzDecoder::new(file))
+    } else {
+        Box::new(file)
+    };
+    let buffered = BufReader::with_capacity(1 << 20, reader);
+
+    let mut freqs: HashMap<String, u32> = HashMap::new();
+    let mut in_text = false;
+    let mut buf = String::new();
+    let mut pages_seen: u64 = 0;
+    let mut pages_scanned: u64 = 0;
+    let mut bytes_seen: u64 = 0;
+    let mut last_progress = std::time::Instant::now();
+    // Wikitext template depth across line boundaries. Templates `{{...}}`
+    // contain field-name boilerplate (`乗車人員`, `駅構造`, `所属路線`...)
+    // that dominates top-frequency noise. Skip everything inside them.
+    // References `<ref>...</ref>` similarly contain citation strings.
+ let mut tmpl_depth: i32 = 0; + let mut in_ref = false; + // Per-page scratch state. arrives before in the dump format, + // so we know whether to scan this page's text by the time we see it. + // Default to article (true) so older dumps without an explicit tag + // still get scanned. + let mut current_page_is_article = true; + + for line_res in buffered.lines() { + let line = line_res?; + bytes_seen += line.len() as u64 + 1; + + // Reset at each boundary so a non-article page doesn't + // poison the next page when no explicit is provided. + if line.contains("") { + current_page_is_article = true; + } + // Parse NUM. Filter to ns=0 (main article namespace). + // Skips Wikipedia: / User: / File: / Template: / Category: pages + // whose template-arg names and file-upload logs dominate top- + // frequency noise. + if let Some(start) = line.find("") { + if let Some(end) = line[start..].find("") { + let raw = &line[start + 4..start + end]; + let ns: i32 = raw.trim().parse().unwrap_or(-1); + current_page_is_article = ns == 0; + } + } + + // Flip in_text on ``. We + // tolerate opening / closing on the same line. + let text_open = line.find(""); + let scan_slice: &str = match (in_text, text_open, text_close) { + (false, Some(o), Some(c)) if c > o => { + // Whole text on one line. + pages_seen += 1; + if current_page_is_article { + pages_scanned += 1; + } + let after_open = &line[o..]; + let body_start = after_open.find('>').map(|p| o + p + 1).unwrap_or(o); + &line[body_start..c] + } + (false, Some(o), None) => { + in_text = true; + pages_seen += 1; + if current_page_is_article { + pages_scanned += 1; + } + let after_open = &line[o..]; + let body_start = after_open.find('>').map(|p| o + p + 1).unwrap_or(o); + &line[body_start..] 
+ } + (true, _, Some(c)) => { + in_text = false; + &line[..c] + } + (true, _, None) => &line[..], + _ => "", + }; + + if !scan_slice.is_empty() && current_page_is_article { + scan_prose_kanji_runs( + scan_slice, + &mut buf, + &mut freqs, + &mut tmpl_depth, + &mut in_ref, + ); + } + + if last_progress.elapsed().as_secs() >= 10 { + eprintln!( + " ... {} pages ({} articles scanned), ~{} MB, {} surfaces", + pages_seen, + pages_scanned, + bytes_seen >> 20, + freqs.len() + ); + last_progress = std::time::Instant::now(); + } + } + + eprintln!( + "Done. {} pages, {} articles scanned, ~{} MB, {} unique surfaces", + pages_seen, + pages_scanned, + bytes_seen >> 20, + freqs.len() + ); + Ok(freqs) +} + +/// Wrapper that skips wikitext template (`{{...}}`) and `` blocks before +/// counting kanji runs. State is carried across calls so multi-line templates +/// stay closed. +/// +/// `tmpl_depth` increases on `{{`, decreases on `}}`. `in_ref` toggles on +/// ``. Outside-block characters are appended to a small local +/// buffer that's flushed to `scan_kanji_runs` when a block opens/closes or at +/// the end of the slice. +fn scan_prose_kanji_runs( + s: &str, + buf: &mut String, + freqs: &mut HashMap, + tmpl_depth: &mut i32, + in_ref: &mut bool, +) { + let bytes = s.as_bytes(); + let mut i = 0; + let mut prose_start = 0; // start of the current prose run (when not inside a block) + while i < bytes.len() { + // Inline match on 2-byte ASCII pairs and `` headers. + // Using as_bytes lets us peek without UTF-8 decoding overhead; + // kanji are multi-byte but we only branch on ASCII patterns. 
+ let in_block = *tmpl_depth > 0 || *in_ref; + let b = bytes[i]; + + if !in_block && b == b'{' && i + 1 < bytes.len() && bytes[i + 1] == b'{' { + // flush prose + if i > prose_start { + scan_kanji_runs(&s[prose_start..i], buf, freqs); + } + *tmpl_depth += 1; + i += 2; + prose_start = i; + continue; + } + if *tmpl_depth > 0 && b == b'{' && i + 1 < bytes.len() && bytes[i + 1] == b'{' { + *tmpl_depth += 1; + i += 2; + continue; + } + if *tmpl_depth > 0 && b == b'}' && i + 1 < bytes.len() && bytes[i + 1] == b'}' { + *tmpl_depth -= 1; + i += 2; + if *tmpl_depth == 0 { + prose_start = i; + } + continue; + } + if !in_block && b == b'<' && s[i..].starts_with("` is one shot; full `...` + // is multi-token. Cheaply check the next `>`. + if i > prose_start { + scan_kanji_runs(&s[prose_start..i], buf, freqs); + } + // Find end of opening tag. + if let Some(rel) = s[i..].find('>') { + let close = i + rel; + // Self-closing if char before `>` is `/`. + if close > 0 && bytes[close - 1] == b'/' { + i = close + 1; + prose_start = i; + continue; + } + *in_ref = true; + i = close + 1; + prose_start = i; + continue; + } else { + // Tag continues to next line; assume opening + *in_ref = true; + i = bytes.len(); + prose_start = i; + break; + } + } + if *in_ref && b == b'<' && s[i..].starts_with("") { + *in_ref = false; + i += 6; + prose_start = i; + continue; + } + + i += 1; + } + if !*in_ref && *tmpl_depth == 0 && prose_start < bytes.len() { + scan_kanji_runs(&s[prose_start..], buf, freqs); + } +} + +/// Scan one slice for maximal kanji runs and bump frequencies. +/// +/// `buf` is reused across calls so we don't reallocate per run. 
+fn scan_kanji_runs(s: &str, buf: &mut String, freqs: &mut HashMap) { + buf.clear(); + let mut char_count: usize = 0; + for ch in s.chars() { + if is_kanji(ch) { + buf.push(ch); + char_count += 1; + } else if !buf.is_empty() { + if (MIN_SURFACE_CHARS..=MAX_SURFACE_CHARS).contains(&char_count) { + // Avoid cloning the working buffer when the entry is fresh: + // entry().or_insert(buf.clone()) and entry-API patterns both + // require an owned key. Looking up first lets us only clone + // on insert, which is the cold path once vocab saturates. + if let Some(v) = freqs.get_mut(buf.as_str()) { + *v = v.saturating_add(1); + } else { + freqs.insert(buf.clone(), 1); + } + } + buf.clear(); + char_count = 0; + } + } + if !buf.is_empty() && (MIN_SURFACE_CHARS..=MAX_SURFACE_CHARS).contains(&char_count) { + if let Some(v) = freqs.get_mut(buf.as_str()) { + *v = v.saturating_add(1); + } else { + freqs.insert(buf.clone(), 1); + } + } +} + +/// CJK Unified Ideographs (U+4E00–U+9FFF) plus iteration mark 々 (U+3005). +/// Excludes Extension A/B (rare archaic chars dominate noise) and katakana +/// ヶ (typically a counter, not a content char). +fn is_kanji(ch: char) -> bool { + matches!(ch, '\u{4E00}'..='\u{9FFF}' | '\u{3005}') +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn scan_extracts_maximal_kanji_runs() { + let mut buf = String::new(); + let mut f = HashMap::new(); + scan_kanji_runs("これは日本語の文章です", &mut buf, &mut f); + // 日本語 (3 chars) and 文章 (2 chars) qualify. + // 「これは / の / です」are hiragana — skipped. 
+ assert_eq!(f.get("日本語"), Some(&1)); + assert_eq!(f.get("文章"), Some(&1)); + assert_eq!(f.len(), 2); + } + + #[test] + fn scan_drops_single_char_surfaces() { + let mut buf = String::new(); + let mut f = HashMap::new(); + // 「私」と「本」は 1 字 → MIN_SURFACE_CHARS=2 で skip + scan_kanji_runs("私の本", &mut buf, &mut f); + assert!(f.is_empty()); + } + + #[test] + fn scan_drops_oversized_runs() { + let mut buf = String::new(); + let mut f = HashMap::new(); + let huge: String = "亜".repeat(MAX_SURFACE_CHARS + 1); + scan_kanji_runs(&huge, &mut buf, &mut f); + assert!(f.is_empty()); + // Boundary: exactly MAX_SURFACE_CHARS should survive. + let edge: String = "亜".repeat(MAX_SURFACE_CHARS); + let mut f2 = HashMap::new(); + scan_kanji_runs(&edge, &mut buf, &mut f2); + assert_eq!(f2.get(edge.as_str()), Some(&1)); + } + + #[test] + fn scan_treats_iter_mark_as_kanji() { + let mut buf = String::new(); + let mut f = HashMap::new(); + // 「人々」は 々 を含む 2-char surface → keep + scan_kanji_runs("人々が集まる", &mut buf, &mut f); + assert_eq!(f.get("人々"), Some(&1)); + } + + #[test] + fn scan_accumulates_frequency() { + let mut buf = String::new(); + let mut f = HashMap::new(); + scan_kanji_runs("日本語と日本語と日本語", &mut buf, &mut f); + assert_eq!(f.get("日本語"), Some(&3)); + } + + #[test] + fn scan_emits_run_at_eol() { + // Run that runs to end-of-string (no trailing non-kanji) must still + // be flushed. 
+ let mut buf = String::new(); + let mut f = HashMap::new(); + scan_kanji_runs("文末は日本語", &mut buf, &mut f); + assert_eq!(f.get("文末"), Some(&1)); + assert_eq!(f.get("日本語"), Some(&1)); + } + + #[test] + fn is_kanji_classifies_correctly() { + assert!(is_kanji('日')); + assert!(is_kanji('語')); + assert!(is_kanji('々')); + assert!(!is_kanji('あ')); // hiragana + assert!(!is_kanji('ア')); // katakana + assert!(!is_kanji('A')); // ascii + assert!(!is_kanji('1')); // digit + } +} + +#[cfg(test)] +mod prose_tests { + use super::*; + use std::collections::HashMap; + + fn scan_one(s: &str) -> HashMap { + let mut buf = String::new(); + let mut f = HashMap::new(); + let mut depth = 0; + let mut in_ref = false; + scan_prose_kanji_runs(s, &mut buf, &mut f, &mut depth, &mut in_ref); + assert_eq!(depth, 0); + assert!(!in_ref); + f + } + + #[test] + fn template_block_is_skipped() { + // 通常文章 (4 kanji), then a template block, then 続きの文章 — the + // hiragana き / の inside the tail break the run, so only "文章" + // survives from the tail. The template's "乗車人員" must NOT count. + let f = scan_one("通常文章{{infobox|乗車人員=12345}}続きの文章"); + assert_eq!(f.get("通常文章"), Some(&1)); + assert_eq!(f.get("文章"), Some(&1)); + assert!(!f.contains_key("乗車人員")); + } + + #[test] + fn nested_template_closes_correctly() { + let f = scan_one("外側{{a|{{b|内側}}|x}}終端文章"); + assert_eq!(f.get("外側"), Some(&1)); + assert_eq!(f.get("終端文章"), Some(&1)); + // Inside nested template — must not be counted. + assert!(!f.contains_key("内側")); + } + + #[test] + fn ref_block_is_skipped() { + // 本文章 (3-kanji) + ref block + 続き文章. After ref skip, き breaks + // the tail run, so only "文章" survives from the trailer. 
+ let f = scan_one("本文章引用元の出典続き文章"); + assert_eq!(f.get("本文章"), Some(&1)); + assert_eq!(f.get("文章"), Some(&1)); + assert!(!f.contains_key("出典")); + assert!(!f.contains_key("引用元")); + } + + #[test] + fn self_closing_ref_is_handled() { + let f = scan_one("先頭文章終端文章"); + assert_eq!(f.get("先頭文章"), Some(&1)); + assert_eq!(f.get("終端文章"), Some(&1)); + } + + #[test] + fn template_state_persists_across_slices() { + let mut buf = String::new(); + let mut f = HashMap::new(); + let mut depth = 0; + let mut in_ref = false; + scan_prose_kanji_runs( + "普通文{{tmpl|内容", + &mut buf, + &mut f, + &mut depth, + &mut in_ref, + ); + assert_eq!(depth, 1); + scan_prose_kanji_runs( + "続き|更に}}終了文章", + &mut buf, + &mut f, + &mut depth, + &mut in_ref, + ); + assert_eq!(depth, 0); + assert_eq!(f.get("普通文"), Some(&1)); + assert_eq!(f.get("終了文章"), Some(&1)); + assert!(!f.contains_key("内容")); + assert!(!f.contains_key("更に")); + } +} diff --git a/engine/crates/lex-cli/src/commands/candidates_ops.rs b/engine/crates/lex-cli/src/commands/candidates_ops.rs index ecb3130..b6910ef 100644 --- a/engine/crates/lex-cli/src/commands/candidates_ops.rs +++ b/engine/crates/lex-cli/src/commands/candidates_ops.rs @@ -2,11 +2,13 @@ use std::collections::HashSet; use std::fs; +use std::io::{BufWriter, Write}; use std::path::{Path, PathBuf}; use lex_core::dict::{Dictionary, TrieDictionary}; use crate::candidates::sudachi; +use crate::candidates::wikipedia; use crate::candidates::{classify_pos_string, write_candidates, Bucket, Candidate, CandidateError}; /// Mine extras candidates from SudachiDict. @@ -87,6 +89,87 @@ pub fn mine( Ok(()) } +/// Mine extras candidates from a Wikipedia XML dump. +/// +/// Surface-first pipeline (see `candidates::wikipedia`): +/// 1. Stream the dump, count maximal kanji runs by frequency. +/// 2. Diff against the merged build dict's surface set. +/// 3. Surfaces NOT in the build dict, with `freq >= min_freq`, are written +/// to `wikipedia.tsv` sorted by frequency descending. 
+///
+/// Reading-assignment is intentionally skipped here. The user reviews top-N
+/// surfaces and assigns readings by hand (or via a separate tool) before
+/// promoting to `extras/<name>.tsv`. This mirrors the existing
+/// `mine`-then-promote-by-hand workflow.
+pub fn corpus(
+    dump_path: &Path,
+    build_dict_path: &Path,
+    out_dir: &Path,
+    min_freq: u32,
+) -> Result<(), CandidateError> {
+    eprintln!("Scanning {} ...", dump_path.display());
+    let freqs = wikipedia::extract_kanji_freqs(dump_path)?;
+
+    let dict = TrieDictionary::open(build_dict_path).map_err(|e| {
+        CandidateError::Parse(format!(
+            "open build dict {}: {e}",
+            build_dict_path.display()
+        ))
+    })?;
+
+    // Build the build-dict surface set once. At ~1.2M entries this is ~10MB
+    // of String storage; trivial vs the freq map (~few hundred MB at
+    // full-corpus scale before frequency filtering).
+    let mut covered: HashSet<String> = HashSet::new();
+    for (_reading, entries) in dict.iter() {
+        for e in entries {
+            covered.insert(e.surface);
+        }
+    }
+    eprintln!("Build dict covers {} unique surfaces.", covered.len());
+
+    // Filter + sort: keep only surfaces NOT in build dict, with freq>=min,
+    // sort by freq desc then surface asc for deterministic output.
+    let mut gaps: Vec<(String, u32)> = freqs
+        .into_iter()
+        .filter(|(s, f)| *f >= min_freq && !covered.contains(s))
+        .collect();
+    gaps.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
+
+    fs::create_dir_all(out_dir)?;
+    let path = out_dir.join("wikipedia.tsv");
+    let file = fs::File::create(&path)?;
+    let mut w = BufWriter::new(file);
+    writeln!(w, "# Candidate pool for the curated `extras/` layer.")?;
+    writeln!(w, "# Source: Wikipedia 日本語 dump (CC-BY-SA)")?;
+    writeln!(
+        w,
+        "# Generated by `dictool candidates corpus` — DO NOT edit manually."
+    )?;
+    writeln!(
+        w,
+        "# Surfaces NOT in the build dict, freq >= {min_freq}, sorted desc."
+ )?; + writeln!( + w, + "# Reading is NOT assigned — pick top-N by hand and look up readings" + )?; + writeln!(w, "# before promoting to extras/.tsv. Gitignored.")?; + writeln!(w, "#")?; + writeln!(w, "# format: surface\\tfreq")?; + for (s, f) in &gaps { + writeln!(w, "{s}\t{f}")?; + } + w.flush()?; + eprintln!( + "Wrote {} gap surfaces (freq >= {}) to {}", + gaps.len(), + min_freq, + path.display() + ); + Ok(()) +} + /// Default cache dir for the working SudachiDict download. Sits under /// `engine/data/` like the other dict artifacts, but with a leading dot so /// it sorts away from the production caches (`mozc-raw/`, `extras-raw/`) From 0a6d902cb304fcef3727f1409bc593740aa52956 Mon Sep 17 00:00:00 2001 From: "SAKAI, Kazuaki" Date: Thu, 14 May 2026 19:02:04 +0900 Subject: [PATCH 2/9] =?UTF-8?q?fix(candidates):=20PR244=20Copilot=20R1=20?= =?UTF-8?q?=E2=80=94=203=20findings=20(2=20IMP,=201=20MINOR)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. (IMP) `` (common in Wikipedia citation sections). Since `` closes with `` rather than ``, `in_ref` got trapped true and silently dropped the rest of the page (and subsequent pages until the next page-boundary reset). Added a strict tag-name boundary check (``/`/`/EOL). 2. (IMP) Block-skip state (`tmpl_depth`, `in_ref`, `buf`) wasn't reset at `` boundaries. Real dumps sometimes contain unbalanced `{{...` / `` line. 3. (MINOR) CLI `default_value_t = 3` duplicated the `wikipedia::DEFAULT_MIN_FREQ` constant. Reference the constant directly so they can't drift. Empirical impact on jawiki-articles1.bz2: 304K → 334K gap surfaces (+30K previously lost to the `` trap). Tests: - `references_tag_does_not_trap_in_ref` covers both self-closing `` and `...` forms. - `page_boundary_resets_block_state` drives `extract_kanji_freqs` over a synthetic 2-page dump where page 1 has an unclosed template and verifies page 2 is still fully scanned. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- engine/crates/lex-cli/src/bin/dictool.rs | 6 +- .../lex-cli/src/candidates/wikipedia.rs | 76 ++++++++++++++++++- 2 files changed, 79 insertions(+), 3 deletions(-) diff --git a/engine/crates/lex-cli/src/bin/dictool.rs b/engine/crates/lex-cli/src/bin/dictool.rs index 991a56a..e41c5ce 100644 --- a/engine/crates/lex-cli/src/bin/dictool.rs +++ b/engine/crates/lex-cli/src/bin/dictool.rs @@ -2,6 +2,7 @@ use std::path::Path; use clap::{Parser, Subcommand}; +use lex_cli::candidates::wikipedia; use lex_cli::commands::{candidates_ops, config_ops, convert_ops, dict_ops, user_dict_ops}; /// Parse a `SOURCE:DIR` pair for `--extra-source`. @@ -270,8 +271,9 @@ enum CandidatesAction { /// Output dir. Default: engine/data/extras-candidates #[arg(long)] out_dir: Option, - /// Drop surfaces with frequency below this (default: 3). - #[arg(long, default_value_t = 3)] + /// Drop surfaces with frequency below this. Default tracks + /// `candidates::wikipedia::DEFAULT_MIN_FREQ` so this stays in sync. + #[arg(long, default_value_t = wikipedia::DEFAULT_MIN_FREQ)] min_freq: u32, }, } diff --git a/engine/crates/lex-cli/src/candidates/wikipedia.rs b/engine/crates/lex-cli/src/candidates/wikipedia.rs index 729e065..dc45d44 100644 --- a/engine/crates/lex-cli/src/candidates/wikipedia.rs +++ b/engine/crates/lex-cli/src/candidates/wikipedia.rs @@ -75,8 +75,15 @@ pub fn extract_kanji_freqs(dump_path: &Path) -> Result, Can // Reset at each boundary so a non-article page doesn't // poison the next page when no explicit is provided. + // Also reset markup-skip state: a page with unbalanced `{{...` or + // `") { current_page_is_article = true; + tmpl_depth = 0; + in_ref = false; + buf.clear(); } // Parse NUM. Filter to ns=0 (main article namespace). 
// Skips Wikipedia: / User: / File: / Template: / Category: pages @@ -203,7 +210,7 @@ fn scan_prose_kanji_runs( } continue; } - if !in_block && b == b'<' && s[i..].starts_with("` is one shot; full `...` // is multi-token. Cheaply check the next `>`. if i > prose_start { @@ -244,6 +251,23 @@ fn scan_prose_kanji_runs( } } +/// Distinguish `` / `` / `` from `` and +/// `` etc. `` closes with `` (not +/// ``), so naive `starts_with(" bool { + if !slice.starts_with("') | Some(b'/') => true, + // EOF after " true, + _ => false, + } +} + /// Scan one slice for maximal kanji runs and bump frequencies. /// /// `buf` is reused across calls so we don't reallocate per run. @@ -446,4 +470,54 @@ mod prose_tests { assert!(!f.contains_key("内容")); assert!(!f.contains_key("更に")); } + + #[test] + fn references_tag_does_not_trap_in_ref() { + // `` (and `` / ``) + // closes with ``, NOT ``. Naive `末尾文章"); + assert_eq!(f.get("先頭文章"), Some(&1)); + assert_eq!(f.get("末尾文章"), Some(&1)); + + let f2 = scan_one("先頭文章引用集末尾文章"); + assert_eq!(f2.get("先頭文章"), Some(&1)); + assert_eq!(f2.get("末尾文章"), Some(&1)); + // The body content here happens to look like prose + // since we didn't treat it as a block — that's fine; we trade + // theoretical "block body" purity for not losing the rest of the + // page when never arrives. + } + + #[test] + fn page_boundary_resets_block_state() { + // A page with unbalanced `{{...` (no closing `}}`) leaves + // tmpl_depth > 0. The driver loop resets state at + // boundaries — verify that the SECOND page is fully scanned. + let dump = "\n0\n第一段{{壊れ|未閉\n\n\ + \n0\n第二段文章\n"; + let freqs = extract_kanji_freqs_from_str(dump).unwrap(); + // 第一段 must be present (scanned before the open `{{`). + assert_eq!(freqs.get("第一段"), Some(&1)); + // 第二段文章 must be present — would be missing if state leaked. + assert_eq!(freqs.get("第二段文章"), Some(&1)); + // The unclosed-template body must NOT leak through. 
+ assert!(!freqs.contains_key("未閉")); + } + + /// Test helper: drive `extract_kanji_freqs` with an in-memory dump. + fn extract_kanji_freqs_from_str(s: &str) -> Result, CandidateError> { + let tmp = std::env::temp_dir().join(format!( + "lexime_test_dump_{}.xml", + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_nanos() + )); + std::fs::write(&tmp, s)?; + let r = extract_kanji_freqs(&tmp); + let _ = std::fs::remove_file(&tmp); + r + } } From daca2847852c4503266bc9180ad15d867466198d Mon Sep 17 00:00:00 2001 From: "SAKAI, Kazuaki" Date: Thu, 14 May 2026 19:05:39 +0900 Subject: [PATCH 3/9] fix(audit): exempt bzip2 / bzip2-sys (PR244 audit CI fix) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `cargo vet check` was failing on the audit job because the new `bzip2` / `bzip2-sys` deps (added in c9684a0 for `dictool candidates corpus`) were unvetted. Add same-pattern exemptions matching the existing `zip` entry — both are pulled in only by the dev/build CLI, not the IME runtime. Co-Authored-By: Claude Opus 4.7 (1M context) --- engine/supply-chain/config.toml | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/engine/supply-chain/config.toml b/engine/supply-chain/config.toml index 70b0611..c1a01ff 100644 --- a/engine/supply-chain/config.toml +++ b/engine/supply-chain/config.toml @@ -64,16 +64,6 @@ criteria = "safe-to-deploy" version = "2.11.0" criteria = "safe-to-deploy" -# Raised from `safe-to-run` to `safe-to-deploy` in PR #242. Two paths now -# pull bumpalo as a transitive dependency: -# - `zip 7` → `zopfli` → `bumpalo` (used by `dictool candidates mine`'s -# ZIP-extraction code; reachable from a CLI tool we may distribute). -# - `candle-core` → wasm-bindgen-macro-support → `bumpalo` (proc-macro -# side, only with the neural feature). -# Either path alone would be enough for cargo-vet to demand `safe-to-deploy` -# evaluation. 
The IME runtime itself doesn't directly use bumpalo at -# runtime (the dictool CLI is a separate binary), but the elevated trust -# level is required for `cargo vet check` to pass on the workspace graph. [[exemptions.bumpalo]] version = "3.19.1" criteria = "safe-to-deploy" @@ -90,6 +80,14 @@ criteria = "safe-to-deploy" version = "1.11.1" criteria = "safe-to-deploy" +[[exemptions.bzip2]] +version = "0.4.4" +criteria = "safe-to-deploy" + +[[exemptions.bzip2-sys]] +version = "0.1.13+1.0.8" +criteria = "safe-to-deploy" + [[exemptions.camino]] version = "1.2.2" criteria = "safe-to-deploy" From 284b25eb3635c7cc4f4a18aa7c93cf697b01268b Mon Sep 17 00:00:00 2001 From: "SAKAI, Kazuaki" Date: Thu, 14 May 2026 19:06:21 +0900 Subject: [PATCH 4/9] chore(audit): restore bumpalo exemption comment (lost in daca284) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `cargo vet check` ran during the audit-fix work stripped the inline comment above `[[exemptions.bumpalo]]` as part of its config-file normalization. Put it back — the explanation of why bumpalo is elevated to `safe-to-deploy` is load-bearing context for future audits. --- engine/supply-chain/config.toml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/engine/supply-chain/config.toml b/engine/supply-chain/config.toml index c1a01ff..f0595f2 100644 --- a/engine/supply-chain/config.toml +++ b/engine/supply-chain/config.toml @@ -64,6 +64,16 @@ criteria = "safe-to-deploy" version = "2.11.0" criteria = "safe-to-deploy" +# Raised from `safe-to-run` to `safe-to-deploy` in PR #242. Two paths now +# pull bumpalo as a transitive dependency: +# - `zip 7` → `zopfli` → `bumpalo` (used by `dictool candidates mine`'s +# ZIP-extraction code; reachable from a CLI tool we may distribute). +# - `candle-core` → wasm-bindgen-macro-support → `bumpalo` (proc-macro +# side, only with the neural feature). +# Either path alone would be enough for cargo-vet to demand `safe-to-deploy` +# evaluation. 
The IME runtime itself doesn't directly use bumpalo at +# runtime (the dictool CLI is a separate binary), but the elevated trust +# level is required for `cargo vet check` to pass on the workspace graph. [[exemptions.bumpalo]] version = "3.19.1" criteria = "safe-to-deploy" From 1507c4e47599b0d66ce094db3d2ceb3d8d636b22 Mon Sep 17 00:00:00 2001 From: "SAKAI, Kazuaki" Date: Thu, 14 May 2026 19:10:15 +0900 Subject: [PATCH 5/9] fix(audit): accept bzip2-sys in build-script baseline `scripts/check-build-scripts.sh` flagged `bzip2-sys` as a new crate with `build.rs` after PR #244 added the bzip2 dep for the Wikipedia corpus miner. The build.rs is upstream-standard (compiles vendored libbz2 C source via `cc`), same supply-chain posture as the existing audited C/build-script crates in the baseline (libc, ring, rustls, ring-bindgen, etc.). Accept by updating the baseline. --- engine/build-script-baseline.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/engine/build-script-baseline.txt b/engine/build-script-baseline.txt index 78c0527..c2b0e34 100644 --- a/engine/build-script-baseline.txt +++ b/engine/build-script-baseline.txt @@ -1,4 +1,5 @@ anyhow +bzip2-sys camino crc32fast crossbeam-utils From 5eeb3f80feff51cfac6fe2a0ebf54224fd4264cc Mon Sep 17 00:00:00 2001 From: "SAKAI, Kazuaki" Date: Thu, 14 May 2026 19:14:04 +0900 Subject: [PATCH 6/9] =?UTF-8?q?fix(candidates):=20PR244=20Copilot=20R2=20?= =?UTF-8?q?=E2=80=94=202=20findings=20(1=20IMP,=201=20MINOR)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. (IMP) `` self-closing form wasn't handled. The pattern match treated it like a normal opening (`` with no `` on the same line), so `in_text` stuck true for that page. If the next line was XML metadata (e.g. `` of the following page when self-closing immediately precedes `</page>`), it would have been scanned as prose and polluted frequency counts. Detect `/>` before the first `>` of the opening tag and short-circuit. 
Also reset `in_text` at `<page>` boundaries alongside the other state resets (defence-in-depth). 2. (MINOR) Test helper wrote to a timestamp-based path under `std::env::temp_dir()`, which could collide on parallel test runs and leave files behind on panic. Refactored `extract_kanji_freqs` to split out a `extract_kanji_freqs_from_reader` private API that takes any `impl BufRead`; tests now run against `Cursor<&[u8]>` directly, no filesystem involvement. The earlier R2 finding about supply-chain updates (Cargo.toml +35) is already addressed in daca284 + 1507c4e — resolving as stale. Tests: - New `self_closing_text_tag_is_handled` constructs a 2-page dump where page 1 is `<text bytes="0" />` (self-closing) and verifies page 2's body is still scanned AND page 2's `<title>` metadata is NOT counted (would be if in_text leaked). - All existing tests migrated to the reader-based helper. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> --- .../lex-cli/src/candidates/wikipedia.rs | 103 ++++++++++++------ 1 file changed, 70 insertions(+), 33 deletions(-) diff --git a/engine/crates/lex-cli/src/candidates/wikipedia.rs b/engine/crates/lex-cli/src/candidates/wikipedia.rs index dc45d44..c937e2d 100644 --- a/engine/crates/lex-cli/src/candidates/wikipedia.rs +++ b/engine/crates/lex-cli/src/candidates/wikipedia.rs @@ -48,8 +48,16 @@ pub fn extract_kanji_freqs(dump_path: &Path) -> Result<HashMap<String, u32>, Can } else { Box::new(file) }; - let buffered = BufReader::with_capacity(1 << 20, reader); + extract_kanji_freqs_from_reader(BufReader::with_capacity(1 << 20, reader), true) +} +/// Pure-stream variant of `extract_kanji_freqs`. Public-in-crate so tests +/// can exercise the parse loop without touching the filesystem (avoids +/// flaky tempfile races in parallel runs). 
+pub(crate) fn extract_kanji_freqs_from_reader<R: BufRead>( + reader: R, + progress: bool, +) -> Result<HashMap<String, u32>, CandidateError> { let mut freqs: HashMap<String, u32> = HashMap::new(); let mut in_text = false; let mut buf = String::new(); @@ -69,20 +77,23 @@ pub fn extract_kanji_freqs(dump_path: &Path) -> Result<HashMap<String, u32>, Can // still get scanned. let mut current_page_is_article = true; - for line_res in buffered.lines() { + for line_res in reader.lines() { let line = line_res?; bytes_seen += line.len() as u64 + 1; // Reset at each <page> boundary so a non-article page doesn't // poison the next page when no explicit <ns> is provided. - // Also reset markup-skip state: a page with unbalanced `{{...` or - // `<ref ...` (real dumps contain these) would otherwise drag its - // open-block state into the next page and silently skip everything - // that follows. + // Also reset markup-skip state: a page with unbalanced `{{...`, + // `<ref ...`, or `<text` (without matching close) would otherwise + // drag its open-block state into the next page and silently skip + // everything that follows. `in_text` is reset here too — a stray + // self-closing/unclosed `<text` on a non-article page must not + // make us treat subsequent XML metadata of the NEXT page as prose. if line.contains("<page>") { current_page_is_article = true; tmpl_depth = 0; in_ref = false; + in_text = false; buf.clear(); } // Parse <ns>NUM</ns>. Filter to ns=0 (main article namespace). @@ -97,12 +108,30 @@ pub fn extract_kanji_freqs(dump_path: &Path) -> Result<HashMap<String, u32>, Can } } - // Flip in_text on `<text` (any attrs OK), reset on `</text>`. We - // tolerate <text> opening / closing on the same line. + // Detect the `<text` opening tag, with three flavours to handle: + // `<text>body</text>` single line (close after open) + // `<text>body` open continues to next line + // `<text ... 
/>` self-closing, empty body (rare but + // present in real dumps for redirect / + // stub pages) let text_open = line.find("<text"); let text_close = line.find("</text>"); - let scan_slice: &str = match (in_text, text_open, text_close) { - (false, Some(o), Some(c)) if c > o => { + let text_self_closing = text_open.is_some_and(|o| { + // Self-closing iff the first `>` after `<text` is preceded by `/`. + line[o..] + .find('>') + .is_some_and(|rel| rel > 0 && line.as_bytes()[o + rel - 1] == b'/') + }); + let scan_slice: &str = match (in_text, text_open, text_close, text_self_closing) { + (false, Some(_), _, true) => { + // Self-closing `<text ... />` — page seen, nothing to scan. + pages_seen += 1; + if current_page_is_article { + pages_scanned += 1; + } + "" + } + (false, Some(o), Some(c), false) if c > o => { // Whole text on one line. pages_seen += 1; if current_page_is_article { @@ -112,7 +141,7 @@ pub fn extract_kanji_freqs(dump_path: &Path) -> Result<HashMap<String, u32>, Can let body_start = after_open.find('>').map(|p| o + p + 1).unwrap_or(o); &line[body_start..c] } - (false, Some(o), None) => { + (false, Some(o), None, false) => { in_text = true; pages_seen += 1; if current_page_is_article { @@ -122,11 +151,11 @@ pub fn extract_kanji_freqs(dump_path: &Path) -> Result<HashMap<String, u32>, Can let body_start = after_open.find('>').map(|p| o + p + 1).unwrap_or(o); &line[body_start..] } - (true, _, Some(c)) => { + (true, _, Some(c), _) => { in_text = false; &line[..c] } - (true, _, None) => &line[..], + (true, _, None, _) => &line[..], _ => "", }; @@ -140,7 +169,7 @@ pub fn extract_kanji_freqs(dump_path: &Path) -> Result<HashMap<String, u32>, Can ); } - if last_progress.elapsed().as_secs() >= 10 { + if progress && last_progress.elapsed().as_secs() >= 10 { eprintln!( " ... 
{} pages ({} articles scanned), ~{} MB, {} surfaces", pages_seen, @@ -152,13 +181,15 @@ pub fn extract_kanji_freqs(dump_path: &Path) -> Result<HashMap<String, u32>, Can } } - eprintln!( - "Done. {} pages, {} articles scanned, ~{} MB, {} unique surfaces", - pages_seen, - pages_scanned, - bytes_seen >> 20, - freqs.len() - ); + if progress { + eprintln!( + "Done. {} pages, {} articles scanned, ~{} MB, {} unique surfaces", + pages_seen, + pages_scanned, + bytes_seen >> 20, + freqs.len() + ); + } Ok(freqs) } @@ -506,18 +537,24 @@ mod prose_tests { assert!(!freqs.contains_key("未閉")); } - /// Test helper: drive `extract_kanji_freqs` with an in-memory dump. + #[test] + fn self_closing_text_tag_is_handled() { + // `<text ... />` (empty content, e.g. for redirect / stub pages) + // must NOT flip in_text to true — otherwise subsequent XML metadata + // lines of the next page would be scanned as prose. + let dump = "<page>\n<ns>0</ns>\n<text bytes=\"0\" />\n</page>\n\ + <page>\n<ns>0</ns>\n<title>普通記事</title>\n<text>本文文章</text>\n</page>\n"; + let freqs = extract_kanji_freqs_from_str(dump).unwrap(); + // 本文文章 from the second page must be counted. + assert_eq!(freqs.get("本文文章"), Some(&1)); + // The XML metadata of page 2 (`普通記事`) must NOT + // be counted as prose — would leak if in_text stuck true. + assert!(!freqs.contains_key("普通記事")); + } + + /// Test helper: drive the stream parser with an in-memory dump. + /// Avoids tempfile flakiness in parallel test runs. 
fn extract_kanji_freqs_from_str(s: &str) -> Result<HashMap<String, u32>, CandidateError> { - let tmp = std::env::temp_dir().join(format!( - "lexime_test_dump_{}.xml", - std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap() - .as_nanos() - )); - std::fs::write(&tmp, s)?; - let r = extract_kanji_freqs(&tmp); - let _ = std::fs::remove_file(&tmp); - r + extract_kanji_freqs_from_reader(std::io::Cursor::new(s.as_bytes()), false) } } From 1e943efb61f821186478195e7f2c8e6d549301ac Mon Sep 17 00:00:00 2001 From: "SAKAI, Kazuaki" Date: Thu, 14 May 2026 19:19:20 +0900 Subject: [PATCH 7/9] =?UTF-8?q?fix(candidates):=20PR244=20Copilot=20R3=20?= =?UTF-8?q?=E2=80=94=201=20MINOR=20(doc),=201=20MINOR=20resolved=20as=20WO?= =?UTF-8?q?NTFIX?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit doc: `scan_prose_kanji_runs`'s doc comment said outside-block chars were "appended to a small local buffer". The implementation actually slices the input string directly (`&s[prose_start..i]`) and only the inner `scan_kanji_runs` reuses `buf` for the per-run kanji accumulator. Rewrote to match. The other R3 finding (perf: `dict.iter()` in `candidates_ops::corpus` materializes all readings/surfaces) is a dev-tool runtime-profile concern — same posture as the `dictool candidates mine` perf MINORs covered in feedback memory. The pipeline still completes in ~32s for jawiki-articles1.bz2 and the build dict surface set is ~10MB; not worth a lex-core API addition this PR. Resolved as WONTFIX. 
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> --- engine/crates/lex-cli/src/candidates/wikipedia.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/engine/crates/lex-cli/src/candidates/wikipedia.rs b/engine/crates/lex-cli/src/candidates/wikipedia.rs index c937e2d..616df83 100644 --- a/engine/crates/lex-cli/src/candidates/wikipedia.rs +++ b/engine/crates/lex-cli/src/candidates/wikipedia.rs @@ -198,9 +198,10 @@ pub(crate) fn extract_kanji_freqs_from_reader( /// stay closed. /// /// `tmpl_depth` increases on `{{`, decreases on `}}`. `in_ref` toggles on -/// `<ref ...>` / `</ref>`. Outside-block characters are appended to a small local -/// buffer that's flushed to `scan_kanji_runs` when a block opens/closes or at -/// the end of the slice. +/// `<ref ...>` / `</ref>`. Outside-block byte ranges are passed by reference +/// (`&s[prose_start..i]`) directly to `scan_kanji_runs` whenever a block +/// opens, closes, or the slice ends — no intermediate copy of the prose +/// itself; only the per-run `buf` inside `scan_kanji_runs` is reused. fn scan_prose_kanji_runs( s: &str, buf: &mut String, From 53bc79b15ed8a1780b7f6af18cdee46bd8b0b934 Mon Sep 17 00:00:00 2001 From: "SAKAI, Kazuaki" Date: Thu, 14 May 2026 19:24:13 +0900 Subject: [PATCH 8/9] docs(candidates): note UTF-8 safety invariant in scan_prose_kanji_runs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR244 Copilot R4 flagged the byte-indexed loop as potentially panicking on `&s[prose_start..i]` if a UTF-8 continuation byte matched `{` / `}` / `<`. That's not possible per the UTF-8 spec: continuation bytes are 0x80-0xBF, and our ASCII delimiters are 0x00-0x7F. Add an inline note so the invariant is visible in the source — preempts future re-raises without changing behavior. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- engine/crates/lex-cli/src/candidates/wikipedia.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/engine/crates/lex-cli/src/candidates/wikipedia.rs b/engine/crates/lex-cli/src/candidates/wikipedia.rs index 616df83..5e90221 100644 --- a/engine/crates/lex-cli/src/candidates/wikipedia.rs +++ b/engine/crates/lex-cli/src/candidates/wikipedia.rs @@ -212,6 +212,12 @@ fn scan_prose_kanji_runs( let bytes = s.as_bytes(); let mut i = 0; let mut prose_start = 0; // start of the current prose run (when not inside a block) + // UTF-8 safety: this loop is byte-indexed, but `&s[prose_start..i]` + // slicing is always at a char boundary because every advance of + // either index happens just past an ASCII delimiter byte (`{`, `}`, + // `<`, `>`, `/`, or whitespace — all 0x00-0x7F). UTF-8 continuation + // bytes are strictly 0x80-0xBF, so multi-byte chars (kanji etc.) + // cannot contribute a byte that matches any of our ASCII branches. while i < bytes.len() { // Inline match on 2-byte ASCII pairs and `` headers. // Using as_bytes lets us peek without UTF-8 decoding overhead; From e4b6fa693caa5eb541da932f2c9b746f12e89c06 Mon Sep 17 00:00:00 2001 From: "SAKAI, Kazuaki" Date: Thu, 14 May 2026 19:27:01 +0900 Subject: [PATCH 9/9] fix(candidates): satisfy rustfmt on UTF-8-safety doc comment The doc block landed as a trailing comment on `let mut prose_start = 0;` which rustfmt then re-indents to a confusing column. Move the comment to its own block above the binding so it formats cleanly. 
--- engine/crates/lex-cli/src/candidates/wikipedia.rs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/engine/crates/lex-cli/src/candidates/wikipedia.rs b/engine/crates/lex-cli/src/candidates/wikipedia.rs index 5e90221..cc8e22a 100644 --- a/engine/crates/lex-cli/src/candidates/wikipedia.rs +++ b/engine/crates/lex-cli/src/candidates/wikipedia.rs @@ -211,13 +211,15 @@ fn scan_prose_kanji_runs( ) { let bytes = s.as_bytes(); let mut i = 0; - let mut prose_start = 0; // start of the current prose run (when not inside a block) + // start of the current prose run (when not inside a block). + // // UTF-8 safety: this loop is byte-indexed, but `&s[prose_start..i]` - // slicing is always at a char boundary because every advance of - // either index happens just past an ASCII delimiter byte (`{`, `}`, - // `<`, `>`, `/`, or whitespace — all 0x00-0x7F). UTF-8 continuation - // bytes are strictly 0x80-0xBF, so multi-byte chars (kanji etc.) - // cannot contribute a byte that matches any of our ASCII branches. + // slicing is always at a char boundary because every advance of either + // index happens just past an ASCII delimiter byte (`{`, `}`, `<`, `>`, + // `/`, or whitespace — all 0x00-0x7F). UTF-8 continuation bytes are + // strictly 0x80-0xBF, so multi-byte chars (kanji etc.) cannot + // contribute a byte that matches any of our ASCII branches. + let mut prose_start = 0; while i < bytes.len() { // Inline match on 2-byte ASCII pairs and `` headers. // Using as_bytes lets us peek without UTF-8 decoding overhead;