Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions engine/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions engine/build-script-baseline.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
anyhow
bzip2-sys
camino
crc32fast
crossbeam-utils
Expand Down
3 changes: 3 additions & 0 deletions engine/crates/lex-cli/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,6 @@ toml = { workspace = true }
# Kept as a hard dep (not optional) since the CLI is a build/dev tool and
# not shipped with the IME itself.
zip = { version = "7", default-features = false, features = ["deflate"] }
# Used by `dictool candidates corpus` only — Wikipedia dumps ship as bz2.
# Decompressed as a stream, so we never materialize the full ~14GB XML on disk.
bzip2 = "0.4"
Comment thread
send marked this conversation as resolved.
39 changes: 39 additions & 0 deletions engine/crates/lex-cli/src/bin/dictool.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ use std::path::Path;

use clap::{Parser, Subcommand};

use lex_cli::candidates::wikipedia;
use lex_cli::commands::{candidates_ops, config_ops, convert_ops, dict_ops, user_dict_ops};

/// Parse a `SOURCE:DIR` pair for `--extra-source`.
Expand Down Expand Up @@ -255,6 +256,26 @@ enum CandidatesAction {
#[arg(long)]
clean: bool,
},
/// Mine kanji-run candidates from a Wikipedia XML dump (.xml or .xml.bz2).
///
/// Writes `wikipedia.tsv` with `surface\tfreq` rows for surfaces NOT in
/// the build dict, sorted by frequency descending. Reading-assignment is
/// done by hand on the top-N rows before promoting to `extras/`.
Corpus {
/// Path to the Wikipedia dump (.xml or .xml.bz2). User-supplied —
/// download from https://dumps.wikimedia.org/jawiki/latest/ first.
dump: String,
/// Build dict to diff against. Default: engine/data/lexime.dict
#[arg(long)]
build_dict: Option<String>,
/// Output dir. Default: engine/data/extras-candidates
#[arg(long)]
out_dir: Option<String>,
/// Drop surfaces with frequency below this. Default tracks
/// `candidates::wikipedia::DEFAULT_MIN_FREQ` so this stays in sync.
#[arg(long, default_value_t = wikipedia::DEFAULT_MIN_FREQ)]
min_freq: u32,
},
}

fn main() {
Expand Down Expand Up @@ -342,6 +363,24 @@ fn main() {
std::process::exit(1);
}
}
CandidatesAction::Corpus {
dump,
build_dict,
out_dir,
min_freq,
} => {
let out = out_dir
.map(std::path::PathBuf::from)
.unwrap_or_else(candidates_ops::default_out_dir);
let dict = build_dict
.map(std::path::PathBuf::from)
.unwrap_or_else(candidates_ops::default_build_dict);
let dump_path = std::path::PathBuf::from(dump);
if let Err(e) = candidates_ops::corpus(&dump_path, &dict, &out, min_freq) {
eprintln!("corpus: {e}");
std::process::exit(1);
}
}
},
Command::UserDict { file, action } => {
let path_str = file.unwrap_or_else(user_dict_ops::default_user_dict_path);
Expand Down
1 change: 1 addition & 0 deletions engine/crates/lex-cli/src/candidates/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
//! rarely useful for extras but kept for completeness.

pub mod sudachi;
pub mod wikipedia;

use std::fs;
use std::io::{self, BufWriter, Write};
Expand Down
Loading
Loading