From 968f63bdc159d4e7513af40c681153ebf40bdb06 Mon Sep 17 00:00:00 2001 From: "sds.rs" Date: Sun, 10 May 2026 05:07:34 +0800 Subject: [PATCH] refactor(indexer): split pipeline.rs into 7-module per-concern tree MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit src/indexer/pipeline.rs (2374 lines) → src/indexer/pipeline/{mod,embed, context,python_modules,resolve,index_files,tests}.rs. mod.rs (237 lines) keeps the public entry points (run_full_index, ensure_file_indexed, run_incremental_index{,_cached}) + IndexStats/IndexResult/ProgressFn + collect_dirty_node_ids glue. The Phase-0..3 orchestrator stays whole in index_files.rs (827 lines) — its phases share local transaction/atomics/ batch_parsed/name_to_ids state that splitting would have to thread back in via large arg lists. Per-concern submodules: embed.rs (71) — embed_and_store_batch + sequential fallback context.rs (197) — categorize_edges + format_route_from_metadata + regenerate_context_strings + repair_null_* python_modules.rs (73) — build_python_module_map + resolve_python_* resolve.rs (204) — refine_ambiguous_targets + resolve_pending_calls index_files.rs (827) — Phase 0..3 orchestrator + FileIndexed tests.rs (884) — all #[cfg(test)] tests Public surface preserved (`crate::indexer::pipeline::{run_full_index, ensure_file_indexed, run_incremental_index, run_incremental_index_cached, embed_and_store_batch, repair_null_context_strings, IndexStats, IndexResult, ProgressFn}`). External callers in cli.rs / mcp/server / tests / benches / claude-plugin all keep their imports unchanged. 
Verification: - cargo check: clean - cargo +1.95.0 clippy --no-default-features -- -D warnings: clean - cargo +1.95.0 clippy --all-targets -- -D warnings: clean - cargo test --release: 292 lib + 6 + 44 + 19 + 6 + 54 = 421 tests, 0 failed (1 pre-existing #[ignore]) Co-Authored-By: Claude Opus 4.7 (1M context) --- src/indexer/pipeline.rs | 2374 ------------------------ src/indexer/pipeline/context.rs | 197 ++ src/indexer/pipeline/embed.rs | 71 + src/indexer/pipeline/index_files.rs | 827 +++++++++ src/indexer/pipeline/mod.rs | 237 +++ src/indexer/pipeline/python_modules.rs | 73 + src/indexer/pipeline/resolve.rs | 204 ++ src/indexer/pipeline/tests.rs | 884 +++++++++ 8 files changed, 2493 insertions(+), 2374 deletions(-) delete mode 100644 src/indexer/pipeline.rs create mode 100644 src/indexer/pipeline/context.rs create mode 100644 src/indexer/pipeline/embed.rs create mode 100644 src/indexer/pipeline/index_files.rs create mode 100644 src/indexer/pipeline/mod.rs create mode 100644 src/indexer/pipeline/python_modules.rs create mode 100644 src/indexer/pipeline/resolve.rs create mode 100644 src/indexer/pipeline/tests.rs diff --git a/src/indexer/pipeline.rs b/src/indexer/pipeline.rs deleted file mode 100644 index 3bcd780..0000000 --- a/src/indexer/pipeline.rs +++ /dev/null @@ -1,2374 +0,0 @@ -use anyhow::Result; -use std::collections::{HashMap, HashSet}; -use std::path::Path; - -use rayon::prelude::*; - -use crate::embedding::context::{build_context_string, NodeContext}; -use crate::embedding::model::EmbeddingModel; -use crate::indexer::merkle::{compute_diff, hash_file, scan_directory, scan_directory_cached, DirectoryCache}; -use crate::parser::relations::extract_relations_from_tree; -use crate::parser::treesitter::{parse_tree, extract_nodes_from_tree}; -use crate::search::tokenizer::split_identifier; -use crate::storage::db::Database; -use crate::storage::queries::{ - delete_files_by_paths, delete_nodes_by_file, - get_all_file_hashes, get_all_node_names_with_ids, 
get_dirty_node_ids, get_edges_batch, - get_inbound_cross_file_edges, - get_nodes_by_file_path, - get_nodes_missing_context, get_nodes_with_files_by_ids, - insert_edge_cached, insert_node_cached, - insert_node_vectors_batch, update_context_strings_batch, upsert_file, - EdgeInfo, FileRecord, NodeRecord, NodeResult, -}; -use crate::domain::{REL_CALLS, REL_IMPORTS, REL_INHERITS, REL_ROUTES_TO, REL_IMPLEMENTS, REL_EXPORTS, max_file_size, CROSS_FILE_CALL_NOISE}; -use crate::utils::config::detect_language; - -/// Counters for indexing observability — tracks skipped items. -#[derive(Debug, Clone, Default)] -pub struct IndexStats { - pub files_skipped_size: usize, - pub files_skipped_parse: usize, - pub files_skipped_read: usize, - pub files_skipped_hash: usize, - pub files_skipped_language: usize, -} - -pub struct IndexResult { - pub files_indexed: usize, - pub nodes_created: usize, - pub edges_created: usize, - pub stats: IndexStats, -} - -/// Progress callback: called with (files_done, files_total) after each batch. -pub type ProgressFn<'a> = &'a dyn Fn(usize, usize); - -/// Extract "METHOD path" from route edge metadata JSON, falling back to the edge name. -fn format_route_from_metadata(metadata: Option<&str>, name: &str) -> String { - if let Some(meta) = metadata { - if let Ok(v) = serde_json::from_str::(meta) { - let method = v["method"].as_str().unwrap_or("ALL"); - if let Some(path) = v["path"].as_str() { - return format!("{} {}", method, path); - } - } - } - name.to_string() -} - -/// Embed context strings using batched inference and batch-insert vectors. -/// Public so the background embedding thread in server.rs can call it. -/// Wraps vector inserts in a transaction for atomicity and performance. 
-pub fn embed_and_store_batch(db: &Database, model: &EmbeddingModel, context_updates: &[(i64, String)]) -> Result<()> { - if context_updates.is_empty() { - return Ok(()); - } - - let t0 = std::time::Instant::now(); - let texts: Vec<&str> = context_updates.iter().map(|(_, ctx)| ctx.as_str()).collect(); - let ids: Vec = context_updates.iter().map(|(id, _)| *id).collect(); - - let embeddings = match model.embed_batch(&texts) { - Ok(embs) => embs, - Err(e) => { - tracing::warn!("Batch embed failed, falling back to sequential: {}", e); - // Fallback: sequential embed - let mut embs = Vec::new(); - for (i, text) in texts.iter().enumerate() { - match model.embed(text) { - Ok(emb) => embs.push(Some(emb)), - Err(e2) => { - tracing::warn!("Failed to embed node {}: {}", ids[i], e2); - embs.push(None); - } - } - } - let vectors: Vec<(i64, Vec)> = ids.iter().zip(embs) - .filter_map(|(&id, emb)| emb.map(|e| (id, e))) - .collect(); - if !vectors.is_empty() { - let tx = db.conn().unchecked_transaction()?; - insert_node_vectors_batch(db.conn(), &vectors)?; - tx.commit()?; - } - tracing::info!("[embed] {} nodes (sequential fallback) in {:.1}s", - context_updates.len(), t0.elapsed().as_secs_f64()); - return Ok(()); - } - }; - - let vectors: Vec<(i64, Vec)> = ids.into_iter().zip(embeddings).collect(); - let t_embed = t0.elapsed(); - - if !vectors.is_empty() { - let tx = db.conn().unchecked_transaction()?; - insert_node_vectors_batch(db.conn(), &vectors)?; - tx.commit()?; - } - - tracing::info!("[embed] {} nodes in {:.1}s (embed {:.1}s, store {:.1}s)", - context_updates.len(), - t0.elapsed().as_secs_f64(), - t_embed.as_secs_f64(), - (t0.elapsed() - t_embed).as_secs_f64(), - ); - Ok(()) -} - -struct CategorizedEdges { - callees: Vec, - callers: Vec, - inherits: Vec, - routes: Vec, - imports: Vec, - implements: Vec, - exports: Vec, -} - -fn categorize_edges(edges: Option<&Vec>, format_route: impl Fn(Option<&str>, &str) -> String) -> CategorizedEdges { - let mut result = CategorizedEdges 
{ - callees: Vec::new(), - callers: Vec::new(), - inherits: Vec::new(), - routes: Vec::new(), - imports: Vec::new(), - implements: Vec::new(), - exports: Vec::new(), - }; - if let Some(edge_list) = edges { - for (relation, direction, name, metadata) in edge_list { - match (relation.as_str(), direction.as_str()) { - (rel, "out") if rel == REL_CALLS => result.callees.push(name.clone()), - (rel, "in") if rel == REL_CALLS => result.callers.push(name.clone()), - (rel, "out") if rel == REL_INHERITS => result.inherits.push(name.clone()), - (rel, "out") if rel == REL_ROUTES_TO => { - result.routes.push(format_route(metadata.as_deref(), name)); - } - (rel, "out") if rel == REL_IMPORTS => result.imports.push(name.clone()), - (rel, "out") if rel == REL_IMPLEMENTS => result.implements.push(name.clone()), - (rel, "out") if rel == REL_EXPORTS => result.exports.push(name.clone()), - _ => {} - } - } - } - result -} - -pub fn run_full_index(db: &Database, project_root: &Path, model: Option<&EmbeddingModel>, progress: Option) -> Result { - let current_hashes = scan_directory(project_root)?; - let files: Vec = current_hashes.keys().cloned().collect(); - index_files(db, project_root, &files, ¤t_hashes, model, &[], progress) -} - -/// Reindex a single file when its on-disk hash differs from the stored hash. -/// No-op when the hashes match (or `rel_path` was never indexed in a way that -/// would currently reindex it). Returns true when a reindex (or stale-row -/// cleanup) actually fired. -/// -/// Used by query-time freshness: when an MCP tool receives an explicit -/// `file_path` argument, the agent is signaling "I just edited this; please -/// answer against the current bytes." The 30s `last_incremental_check` -/// debounce in the server is too coarse for tight Edit→search loops. 
-/// -/// Cross-file dirty-edge handling mirrors `run_incremental_index`: collect -/// dirty node IDs **before** re-indexing (cascade delete strips old edges), -/// then regenerate context strings + embeddings once the new nodes exist. -pub fn ensure_file_indexed( - db: &Database, - project_root: &Path, - rel_path: &str, - model: Option<&EmbeddingModel>, -) -> Result { - let abs_path = project_root.join(rel_path); - - // Missing-file path: drop stale row so future queries don't return phantom nodes. - if !abs_path.is_file() { - let exists_in_db: Option = db.conn().query_row( - "SELECT id FROM files WHERE path = ?1", - [rel_path], - |row| row.get(0), - ).ok(); - if exists_in_db.is_some() { - let tx = db.conn().unchecked_transaction()?; - delete_files_by_paths(db.conn(), &[rel_path.to_string()])?; - tx.commit()?; - return Ok(true); - } - return Ok(false); - } - - // Skip files we wouldn't index in the first place (binary / wrong language). - if crate::utils::config::detect_language(rel_path).is_none() { - return Ok(false); - } - - let on_disk_hash = crate::indexer::merkle::hash_file(&abs_path)?; - let stored_hash: Option = db.conn().query_row( - "SELECT blake3_hash FROM files WHERE path = ?1", - [rel_path], - |row| row.get(0), - ).ok(); - - if stored_hash.as_deref() == Some(&on_disk_hash) { - return Ok(false); - } - - // Cross-file edges into this file's nodes need their context strings rebuilt - // *after* the node IDs are replaced — capture the dirty set BEFORE re-indexing. 
- let dirty_node_ids = collect_dirty_node_ids(db, std::slice::from_ref(&rel_path.to_string()))?; - - let mut hashes: HashMap = HashMap::new(); - hashes.insert(rel_path.to_string(), on_disk_hash); - let files = vec![rel_path.to_string()]; - index_files(db, project_root, &files, &hashes, model, &[], None)?; - - if !dirty_node_ids.is_empty() { - regenerate_context_strings(db, &dirty_node_ids, model)?; - } - Ok(true) -} - -pub fn run_incremental_index(db: &Database, project_root: &Path, model: Option<&EmbeddingModel>, progress: Option) -> Result { - let start = std::time::Instant::now(); - let stored_hashes = get_all_file_hashes(db.conn())?; - let current_hashes = scan_directory(project_root)?; - let diff = compute_diff(&stored_hashes, ¤t_hashes); - - // Preserve pseudo-file across incremental indexes - let deleted_files: Vec = diff.deleted_files.into_iter() - .filter(|p| p != "") - .collect(); - let to_index: Vec = [diff.new_files, diff.changed_files].concat(); - - let dirty_node_ids = if !to_index.is_empty() { - collect_dirty_node_ids(db, &to_index)? - } else { - HashSet::new() - }; - - let result = index_files(db, project_root, &to_index, ¤t_hashes, model, &deleted_files, progress)?; - - if !dirty_node_ids.is_empty() { - regenerate_context_strings(db, &dirty_node_ids, model)?; - } - - if result.files_indexed > 0 || !deleted_files.is_empty() { - tracing::info!( - "[incremental] {} files changed, {} deleted, {} nodes, {} edges, {:.1}s", - result.files_indexed, deleted_files.len(), - result.nodes_created, result.edges_created, - start.elapsed().as_secs_f64() - ); - } - - Ok(result) -} - -/// Incremental index with directory mtime cache for faster scanning. -/// Files in unchanged directories are skipped entirely. 
-pub fn run_incremental_index_cached( - db: &Database, - project_root: &Path, - model: Option<&EmbeddingModel>, - dir_cache: Option<&DirectoryCache>, - progress: Option, -) -> Result<(IndexResult, DirectoryCache)> { - let start = std::time::Instant::now(); - let stored_hashes = get_all_file_hashes(db.conn())?; - let (mut current_hashes, new_cache) = scan_directory_cached(project_root, dir_cache)?; - - // Merge stored hashes for files in unchanged directories. - // scan_directory_cached skips files in unchanged dirs, so we need to - // carry forward their stored hashes to prevent false "deleted" diffs. - // Use new_cache.file_mtimes (populated for ALL walked files) to check existence - // without per-file stat calls. - for (path, hash) in &stored_hashes { - if !current_hashes.contains_key(path) && new_cache.file_exists(path) { - current_hashes.insert(path.clone(), hash.clone()); - } - } - - let diff = compute_diff(&stored_hashes, ¤t_hashes); - - // Preserve pseudo-file across incremental indexes - let deleted_files: Vec = diff.deleted_files.into_iter() - .filter(|p| p != "") - .collect(); - let to_index: Vec = [diff.new_files, diff.changed_files].concat(); - - let dirty_node_ids = if !to_index.is_empty() { - collect_dirty_node_ids(db, &to_index)? - } else { - HashSet::new() - }; - - let result = index_files(db, project_root, &to_index, ¤t_hashes, model, &deleted_files, progress)?; - - if !dirty_node_ids.is_empty() { - regenerate_context_strings(db, &dirty_node_ids, model)?; - } - - if result.files_indexed > 0 || !deleted_files.is_empty() { - tracing::info!( - "[incremental] {} files changed, {} deleted, {} nodes, {} edges, {:.1}s", - result.files_indexed, deleted_files.len(), - result.nodes_created, result.edges_created, - start.elapsed().as_secs_f64() - ); - } - - Ok((result, new_cache)) -} - -/// Collect node IDs in OTHER files that have edges pointing to nodes in the changed files. -/// Must be called BEFORE re-indexing (cascade delete removes old edges). 
-fn collect_dirty_node_ids(db: &Database, changed_paths: &[String]) -> Result> { - let mut changed_file_ids = Vec::new(); - for path in changed_paths { - let file_id: Option = db.conn().query_row( - "SELECT id FROM files WHERE path = ?1", - [path], - |row| row.get(0), - ).ok(); - if let Some(id) = file_id { - changed_file_ids.push(id); - } - } - let ids = get_dirty_node_ids(db.conn(), &changed_file_ids)?; - Ok(ids.into_iter().collect()) -} - -/// Regenerate context strings (and embeddings) for the given set of dirty nodes. -fn regenerate_context_strings(db: &Database, dirty_ids: &HashSet, model: Option<&EmbeddingModel>) -> Result<()> { - let tx = db.conn().unchecked_transaction()?; - let id_vec: Vec = dirty_ids.iter().copied().collect(); - let all_edges = get_edges_batch(db.conn(), &id_vec)?; - let all_nodes: HashMap)> = { - let nwfs = get_nodes_with_files_by_ids(db.conn(), &id_vec)?; - nwfs.into_iter().map(|nwf| (nwf.node.id, (nwf.node, nwf.file_path, nwf.language))).collect() - }; - - // Build all context strings first - let mut context_updates: Vec<(i64, String)> = Vec::with_capacity(dirty_ids.len()); - for &node_id in dirty_ids { - if let Some((node, file_path, language)) = all_nodes.get(&node_id) { - let edges = all_edges.get(&node_id); - let cat = categorize_edges(edges, format_route_from_metadata); - - let ctx = build_context_string(&NodeContext { - node_type: node.node_type.clone(), - name: node.name.clone(), - qualified_name: node.qualified_name.clone(), - file_path: file_path.clone(), - language: language.clone(), - signature: node.signature.clone(), - return_type: node.return_type.clone(), - param_types: node.param_types.clone(), - code_content: Some(node.code_content.clone()), - routes: cat.routes, - callees: cat.callees, - callers: cat.callers, - inherits: cat.inherits, - imports: cat.imports, - implements: cat.implements, - exports: cat.exports, - doc_comment: node.doc_comment.clone(), - }); - - context_updates.push((node_id, ctx)); - } - } - - // 
Batch update context strings - update_context_strings_batch(db.conn(), &context_updates)?; - tx.commit()?; - - // Embed outside the committed tx — recoverable on failure - if let Some(m) = model { - if db.vec_enabled() { - embed_and_store_batch(db, m, &context_updates)?; - } - } - Ok(()) -} - -/// Repair nodes that have NULL context_string (likely from a failed Phase 3). -/// This is called at startup after index verification. -pub fn repair_null_context_strings( - db: &Database, - model: Option<&EmbeddingModel>, -) -> Result { - let missing_ids = get_nodes_missing_context(db.conn())?; - if missing_ids.is_empty() { - return Ok(0); - } - - tracing::info!("[repair] Found {} nodes with NULL context_string, rebuilding...", missing_ids.len()); - - // Load node details with file paths - let nodes_with_files = get_nodes_with_files_by_ids(db.conn(), &missing_ids)?; - - // Load edges for all affected nodes in one batch - let all_edges = get_edges_batch(db.conn(), &missing_ids)?; - - // Build context strings - let mut context_updates: Vec<(i64, String)> = Vec::new(); - for nwf in &nodes_with_files { - let node = &nwf.node; - let edges = all_edges.get(&node.id); - let cat = categorize_edges(edges, format_route_from_metadata); - - let ctx = build_context_string(&NodeContext { - node_type: node.node_type.clone(), - name: node.name.clone(), - qualified_name: node.qualified_name.clone(), - file_path: nwf.file_path.clone(), - language: nwf.language.clone(), - signature: node.signature.clone(), - return_type: node.return_type.clone(), - param_types: node.param_types.clone(), - code_content: Some(node.code_content.clone()), - routes: cat.routes, - callees: cat.callees, - callers: cat.callers, - inherits: cat.inherits, - imports: cat.imports, - implements: cat.implements, - exports: cat.exports, - doc_comment: node.doc_comment.clone(), - }); - - context_updates.push((node.id, ctx)); - } - - // Update in DB within a transaction (avoids per-row fsync under autocommit) - if 
!context_updates.is_empty() { - let tx = db.conn().unchecked_transaction()?; - update_context_strings_batch(db.conn(), &context_updates)?; - tx.commit()?; - - // Re-embed if model available - if let Some(m) = model { - if db.vec_enabled() { - embed_and_store_batch(db, m, &context_updates)?; - } - } - } - - let count = context_updates.len(); - tracing::info!("[repair] Repaired context strings for {} nodes", count); - Ok(count) -} - -/// Batch size for streaming indexing. Each batch processes Phase 1+2 -/// then drops heavyweight data (ASTs, source strings) before the next batch. -const BATCH_SIZE: usize = 500; - -/// Lightweight post-batch record — no Tree or source string. -struct FileIndexed { - rel_path: String, - node_ids: Vec, - node_names: Vec, -} - -/// Build mapping from Python dotted module paths to file paths. -/// Registers both full paths and suffix paths for flexible matching. -/// e.g., "src/myapp/utils.py" matches "src.myapp.utils", "myapp.utils", and "utils". -fn build_python_module_map(python_paths: &HashSet) -> HashMap> { - let mut map: HashMap> = HashMap::new(); - for path in python_paths { - let stripped = if let Some(s) = path.strip_suffix("/__init__.py") { - s - } else if let Some(s) = path.strip_suffix(".py") { - s - } else { - continue; - }; - - // Register all suffix module paths for flexible matching - // e.g., "src/myapp/utils" -> "src.myapp.utils", "myapp.utils", "utils" - let parts: Vec<&str> = stripped.split('/').collect(); - for i in 0..parts.len() { - let dotted = parts[i..].join("."); - map.entry(dotted).or_default().push(path.clone()); - } - } - // Deduplicate - for paths in map.values_mut() { - paths.sort(); - paths.dedup(); - } - map -} - -/// Resolve Python import targets using pre-parsed module metadata. -/// For `import X` (is_module_import): finds `` nodes in resolved files. -/// For `from X import Y`: finds nodes named Y only in resolved files. -/// Returns None if module can't be resolved or no matching nodes found. 
-fn resolve_python_module_targets( - python_module: &str, - is_module_import: bool, - target_name: &str, - python_module_map: &HashMap>, - node_id_to_path: &HashMap, - name_to_ids: &HashMap>, -) -> Option> { - // Resolve module path to file path(s). - // Note: suffix matching in python_module_map means `import utils` may match - // multiple files (e.g., "myapp/utils.py" and "other/utils.py"). This is an - // inherent ambiguity without sys.path context; over-connecting is safer for - // dependency analysis than missing real dependencies. - let module_files = python_module_map.get(python_module)?; - - let lookup_name = if is_module_import { "" } else { target_name }; - let all_ids = name_to_ids.get(lookup_name)?; - let targets: Vec = all_ids.iter() - .filter(|nid| { - node_id_to_path.get(nid) - .map(|p| module_files.contains(p)) - .unwrap_or(false) - }) - .copied() - .collect(); - if targets.is_empty() { None } else { Some(targets) } -} - -fn index_files( - db: &Database, - root: &Path, - files: &[String], - hashes: &HashMap, - model: Option<&EmbeddingModel>, - delete_paths: &[String], - progress: Option, -) -> Result { - // SAFETY: unchecked_transaction is used because rusqlite's Transaction borrows - // &mut Connection, preventing other borrows during the transaction. Here we need - // both the transaction and read access via db.conn() (which returns &Connection - // to the same underlying connection). This is safe because: - // (1) db.conn() returns the same Connection the tx was opened on, - // (2) we never open nested transactions, - // (3) concurrent access (e.g. background embedding thread) uses separate - // DB connections; safety relies on SQLite WAL mode + busy_timeout(5000), - // not single-threadedness. 
- - use std::sync::atomic::{AtomicUsize, Ordering as AtomicOrdering}; - let skipped_size = AtomicUsize::new(0); - let skipped_parse = AtomicUsize::new(0); - let skipped_read = AtomicUsize::new(0); - let skipped_hash = AtomicUsize::new(0); - let skipped_language = AtomicUsize::new(0); - - let mut total_nodes_created = 0usize; - let mut total_edges_created = 0usize; - let mut all_indexed: Vec = Vec::new(); - - // Phase 0: Delete removed files in own transaction. - // - // Before cascade strips inbound REL_CALLS edges, capture them as pending - // rows. Without this, deleting file A wipes B's edge to A.foo and B is - // not in `delete_paths` (so Phase 2 won't re-extract it), leaving B with - // neither an edge nor a pending row — the same staleness window the - // "callee added later" buffering closes, just from the deletion side. - // Both directions need to round-trip through pending or the v0.18.2 fix - // is only half-complete. - if !delete_paths.is_empty() { - let tx = db.conn().unchecked_transaction()?; - - // Resolve file IDs once (delete_files_by_paths drops them) so we can - // query inbound calls before cascade fires. 
- let mut deleted_file_ids: Vec = Vec::with_capacity(delete_paths.len()); - for path in delete_paths { - if let Ok(Some(fid)) = db.conn().query_row( - "SELECT id FROM files WHERE path = ?1", - [path], - |row| row.get::<_, Option>(0), - ) { - deleted_file_ids.push(fid); - } - } - - let mut buffered = 0usize; - for fid in &deleted_file_ids { - let inbound = crate::storage::queries::get_inbound_calls_for_pending(db.conn(), *fid)?; - for (source_id, target_name, source_language, metadata) in inbound { - crate::storage::queries::insert_pending_unresolved_call( - db.conn(), - source_id, - &target_name, - &source_language, - metadata.as_deref(), - )?; - buffered += 1; - } - } - if buffered > 0 { - tracing::info!( - "[index] Phase 0: buffered {} inbound calls before cascade-deleting {} file(s)", - buffered, deleted_file_ids.len() - ); - } - - delete_files_by_paths(db.conn(), delete_paths)?; - tx.commit()?; - } - - // CPU-bound parse result — produced in parallel, consumed sequentially for DB insert - struct FilePreParsed { - rel_path: String, - source: String, - language: String, - tree: tree_sitter::Tree, - hash: String, - last_modified: i64, - parsed_nodes: Vec, - } - - // Pre-build Python module map once (used in all batches for import resolution) - let mut all_python_paths: HashSet = files.iter() - .filter(|f| f.ends_with(".py")) - .cloned() - .collect(); - { - let mut stmt = db.conn().prepare("SELECT path FROM files WHERE path LIKE '%.py'")?; - let rows = stmt.query_map([], |row| row.get::<_, String>(0))?; - for row in rows { - all_python_paths.insert(row?); - } - } - let python_module_map = build_python_module_map(&all_python_paths); - - // Pre-load global name->[(id, path, language)] map once before the batch loop. - // This avoids a full table scan per batch in Phase 2 relation resolution. - // The map is updated incrementally as each batch commits new nodes. 
- // `language` drives same-language-preferred resolution to avoid cross-language - // bare-name collisions (e.g. Rust `hasher.update()` resolving to JS `function update`). - let mut global_name_map: HashMap> = - get_all_node_names_with_ids(db.conn())?; - - // Heavyweight per-file data used during Phase 1+2, dropped after each batch - #[allow(dead_code)] - struct FileParsed { - rel_path: String, - source: String, - language: String, - tree: tree_sitter::Tree, - file_id: i64, - node_ids: Vec, - node_names: Vec, - } - - // Process files in batches — each batch does Phase 1 + Phase 2 - for batch in files.chunks(BATCH_SIZE) { - let tx = db.conn().unchecked_transaction()?; - - // --- Phase 1a: Parallel CPU-bound work (read + parse + extract nodes) --- - let pre_parsed: Vec = batch - .par_iter() - .filter_map(|rel_path| { - let language = match detect_language(rel_path) { - Some(l) => l, - None => { - skipped_language.fetch_add(1, AtomicOrdering::Relaxed); - return None; - } - }; - let abs_path = root.join(rel_path); - - let file_meta = std::fs::metadata(&abs_path).ok(); - if let Some(ref meta) = file_meta { - if meta.len() > max_file_size() { - tracing::debug!("Skipping large file ({} bytes): {}", meta.len(), rel_path); - skipped_size.fetch_add(1, AtomicOrdering::Relaxed); - return None; - } - } - - let source = match std::fs::read_to_string(&abs_path) { - Ok(s) => s, - Err(e) => { - tracing::warn!("Skipping file {}: {}", rel_path, e); - skipped_read.fetch_add(1, AtomicOrdering::Relaxed); - return None; - } - }; - - let hash = match hashes.get(rel_path.as_str()) { - Some(h) => h.clone(), - None => match hash_file(&abs_path) { - Ok(h) => h, - Err(e) => { - tracing::warn!("Skipping file (hash error): {}: {}", rel_path, e); - skipped_hash.fetch_add(1, AtomicOrdering::Relaxed); - return None; - } - }, - }; - - let tree = match parse_tree(&source, language) { - Ok(t) => t, - Err(e) => { - tracing::warn!("Parse failed for {}: {}", rel_path, e); - skipped_parse.fetch_add(1, 
AtomicOrdering::Relaxed); - return None; - } - }; - - let last_modified = file_meta - .and_then(|m| m.modified().ok()) - .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok()) - .map(|d| d.as_secs() as i64) - .unwrap_or(0); - - let parsed_nodes = extract_nodes_from_tree(&tree, &source, language); - - Some(FilePreParsed { - rel_path: rel_path.clone(), - source, - language: language.to_string(), - tree, - hash, - last_modified, - parsed_nodes, - }) - }) - .collect(); - - let mut batch_parsed: Vec = Vec::new(); - // Saved inbound edges from other files → batch files (to restore after cascade delete) - // Tuple: (source_id, source_file_id, target_name, relation, metadata) - let mut saved_inbound_edges: Vec<(i64, i64, String, String, Option)> = Vec::new(); - // Track file_ids in this batch to filter intra-batch edges in Phase 2c - let mut batch_file_ids: HashSet = HashSet::new(); - - // --- Phase 1b: Sequential DB inserts --- - for pp in pre_parsed { - let file_id = upsert_file(db.conn(), &FileRecord { - path: pp.rel_path.clone(), - blake3_hash: pp.hash, - last_modified: pp.last_modified, - language: Some(pp.language.clone()), - })?; - - // Save cross-file inbound edges before cascade delete destroys them - saved_inbound_edges.extend(get_inbound_cross_file_edges(db.conn(), file_id)?); - batch_file_ids.insert(file_id); - - delete_nodes_by_file(db.conn(), file_id)?; - - let mut node_ids = Vec::new(); - let mut node_names = Vec::new(); - - let module_node_id = insert_node_cached(db.conn(), &NodeRecord { - file_id, - node_type: "module".into(), - name: "".into(), - qualified_name: Some(pp.rel_path.clone()), - start_line: 1, - end_line: pp.source.lines().count() as i64, - code_content: String::new(), - signature: None, - doc_comment: None, - context_string: None, - name_tokens: None, - return_type: None, - param_types: None, - is_test: false, - })?; - node_ids.push(module_node_id); - node_names.push("".into()); - total_nodes_created += 1; - - for pn in &pp.parsed_nodes 
{ - let name_tokens = split_identifier(&pn.name); - let node_id = insert_node_cached(db.conn(), &NodeRecord { - file_id, - node_type: pn.node_type.clone(), - name: pn.name.clone(), - qualified_name: pn.qualified_name.clone(), - start_line: pn.start_line as i64, - end_line: pn.end_line as i64, - code_content: pn.code_content.clone(), - signature: pn.signature.clone(), - doc_comment: pn.doc_comment.clone(), - context_string: None, - name_tokens: Some(name_tokens), - return_type: pn.return_type.clone(), - param_types: pn.param_types.clone(), - is_test: pn.is_test, - })?; - node_ids.push(node_id); - node_names.push(pn.name.clone()); - total_nodes_created += 1; - } - - batch_parsed.push(FileParsed { - rel_path: pp.rel_path, - source: pp.source, - language: pp.language, - tree: pp.tree, - file_id, - node_ids, - node_names, - }); - } - - // --- Phase 2: Extract relations + insert edges --- - // Build per-batch name_to_ids and node_id_to_path from the pre-loaded global map, - // excluding files in the current batch (their old nodes were deleted in Phase 1b). - let batch_file_paths: HashSet<&str> = batch_parsed.iter() - .map(|pf| pf.rel_path.as_str()).collect(); - - let mut name_to_ids: HashMap> = HashMap::new(); - let mut node_id_to_path: HashMap = HashMap::new(); - // Per-node language for same-language-preferred edge resolution (§ cross-lang collision). 
- let mut node_id_to_language: HashMap> = HashMap::new(); - - // Add current batch's newly inserted nodes - for pf in &batch_parsed { - for (id, name) in pf.node_ids.iter().zip(pf.node_names.iter()) { - name_to_ids.entry(name.clone()).or_default().push(*id); - node_id_to_path.insert(*id, pf.rel_path.clone()); - node_id_to_language.insert(*id, Some(pf.language.clone())); - } - } - - // Add nodes from the global map, excluding those in current batch's files - // (their old nodes were deleted and replaced by new ones above) - for (name, entries) in &global_name_map { - for (id, path, language) in entries { - if !batch_file_paths.contains(path.as_str()) { - name_to_ids.entry(name.clone()).or_default().push(*id); - node_id_to_path.insert(*id, path.clone()); - node_id_to_language.insert(*id, language.clone()); - } - } - } - - for ids in name_to_ids.values_mut() { - ids.sort(); - ids.dedup(); - } - - // Track unresolved external Python imports: (source_module_node_id, module_name) - let mut external_python_imports: Vec<(i64, String)> = Vec::new(); - // Track unresolved external symbols for sentinel node creation: - // (source_id, target_name, relation) — e.g., implements edges to external traits - let mut unresolved_externals: Vec<(i64, String, String)> = Vec::new(); - - for pf in &batch_parsed { - let relations = extract_relations_from_tree(&pf.tree, &pf.source, &pf.language); - let local_ids: HashSet = pf.node_ids.iter().copied().collect(); - - for rel in &relations { - // Contract: extract_relations_from_tree stamps every relation with - // source_language equal to the language argument. The - // same-language resolution at line 811+ depends on it. Hard - // error instead of debug_assert so a parser regression fails - // loudly in release builds too (one string compare per - // relation is negligible against the SQL writes below). 
- if rel.source_language != pf.language { - anyhow::bail!( - "ParsedRelation.source_language ({}) does not match file language ({}); \ - parser regressed the source_language contract", - rel.source_language, pf.language - ); - } - - let source_ids = pf.node_names.iter() - .zip(pf.node_ids.iter()) - .filter(|(name, _)| *name == &rel.source_name) - .map(|(_, id)| *id) - .collect::>(); - - // Try Python module-constrained resolution for import edges - if rel.relation == REL_IMPORTS { - if let Some(ref meta_str) = rel.metadata { - if let Ok(meta) = serde_json::from_str::(meta_str) { - if let Some(python_module) = meta.get("python_module").and_then(|v| v.as_str()) { - let is_module_import = meta.get("is_module_import") - .and_then(|v| v.as_bool()).unwrap_or(false); - if python_module_map.contains_key(python_module) { - // Internal module — try constrained resolution - if let Some(module_targets) = resolve_python_module_targets( - python_module, is_module_import, &rel.target_name, - &python_module_map, &node_id_to_path, &name_to_ids, - ) { - for &src_id in &source_ids { - for &tgt_id in &module_targets { - if src_id != tgt_id - && insert_edge_cached(db.conn(), src_id, tgt_id, &rel.relation, rel.metadata.as_deref())? { - total_edges_created += 1; - } - } - } - continue; - } - // Module found but symbol not found — fall through to default - } else { - // External module — track for virtual node creation. - // For `from X import Y`, we track the module-level dependency (X), - // not the individual symbol (Y), since we can't index external code. - for &src_id in &source_ids { - external_python_imports.push((src_id, python_module.to_string())); - } - continue; // No point in default resolution for external imports - } - } - } - } - } - - // Default resolution: global name-based lookup with language-aware layering. - // Tier order: same-file → same-language → (calls: drop) / (other: global). 
- // Dropping calls without a same-language match prevents Rust `hasher.update()` - // binding to an unrelated JS `function update()` via bare-name collision. - let all_target_ids = name_to_ids.get(&rel.target_name) - .cloned() - .unwrap_or_default(); - - let same_file_targets: Vec = all_target_ids.iter() - .filter(|id| local_ids.contains(id)) - .copied() - .collect(); - - let source_lang = pf.language.as_str(); - let same_language_targets: Vec = all_target_ids.iter() - .filter(|id| !local_ids.contains(id)) - .filter(|id| matches!( - node_id_to_language.get(id).and_then(|l| l.as_deref()), - Some(l) if l == source_lang - )) - .copied() - .collect(); - - let target_ids = if !same_file_targets.is_empty() { - same_file_targets - } else if rel.relation == REL_CALLS - && CROSS_FILE_CALL_NOISE.contains(&rel.target_name.as_str()) - { - // Stdlib method names (new/default/from) — drop regardless of language. - continue; - } else if !same_language_targets.is_empty() { - // Ambiguous cross-file same-language candidates (e.g. a helper - // name like `readJson` defined in multiple JS files) used to - // fan out — every same-name target got an edge, producing - // phantom callers across unrelated modules. Refine by - // non-test preference + longest common path prefix with the - // caller file. See `refine_ambiguous_targets` for fallback - // policy (keeps remaining pool on ambiguity to avoid - // regressing dead-code on bare-name Rust scoped calls). - refine_ambiguous_targets( - &same_language_targets, - &pf.rel_path, - &node_id_to_path, - ) - } else if rel.relation == REL_CALLS { - // No same-file, no same-language candidate → buffer in - // pending_unresolved_calls instead of silently dropping. - // The post-Phase-2 sweep below promotes the row to a real - // edge as soon as a same-language target appears (e.g. - // sibling file added in a later incremental pass). 
Memory - // `feedback_incremental_edge_timing.md` documented the bug - // this closes: B's bare-name call to `foo()` got dropped - // when foo didn't exist yet, and never re-resolved when A - // later added `foo`. Schema cascade on source_id self-cleans - // when callers are removed/reindexed. - for &src_id in &source_ids { - crate::storage::queries::insert_pending_unresolved_call( - db.conn(), - src_id, - &rel.target_name, - &pf.language, - rel.metadata.as_deref(), - )?; - } - continue; - } else { - all_target_ids - }; - - if target_ids.is_empty() - && (rel.relation == REL_IMPLEMENTS || rel.relation == REL_IMPORTS) - { - // Unresolved implements target (external trait like Write, Default) - // OR unresolved import target (JS `require('fs')`, unresolved JS - // ES-import binding). Phase 2b-ext creates `/` - // sentinel nodes so the dependency graph shows the link. - for &src_id in &source_ids { - unresolved_externals.push((src_id, rel.target_name.clone(), rel.relation.clone())); - } - } else { - for &src_id in &source_ids { - for &tgt_id in &target_ids { - if (src_id != tgt_id || rel.relation == REL_ROUTES_TO) - && insert_edge_cached(db.conn(), src_id, tgt_id, &rel.relation, rel.metadata.as_deref())? { - total_edges_created += 1; - } - } - } - } - } - } - - // Phase 2b: Create virtual nodes for external Python imports - if !external_python_imports.is_empty() { - let ext_file_id = upsert_file(db.conn(), &FileRecord { - path: "".into(), - blake3_hash: "external".into(), - last_modified: 0, - language: Some("external".into()), - })?; - - // Load existing external module nodes to avoid duplicates - let existing_ext_nodes: HashMap = - get_nodes_by_file_path(db.conn(), "")? 
- .into_iter() - .map(|n| (n.name.clone(), n.id)) - .collect(); - - let unique_modules: HashSet = external_python_imports.iter() - .map(|(_, m)| m.clone()).collect(); - - let mut ext_node_ids: HashMap = existing_ext_nodes; - for module_name in &unique_modules { - if !ext_node_ids.contains_key(module_name) { - let node_id = insert_node_cached(db.conn(), &NodeRecord { - file_id: ext_file_id, - node_type: "external_module".into(), - name: module_name.clone(), - qualified_name: Some(format!("/{}", module_name)), - start_line: 0, - end_line: 0, - code_content: String::new(), - signature: None, - doc_comment: None, - context_string: None, - name_tokens: None, - return_type: None, - param_types: None, - is_test: false, - })?; - ext_node_ids.insert(module_name.clone(), node_id); - total_nodes_created += 1; - } - } - - for (source_id, module_name) in &external_python_imports { - if let Some(&ext_id) = ext_node_ids.get(module_name) { - if insert_edge_cached(db.conn(), *source_id, ext_id, REL_IMPORTS, None)? { - total_edges_created += 1; - } - } - } - } - - // Phase 2b-ext: Create sentinel nodes for unresolved external symbols - // (e.g., Rust `impl Write for SharedStdout` where Write is from std::io) - if !unresolved_externals.is_empty() { - let ext_file_id = upsert_file(db.conn(), &FileRecord { - path: "".into(), - blake3_hash: "external".into(), - last_modified: 0, - language: Some("external".into()), - })?; - - let existing_ext_nodes: HashMap = - get_nodes_by_file_path(db.conn(), "")? 
- .into_iter() - .map(|n| (n.name.clone(), n.id)) - .collect(); - - let mut ext_node_ids: HashMap = existing_ext_nodes; - - // Collect unique targets with inferred type - let unique_targets: HashMap<&str, &str> = unresolved_externals.iter() - .map(|(_, name, rel)| { - let node_type = if rel == REL_IMPLEMENTS { "trait" } else { "module" }; - (name.as_str(), node_type) - }) - .collect(); - - for (&name, &node_type) in &unique_targets { - if !ext_node_ids.contains_key(name) { - let node_id = insert_node_cached(db.conn(), &NodeRecord { - file_id: ext_file_id, - node_type: node_type.into(), - name: name.into(), - qualified_name: Some(format!("/{}", name)), - start_line: 0, - end_line: 0, - code_content: String::new(), - signature: None, - doc_comment: None, - context_string: None, - name_tokens: None, - return_type: None, - param_types: None, - is_test: false, - })?; - ext_node_ids.insert(name.into(), node_id); - total_nodes_created += 1; - } - } - - for (source_id, target_name, relation) in &unresolved_externals { - if let Some(&ext_id) = ext_node_ids.get(target_name.as_str()) { - if insert_edge_cached(db.conn(), *source_id, ext_id, relation, None)? { - total_edges_created += 1; - } - } - } - } - - // Phase 2c: Restore cross-file inbound edges lost to cascade delete. - // When a file is re-indexed, its old nodes are deleted (cascade-deleting edges). - // Edges from OTHER files into the re-indexed file must be rebuilt using new node IDs. 
- if !saved_inbound_edges.is_empty() { - // Build name → new_node_id map for batch files only - let mut batch_name_to_ids: HashMap<&str, Vec> = HashMap::new(); - for pf in &batch_parsed { - for (id, name) in pf.node_ids.iter().zip(pf.node_names.iter()) { - batch_name_to_ids.entry(name.as_str()).or_default().push(*id); - } - } - - let mut restored = 0usize; - let mut skipped_intra_batch = 0usize; - for (source_id, source_file_id, target_name, relation, metadata) in &saved_inbound_edges { - // Source file is also in this batch — source_id is stale (deleted + re-created). - // Phase 2 already resolves cross-file edges for intra-batch files. - if batch_file_ids.contains(source_file_id) { - skipped_intra_batch += 1; - continue; - } - if let Some(new_target_ids) = batch_name_to_ids.get(target_name.as_str()) { - for &new_tgt_id in new_target_ids { - if *source_id != new_tgt_id - && insert_edge_cached(db.conn(), *source_id, new_tgt_id, relation, metadata.as_deref())? { - total_edges_created += 1; - restored += 1; - } - } - } - } - if restored > 0 || skipped_intra_batch > 0 { - tracing::debug!("[index] Restored {} cross-file inbound edges, skipped {} intra-batch", restored, skipped_intra_batch); - } - } - - tx.commit()?; - - let batch_file_count = batch_parsed.len(); - - // Update global_name_map: remove old entries for batch files, add new ones - for (_, entries) in global_name_map.iter_mut() { - entries.retain(|(_id, path, _lang)| !batch_file_paths.contains(path.as_str())); - } - global_name_map.retain(|_, entries| !entries.is_empty()); - - // Convert to lightweight records — drops Tree and source string - for pf in batch_parsed { - // Add newly committed nodes to the global map - let pf_lang = Some(pf.language.clone()); - for (id, name) in pf.node_ids.iter().zip(pf.node_names.iter()) { - global_name_map.entry(name.clone()) - .or_default() - .push((*id, pf.rel_path.clone(), pf_lang.clone())); - } - all_indexed.push(FileIndexed { - rel_path: pf.rel_path, - node_ids: 
pf.node_ids, - node_names: pf.node_names, - }); - // pf.tree and pf.source are dropped here — memory freed - } - - // Report progress after each batch - if let Some(cb) = progress { - cb(all_indexed.len(), files.len()); - } - - if files.len() > BATCH_SIZE { - tracing::info!( - "[index] batch {}/{}: {} files ({} nodes, {} edges)", - all_indexed.len(), files.len(), - batch_file_count, total_nodes_created, total_edges_created - ); - } - } - - // Phase 3: Build context strings + embeddings (single transaction, lightweight) - if !all_indexed.is_empty() { - let tx = db.conn().unchecked_transaction()?; - let all_node_ids: Vec = all_indexed.iter() - .flat_map(|fi| fi.node_ids.iter().copied()).collect(); - let all_edges = get_edges_batch(db.conn(), &all_node_ids)?; - let all_node_details: HashMap)> = { - let nodes = get_nodes_with_files_by_ids(db.conn(), &all_node_ids)?; - nodes.into_iter().map(|nwf| (nwf.node.id, (nwf.node, nwf.language))).collect() - }; - - // Phase 3a: Build all context strings (CPU-bound, parallelized with rayon) - // Flatten to (node_id, node_name, file_path) tuples for parallel iteration - let node_tasks: Vec<(i64, &str, &str)> = all_indexed.iter() - .flat_map(|fi| { - fi.node_ids.iter().enumerate().map(move |(idx, &node_id)| { - (node_id, fi.node_names[idx].as_str(), fi.rel_path.as_str()) - }) - }) - .collect(); - - let context_updates: Vec<(i64, String)> = node_tasks.par_iter() - .map(|&(node_id, node_name, file_path)| { - let edges = all_edges.get(&node_id); - let cat = categorize_edges(edges, format_route_from_metadata); - let node_detail = all_node_details.get(&node_id); - - let ctx = build_context_string(&NodeContext { - node_type: node_detail.map(|(n, _)| n.node_type.clone()).unwrap_or_default(), - name: node_name.to_string(), - qualified_name: node_detail.and_then(|(n, _)| n.qualified_name.clone()), - file_path: file_path.to_string(), - language: node_detail.and_then(|(_, lang)| lang.clone()), - signature: node_detail.and_then(|(n, _)| 
n.signature.clone()), - return_type: node_detail.and_then(|(n, _)| n.return_type.clone()), - param_types: node_detail.and_then(|(n, _)| n.param_types.clone()), - code_content: node_detail.map(|(n, _)| n.code_content.clone()), - routes: cat.routes, - callees: cat.callees, - callers: cat.callers, - inherits: cat.inherits, - imports: cat.imports, - implements: cat.implements, - exports: cat.exports, - doc_comment: node_detail.and_then(|(n, _)| n.doc_comment.clone()), - }); - - (node_id, ctx) - }) - .collect(); - - // Phase 3b: Batch update context strings in DB - update_context_strings_batch(db.conn(), &context_updates)?; - tx.commit()?; - - tracing::info!( - "[index] Phase 3: context strings built for {} nodes", - all_node_ids.len() - ); - - // Phase 3c: Embed outside the committed tx — recoverable on failure via repair_null_context_strings - if let Some(m) = model { - if db.vec_enabled() { - embed_and_store_batch(db, m, &context_updates)?; - } - } - } - - // Phase 2c: sweep pending_unresolved_calls — promote any rows whose - // target_name now resolves against a same-language node. Cheap when the - // table is empty (typical after a full index of a self-contained codebase). 
- let pending_resolved = resolve_pending_calls(db)?; - total_edges_created += pending_resolved; - if pending_resolved > 0 { - tracing::info!( - "[index] Phase 2c: resolved {} pending unresolved calls", - pending_resolved - ); - } - - // Optimize query planner statistics after bulk writes - if !all_indexed.is_empty() { - let _ = db.run_optimize(); - } - - let stats = IndexStats { - files_skipped_size: skipped_size.load(AtomicOrdering::Relaxed), - files_skipped_parse: skipped_parse.load(AtomicOrdering::Relaxed), - files_skipped_read: skipped_read.load(AtomicOrdering::Relaxed), - files_skipped_hash: skipped_hash.load(AtomicOrdering::Relaxed), - files_skipped_language: skipped_language.load(AtomicOrdering::Relaxed), - }; - - Ok(IndexResult { - files_indexed: all_indexed.len(), - nodes_created: total_nodes_created, - edges_created: total_edges_created, - stats, - }) -} - -/// Sweep `pending_unresolved_calls` against the current node state. Rows whose -/// `(target_name, source_language)` now match a real node become a `calls` -/// edge and the pending row is dropped; rows that still don't resolve stay -/// buffered for the next index pass. -/// -/// Resolution priority mirrors Phase 2: same-language candidates only (no -/// cross-language promotion — memory `feedback_edge_resolution_same_language.md` -/// flags that as the canonical false-positive class), with -/// `refine_ambiguous_targets` applied when multiple candidates share the name. -/// -/// Returns the number of edges inserted by this sweep. -fn resolve_pending_calls(db: &Database) -> Result { - let pending = crate::storage::queries::list_pending_unresolved_calls(db.conn())?; - if pending.is_empty() { - return Ok(0); - } - - // Build name → [(node_id, language)] map ONCE, then iterate pending rows - // in memory. 
Narrowed by `n.name IN (SELECT DISTINCT target_name ...)` so - // even a 1-row pending table doesn't trigger a full nodes-table scan on - // every incremental pass — for a 100K-node project the unfiltered SELECT - // was 100K rows × every index call, even with no work to do. - use crate::storage::queries::{insert_edge_cached, delete_pending_unresolved_call}; - let mut name_to_lang_targets: HashMap> = HashMap::new(); - let mut node_id_to_path: HashMap = HashMap::new(); - { - let mut stmt = db.conn().prepare( - "SELECT n.id, n.name, COALESCE(f.language, ''), f.path - FROM nodes n JOIN files f ON f.id = n.file_id - WHERE f.language IS NOT NULL - AND n.name IN (SELECT DISTINCT target_name FROM pending_unresolved_calls)" - )?; - let rows = stmt.query_map([], |row| { - Ok(( - row.get::<_, i64>(0)?, - row.get::<_, String>(1)?, - row.get::<_, String>(2)?, - row.get::<_, String>(3)?, - )) - })?; - for row in rows { - let (id, name, lang, path) = row?; - if lang.is_empty() { - continue; - } - name_to_lang_targets.entry(name).or_default().push((id, lang)); - node_id_to_path.insert(id, path); - } - } - - // Map source_id → source file path so refine_ambiguous_targets gets the - // proximity hint it needs. 
- let source_ids: Vec = pending.iter().map(|p| p.source_id).collect(); - let mut source_id_to_path: HashMap = HashMap::new(); - if !source_ids.is_empty() { - let placeholders = std::iter::repeat_n("?", source_ids.len()).collect::>().join(","); - let sql = format!( - "SELECT n.id, f.path FROM nodes n JOIN files f ON f.id = n.file_id WHERE n.id IN ({})", - placeholders - ); - let mut stmt = db.conn().prepare(&sql)?; - let params: Vec<&dyn rusqlite::ToSql> = source_ids.iter() - .map(|id| id as &dyn rusqlite::ToSql) - .collect(); - let rows = stmt.query_map(params.as_slice(), |row| { - Ok((row.get::<_, i64>(0)?, row.get::<_, String>(1)?)) - })?; - for row in rows { - let (id, path) = row?; - source_id_to_path.insert(id, path); - } - } - - let mut edges_added = 0usize; - let mut to_delete: Vec = Vec::new(); - - for row in &pending { - let candidates: Vec = name_to_lang_targets.get(&row.target_name) - .map(|entries| entries.iter() - .filter(|(_, lang)| *lang == row.source_language) - .map(|(id, _)| *id) - .filter(|id| *id != row.source_id) // self-call guard - .collect()) - .unwrap_or_default(); - - if candidates.is_empty() { - continue; // still unresolvable — leave buffered - } - - let refined = if candidates.len() > 1 { - let source_path = source_id_to_path.get(&row.source_id).cloned().unwrap_or_default(); - refine_ambiguous_targets(&candidates, &source_path, &node_id_to_path) - } else { - candidates - }; - - for tgt_id in &refined { - if insert_edge_cached( - db.conn(), - row.source_id, - *tgt_id, - REL_CALLS, - row.metadata.as_deref(), - )? { - edges_added += 1; - } - } - to_delete.push(row.id); - } - - for id in to_delete { - delete_pending_unresolved_call(db.conn(), id)?; - } - - Ok(edges_added) -} - -/// Disambiguate N same-language cross-file candidates for a single call/import -/// target. Returns a subset. A single-element result is the authoritative -/// winner; ties fall back to the full input so the caller does not -/// inadvertently drop legitimate edges. 
-/// -/// Heuristic: (1) prefer non-test-file candidates when the caller is not -/// itself a test file; (2) among the preferred pool, keep only those tied -/// for the longest byte-common path prefix with the caller. Previous -/// versions dropped on ambiguity, which regressed dead-code detection for -/// bare-name Rust calls like `crate::domain::foo()` where scoped_identifier -/// extraction keeps only `foo` and two `foo` definitions under `src/` tie -/// on prefix — better to keep both edges than to report `foo` as dead. -fn refine_ambiguous_targets( - candidates: &[i64], - caller_rel_path: &str, - node_id_to_path: &HashMap, -) -> Vec { - if candidates.len() <= 1 { - return candidates.to_vec(); - } - - let is_test_path = |p: &str| { - p.contains(".test.") || p.contains("_test.") - || p.starts_with("tests/") || p.contains("/tests/") - || p.starts_with("test/") || p.contains("/test/") - || p.contains(".spec.") - }; - let caller_is_test = is_test_path(caller_rel_path); - - // Pass 1: prefer non-test candidates when the caller is non-test code. - let pool: Vec = if caller_is_test { - candidates.to_vec() - } else { - let non_test: Vec = candidates.iter().copied() - .filter(|id| { - let p = node_id_to_path.get(id).map(String::as_str).unwrap_or(""); - !is_test_path(p) - }) - .collect(); - if non_test.is_empty() { candidates.to_vec() } else { non_test } - }; - - if pool.len() == 1 { return pool; } - - // Pass 2: keep only candidates tied for the longest common path prefix - // with the caller. Byte-wise prefix is a rough proxy for module locality - // — e.g. `claude-plugin/scripts/session-init.js` shares 21 bytes with - // `claude-plugin/scripts/lifecycle.js` but 0 bytes with `scripts/*`. 
- let prefix_len = |p: &str| -> usize { - caller_rel_path.bytes().zip(p.bytes()) - .take_while(|(a, b)| a == b) - .count() - }; - let max_prefix = pool.iter() - .map(|id| prefix_len(node_id_to_path.get(id).map(String::as_str).unwrap_or(""))) - .max() - .unwrap_or(0); - let closest: Vec = pool.iter().copied() - .filter(|id| prefix_len(node_id_to_path.get(id).map(String::as_str).unwrap_or("")) == max_prefix) - .collect(); - - if closest.len() == 1 { return closest; } - - // Still ambiguous — return the remaining pool rather than dropping. This - // keeps dead-code precision high for edges we cannot confidently prune - // (most notably Rust bare-name scoped calls) at the cost of leaving a - // small amount of fan-out; the single-winner fast path above handles - // the common case (unique non-test match, or unique closest path). - if !closest.is_empty() { closest } else { pool } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::storage::queries::{get_nodes_by_name, get_edges_from, get_import_tree}; - use tempfile::TempDir; - use std::fs; - - #[test] - fn test_full_index_pipeline() { - let project_dir = TempDir::new().unwrap(); - let db_dir = TempDir::new().unwrap(); - - fs::create_dir_all(project_dir.path().join("src")).unwrap(); - fs::write(project_dir.path().join("src/auth.ts"), r#" -function validateToken(token: string): boolean { - return jwt.verify(token); -} - -function handleLogin(req: Request) { - if (validateToken(req.token)) { - return createSession(req.userId); - } -} -"#).unwrap(); - - let db = Database::open(&db_dir.path().join("index.db")).unwrap(); - let result = run_full_index(&db, project_dir.path(), None, None).unwrap(); - - assert!(result.files_indexed > 0); - assert!(result.nodes_created > 0); - assert!(result.edges_created > 0); - - // Verify nodes are in DB - let nodes = get_nodes_by_name(db.conn(), "handleLogin").unwrap(); - assert_eq!(nodes.len(), 1); - - // Verify edges: handleLogin → calls → validateToken - let edges = 
get_edges_from(db.conn(), nodes[0].id).unwrap(); - assert!(edges.iter().any(|e| e.relation == REL_CALLS), "should have call edges"); - - // Verify context string was built - assert!(nodes[0].context_string.is_some(), "context string should be set after Phase 3"); - } - - #[test] - fn test_cross_language_bare_name_call_resolution() { - // Regression: Rust method call `hasher.update(...)` was resolving to - // JS `function update()` via global bare-name lookup, producing phantom - // Rust → JS call edges in mixed projects. Fix: same-file > same-language - // tiers; drop call edges with no same-language candidate. - let project_dir = TempDir::new().unwrap(); - let db_dir = TempDir::new().unwrap(); - fs::create_dir_all(project_dir.path().join("src")).unwrap(); - fs::create_dir_all(project_dir.path().join("scripts")).unwrap(); - - fs::write(project_dir.path().join("src/hasher.rs"), r#" -pub fn caller_rs() { - let mut h = Hasher::new(); - h.update(&[1, 2, 3]); - h.finalize(); -} -"#).unwrap(); - - fs::write(project_dir.path().join("scripts/helper.js"), r#" -function update() { return 1; } -function caller_js() { update(); } -"#).unwrap(); - - let db = Database::open(&db_dir.path().join("index.db")).unwrap(); - run_full_index(&db, project_dir.path(), None, None).unwrap(); - - let rust_caller = crate::storage::queries::get_nodes_with_files_by_name( - db.conn(), "caller_rs", - ).unwrap(); - let rust_caller = rust_caller.iter() - .find(|n| n.file_path == "src/hasher.rs") - .expect("Rust caller_rs should be indexed"); - let edges = get_edges_from(db.conn(), rust_caller.node.id).unwrap(); - for e in &edges { - if e.relation != REL_CALLS { continue; } - let tgt_path: Option = db.conn().query_row( - "SELECT f.path FROM nodes n JOIN files f ON n.file_id = f.id WHERE n.id = ?1", - [e.target_id], |row| row.get(0), - ).ok(); - assert!( - !tgt_path.as_deref().unwrap_or("").ends_with(".js"), - "Rust caller must not resolve calls into JS; got edge → {:?}", tgt_path, - ); - } - - let 
js_caller = crate::storage::queries::get_nodes_with_files_by_name( - db.conn(), "caller_js", - ).unwrap(); - let js_caller = js_caller.iter() - .find(|n| n.file_path == "scripts/helper.js") - .expect("JS caller_js should be indexed"); - let js_edges = get_edges_from(db.conn(), js_caller.node.id).unwrap(); - let js_call_targets: Vec = js_edges.iter() - .filter(|e| e.relation == REL_CALLS) - .map(|e| e.target_id) - .collect(); - assert!(!js_call_targets.is_empty(), - "JS caller_js → update edge within same file should still resolve"); - } - - #[test] - fn test_js_require_creates_external_import_edges() { - let project_dir = TempDir::new().unwrap(); - let db_dir = TempDir::new().unwrap(); - fs::write(project_dir.path().join("app.js"), r#" -const fs = require('fs'); -const path = require('path'); -const lifecycle = require('./lifecycle'); - -function main() { fs.readFileSync('x'); } -"#).unwrap(); - - let db = Database::open(&db_dir.path().join("index.db")).unwrap(); - run_full_index(&db, project_dir.path(), None, None).unwrap(); - - let imports: Vec = db.conn().prepare( - "SELECT DISTINCT n2.name FROM edges e - JOIN nodes n ON n.id = e.source_id - JOIN files f ON f.id = n.file_id - JOIN nodes n2 ON n2.id = e.target_id - WHERE f.path = 'app.js' AND e.relation = 'imports'" - ).unwrap() - .query_map([], |row| row.get::<_, String>(0)).unwrap() - .filter_map(Result::ok) - .collect(); - - assert!(imports.contains(&"fs".to_string()), "imports: {:?}", imports); - assert!(imports.contains(&"path".to_string()), "imports: {:?}", imports); - assert!(imports.contains(&"lifecycle".to_string()), "imports: {:?}", imports); - } - - #[test] - fn test_js_same_name_cross_file_prefers_closest_path() { - // Regression: when JS defines the same helper name in multiple files - // (e.g., `readJson` in both `claude-plugin/scripts/lifecycle.js` and - // `scripts/install-e2e.test.js`), a caller in `claude-plugin/scripts/*` - // used to fan out an edge to every same-language match, producing - // 
false-positive callers across unrelated modules. The resolver must - // pick the candidate with the longest common path prefix to the - // caller file (and prefer non-test files) rather than all. - let project_dir = TempDir::new().unwrap(); - let db_dir = TempDir::new().unwrap(); - fs::create_dir_all(project_dir.path().join("pkg/scripts")).unwrap(); - fs::create_dir_all(project_dir.path().join("tests")).unwrap(); - - fs::write(project_dir.path().join("pkg/scripts/lifecycle.js"), r#" -function readJson(p) { return 1; } -module.exports = { readJson }; -"#).unwrap(); - - fs::write(project_dir.path().join("pkg/scripts/session-init.js"), r#" -function syncLifecycleConfig() { readJson('x'); } -"#).unwrap(); - - fs::write(project_dir.path().join("tests/helpers.test.js"), r#" -function readJson(p) { return 2; } -"#).unwrap(); - - let db = Database::open(&db_dir.path().join("index.db")).unwrap(); - run_full_index(&db, project_dir.path(), None, None).unwrap(); - - // Find the caller node - let caller = crate::storage::queries::get_nodes_with_files_by_name( - db.conn(), "syncLifecycleConfig", - ).unwrap(); - let caller = caller.iter() - .find(|n| n.file_path == "pkg/scripts/session-init.js") - .expect("syncLifecycleConfig should be indexed"); - - let edges = get_edges_from(db.conn(), caller.node.id).unwrap(); - let call_edges: Vec = edges.iter() - .filter(|e| e.relation == REL_CALLS) - .map(|e| e.target_id) - .collect(); - - // Resolve target paths - let target_paths: Vec = call_edges.iter().filter_map(|tid| { - db.conn().query_row( - "SELECT f.path FROM nodes n JOIN files f ON n.file_id = f.id WHERE n.id = ?1", - [*tid], |row| row.get(0) - ).ok() - }).collect(); - - // Must pick exactly the same-dir candidate, not fan out to the test file. 
- assert!( - target_paths.iter().any(|p| p == "pkg/scripts/lifecycle.js"), - "should resolve to same-dir readJson; got {:?}", target_paths - ); - assert!( - !target_paths.iter().any(|p| p == "tests/helpers.test.js"), - "should NOT fan out to unrelated test-file readJson; got {:?}", target_paths - ); - } - - #[test] - fn test_js_module_level_test_callback_calls_resolve() { - // Regression: helpers defined in a JS test file that are called only - // from inside `test(() => {...})` / `describe(() => {...})` callbacks - // used to be reported as orphan by dead-code, because the anonymous - // arrow callback body attributed its calls to ``, a name - // that resolves to no node. Module-level call_expressions inside JS - // test files must attribute to `` so a same-file edge lands. - let project_dir = TempDir::new().unwrap(); - let db_dir = TempDir::new().unwrap(); - - fs::write(project_dir.path().join("helpers.test.js"), r#" -function mkHome() { return '/tmp/x'; } -function writeJson(p, v) { } - -test('uses helpers', () => { - const h = mkHome(); - writeJson(h, { a: 1 }); -}); -"#).unwrap(); - - let db = Database::open(&db_dir.path().join("index.db")).unwrap(); - run_full_index(&db, project_dir.path(), None, None).unwrap(); - - // Both helper names must have at least one incoming call edge. 
- for helper in ["mkHome", "writeJson"] { - let cnt: i64 = db.conn().query_row( - "SELECT COUNT(*) FROM edges e - JOIN nodes tn ON tn.id = e.target_id - JOIN files tf ON tf.id = tn.file_id - WHERE tn.name = ?1 AND tf.path = 'helpers.test.js' AND e.relation = 'calls'", - [helper], |row| row.get(0), - ).unwrap(); - assert!(cnt >= 1, - "{} should have at least one incoming call edge from the test callback, got {}", - helper, cnt); - } - } - - #[test] - fn test_incremental_index() { - let project_dir = TempDir::new().unwrap(); - let db_dir = TempDir::new().unwrap(); - let db = Database::open(&db_dir.path().join("index.db")).unwrap(); - - // Initial index - fs::write(project_dir.path().join("a.ts"), "function foo() {}").unwrap(); - run_full_index(&db, project_dir.path(), None, None).unwrap(); - - // Modify file - fs::write(project_dir.path().join("a.ts"), "function bar() {}").unwrap(); - - // Incremental index - let result = run_incremental_index(&db, project_dir.path(), None, None).unwrap(); - assert_eq!(result.files_indexed, 1); - - let foo = get_nodes_by_name(db.conn(), "foo").unwrap(); - assert_eq!(foo.len(), 0); - let bar = get_nodes_by_name(db.conn(), "bar").unwrap(); - assert_eq!(bar.len(), 1); - } - - #[test] - fn test_incremental_propagates_dirty_context() { - let project_dir = TempDir::new().unwrap(); - let db_dir = TempDir::new().unwrap(); - let db = Database::open(&db_dir.path().join("index.db")).unwrap(); - - // Initial: B (in b.ts) calls A (in a.ts) - fs::write(project_dir.path().join("a.ts"), "function alpha() {}").unwrap(); - fs::write(project_dir.path().join("b.ts"), "function beta() { alpha(); }").unwrap(); - run_full_index(&db, project_dir.path(), None, None).unwrap(); - - let beta_nodes = get_nodes_by_name(db.conn(), "beta").unwrap(); - assert_eq!(beta_nodes.len(), 1); - let beta_ctx_before = beta_nodes[0].context_string.clone().unwrap_or_default(); - - // Change A: rename function (alpha -> alphaRenamed) - fs::write(project_dir.path().join("a.ts"), 
"function alphaRenamed() {}").unwrap(); - run_incremental_index(&db, project_dir.path(), None, None).unwrap(); - - // beta's context_string should be updated (calls list changed because - // the old alpha node is gone and edge was cascade-deleted) - let beta_nodes_after = get_nodes_by_name(db.conn(), "beta").unwrap(); - assert_eq!(beta_nodes_after.len(), 1); - let beta_ctx_after = beta_nodes_after[0].context_string.clone().unwrap_or_default(); - assert_ne!(beta_ctx_before, beta_ctx_after); - } - - #[test] - fn test_deleted_file_cleanup() { - let project_dir = TempDir::new().unwrap(); - let db_dir = TempDir::new().unwrap(); - let db = Database::open(&db_dir.path().join("index.db")).unwrap(); - - fs::write(project_dir.path().join("a.ts"), "function foo() {}").unwrap(); - run_full_index(&db, project_dir.path(), None, None).unwrap(); - - fs::remove_file(project_dir.path().join("a.ts")).unwrap(); - run_incremental_index(&db, project_dir.path(), None, None).unwrap(); - - let foo = get_nodes_by_name(db.conn(), "foo").unwrap(); - assert_eq!(foo.len(), 0); - } - - #[test] - fn test_build_python_module_map() { - let mut paths = HashSet::new(); - paths.insert("myapp/utils.py".into()); - paths.insert("myapp/__init__.py".into()); - paths.insert("src/myapp/models.py".into()); - - let map = build_python_module_map(&paths); - - // Full dotted path - assert!(map.get("myapp.utils").unwrap().contains(&"myapp/utils.py".to_string())); - // Suffix path - assert!(map.get("utils").unwrap().contains(&"myapp/utils.py".to_string())); - // __init__.py maps to package - assert!(map.get("myapp").unwrap().contains(&"myapp/__init__.py".to_string())); - // Nested with src/ prefix - assert!(map.get("myapp.models").unwrap().contains(&"src/myapp/models.py".to_string())); - } - - #[test] - fn test_python_from_import_resolution() { - // Test `from myapp.utils import helper` creates correct cross-file edge - let project_dir = TempDir::new().unwrap(); - let db_dir = TempDir::new().unwrap(); - let db = 
Database::open(&db_dir.path().join("index.db")).unwrap(); - - fs::create_dir_all(project_dir.path().join("myapp")).unwrap(); - fs::write( - project_dir.path().join("myapp/utils.py"), - "def helper():\n return 42\n", - ).unwrap(); - fs::write( - project_dir.path().join("myapp/main.py"), - "from myapp.utils import helper\n\ndef main():\n helper()\n", - ).unwrap(); - - let result = run_full_index(&db, project_dir.path(), None, None).unwrap(); - assert!(result.edges_created > 0, "should create import edges"); - - // Verify dependency: main.py -> utils.py - let deps = get_import_tree(db.conn(), "myapp/main.py", "outgoing", 1).unwrap(); - assert!( - deps.iter().any(|d| d.file_path == "myapp/utils.py"), - "main.py should depend on utils.py, got: {:?}", - deps.iter().map(|d| &d.file_path).collect::>() - ); - } - - #[test] - fn test_python_import_module_resolution() { - // Test `import myutils` creates correct cross-file edge - let project_dir = TempDir::new().unwrap(); - let db_dir = TempDir::new().unwrap(); - let db = Database::open(&db_dir.path().join("index.db")).unwrap(); - - fs::write( - project_dir.path().join("myutils.py"), - "def do_something():\n pass\n", - ).unwrap(); - fs::write( - project_dir.path().join("main.py"), - "import myutils\n\ndef main():\n myutils.do_something()\n", - ).unwrap(); - - let result = run_full_index(&db, project_dir.path(), None, None).unwrap(); - assert!(result.edges_created > 0, "should create import edges"); - - // Verify dependency: main.py -> myutils.py - let deps = get_import_tree(db.conn(), "main.py", "outgoing", 1).unwrap(); - assert!( - deps.iter().any(|d| d.file_path == "myutils.py"), - "main.py should depend on myutils.py, got: {:?}", - deps.iter().map(|d| &d.file_path).collect::>() - ); - } - - #[test] - fn test_python_external_import_creates_virtual_nodes() { - // Test that external imports create virtual nodes in file - let project_dir = TempDir::new().unwrap(); - let db_dir = TempDir::new().unwrap(); - let db = 
Database::open(&db_dir.path().join("index.db")).unwrap(); - - fs::write( - project_dir.path().join("app.py"), - "import os\nfrom collections import OrderedDict\nfrom flask import Flask\n\ndef main():\n pass\n", - ).unwrap(); - - let result = run_full_index(&db, project_dir.path(), None, None).unwrap(); - assert!(result.files_indexed > 0, "should index the file"); - - // Verify file was created with virtual nodes - let ext_nodes = get_nodes_by_file_path(db.conn(), "").unwrap(); - let ext_names: Vec<&str> = ext_nodes.iter().map(|n| n.name.as_str()).collect(); - assert!(ext_names.contains(&"os"), "should have virtual node for 'os', got: {:?}", ext_names); - assert!(ext_names.contains(&"collections"), "should have virtual node for 'collections', got: {:?}", ext_names); - assert!(ext_names.contains(&"flask"), "should have virtual node for 'flask', got: {:?}", ext_names); - - // Verify dependency_graph shows as a dependency - let deps = get_import_tree(db.conn(), "app.py", "outgoing", 1).unwrap(); - assert!( - deps.iter().any(|d| d.file_path == ""), - "app.py should show dependency, got: {:?}", - deps.iter().map(|d| &d.file_path).collect::>() - ); - } - - #[test] - fn test_python_mixed_internal_external_imports() { - // Test project with both internal and external imports - let project_dir = TempDir::new().unwrap(); - let db_dir = TempDir::new().unwrap(); - let db = Database::open(&db_dir.path().join("index.db")).unwrap(); - - fs::create_dir_all(project_dir.path().join("myapp")).unwrap(); - fs::write( - project_dir.path().join("myapp/utils.py"), - "def helper():\n return 42\n", - ).unwrap(); - fs::write( - project_dir.path().join("myapp/main.py"), - "import os\nfrom myapp.utils import helper\nfrom flask import Flask\n\ndef main():\n helper()\n", - ).unwrap(); - - let result = run_full_index(&db, project_dir.path(), None, None).unwrap(); - assert!(result.edges_created > 0); - - // Should have internal dependency - let deps = get_import_tree(db.conn(), "myapp/main.py", 
"outgoing", 1).unwrap(); - let dep_files: Vec<&str> = deps.iter().map(|d| d.file_path.as_str()).collect(); - assert!(dep_files.contains(&"myapp/utils.py"), "should depend on internal utils.py, got: {:?}", dep_files); - - // Should also have external dependency - assert!(dep_files.contains(&""), "should depend on , got: {:?}", dep_files); - } - - #[test] - fn test_index_stats_skipped_large_file() { - // Verify that IndexResult.stats tracks files skipped due to size - let project_dir = TempDir::new().unwrap(); - let db_dir = TempDir::new().unwrap(); - let db = Database::open(&db_dir.path().join("index.db")).unwrap(); - - // Create a normal file - fs::write(project_dir.path().join("small.ts"), "function ok() {}").unwrap(); - - // Create a file exceeding MAX_FILE_SIZE (10MB) - let big_content = "a".repeat(11 * 1024 * 1024); - fs::write(project_dir.path().join("huge.ts"), &big_content).unwrap(); - - let result = run_full_index(&db, project_dir.path(), None, None).unwrap(); - assert_eq!(result.files_indexed, 1, "should index the small file"); - assert_eq!(result.stats.files_skipped_size, 1, "should track the large file skip"); - } - - #[test] - fn test_index_stats_skipped_parse_error() { - // Verify that IndexResult.stats tracks files skipped due to parse errors - let project_dir = TempDir::new().unwrap(); - let db_dir = TempDir::new().unwrap(); - let db = Database::open(&db_dir.path().join("index.db")).unwrap(); - - // Create a valid file - fs::write(project_dir.path().join("good.ts"), "function ok() {}").unwrap(); - - // Create a file with an unsupported extension that detect_language returns None for - // (this is filtered by detect_language returning None, not a parse error) - // Instead, we just verify the default stats are zero for parse errors - let result = run_full_index(&db, project_dir.path(), None, None).unwrap(); - assert_eq!(result.stats.files_skipped_parse, 0); - assert_eq!(result.stats.files_skipped_read, 0); - assert_eq!(result.stats.files_skipped_hash, 
0); - } - - #[test] - fn test_index_stats_default() { - // IndexStats should implement Default - let stats = IndexStats::default(); - assert_eq!(stats.files_skipped_size, 0); - assert_eq!(stats.files_skipped_parse, 0); - assert_eq!(stats.files_skipped_read, 0); - assert_eq!(stats.files_skipped_hash, 0); - assert_eq!(stats.files_skipped_language, 0); - } - - #[test] - fn test_python_external_survives_incremental_index() { - // Test that pseudo-file persists across incremental re-indexes - let project_dir = TempDir::new().unwrap(); - let db_dir = TempDir::new().unwrap(); - let db = Database::open(&db_dir.path().join("index.db")).unwrap(); - - fs::write( - project_dir.path().join("app.py"), - "import os\n\ndef main():\n pass\n", - ).unwrap(); - - // Full index → creates with "os" node - run_full_index(&db, project_dir.path(), None, None).unwrap(); - let ext_before = get_nodes_by_file_path(db.conn(), "").unwrap(); - assert!(!ext_before.is_empty(), "should have external nodes after full index"); - - // Modify file slightly - fs::write( - project_dir.path().join("app.py"), - "import os\n\ndef main():\n return 1\n", - ).unwrap(); - - // Incremental index → should survive - run_incremental_index(&db, project_dir.path(), None, None).unwrap(); - let ext_after = get_nodes_by_file_path(db.conn(), "").unwrap(); - assert!(!ext_after.is_empty(), "external nodes should survive incremental index"); - - // Verify dependency still visible - let deps = get_import_tree(db.conn(), "app.py", "outgoing", 1).unwrap(); - assert!( - deps.iter().any(|d| d.file_path == ""), - "app.py should still show dependency after incremental index" - ); - } - - #[test] - fn test_repair_null_context_strings() { - let project_dir = TempDir::new().unwrap(); - let db_dir = TempDir::new().unwrap(); - let db = Database::open(&db_dir.path().join("index.db")).unwrap(); - - // Index a file so nodes get context strings - fs::write(project_dir.path().join("a.ts"), r#" -function alpha() { return 1; } -function beta() 
{ alpha(); } -"#).unwrap(); - run_full_index(&db, project_dir.path(), None, None).unwrap(); - - // Verify context strings exist after index - let alpha_nodes = get_nodes_by_name(db.conn(), "alpha").unwrap(); - assert_eq!(alpha_nodes.len(), 1); - assert!(alpha_nodes[0].context_string.is_some(), "alpha should have context_string after index"); - - let beta_nodes = get_nodes_by_name(db.conn(), "beta").unwrap(); - assert_eq!(beta_nodes.len(), 1); - assert!(beta_nodes[0].context_string.is_some(), "beta should have context_string after index"); - - // Simulate Phase 3 failure: NULL out context_strings - db.conn().execute("UPDATE nodes SET context_string = NULL", []).unwrap(); - - // Verify they are now NULL - let alpha_after_null = get_nodes_by_name(db.conn(), "alpha").unwrap(); - assert!(alpha_after_null[0].context_string.is_none(), "alpha context_string should be NULL after simulated failure"); - - // Run repair - let repaired = repair_null_context_strings(&db, None).unwrap(); - assert!(repaired > 0, "should repair at least 1 node"); - - // Verify context strings were restored - let alpha_repaired = get_nodes_by_name(db.conn(), "alpha").unwrap(); - assert!(alpha_repaired[0].context_string.is_some(), "alpha should have context_string after repair"); - - let beta_repaired = get_nodes_by_name(db.conn(), "beta").unwrap(); - assert!(beta_repaired[0].context_string.is_some(), "beta should have context_string after repair"); - } - - #[test] - fn test_rust_implements_creates_sentinel_for_external_trait() { - let project_dir = TempDir::new().unwrap(); - let db_dir = TempDir::new().unwrap(); - let db = Database::open(&db_dir.path().join("index.db")).unwrap(); - - fs::write(project_dir.path().join("main.rs"), r#" -use std::io::{self, Write}; -use std::fmt; - -struct MyWriter; - -impl Write for MyWriter { - fn write(&mut self, buf: &[u8]) -> io::Result { Ok(buf.len()) } - fn flush(&mut self) -> io::Result<()> { Ok(()) } -} - -impl fmt::Display for MyWriter { - fn fmt(&self, f: 
&mut fmt::Formatter) -> fmt::Result { - write!(f, "MyWriter") - } -} -"#).unwrap(); - - let result = run_full_index(&db, project_dir.path(), None, None).unwrap(); - assert!(result.files_indexed > 0); - - // Verify sentinel nodes created for external traits - let ext_nodes = get_nodes_by_file_path(db.conn(), "").unwrap(); - let ext_names: Vec<&str> = ext_nodes.iter().map(|n| n.name.as_str()).collect(); - assert!(ext_names.contains(&"Write"), "should have sentinel for Write, got: {:?}", ext_names); - // fmt::Display keeps path prefix (as parsed by tree-sitter) - assert!(ext_names.contains(&"fmt::Display"), "should have sentinel for fmt::Display, got: {:?}", ext_names); - - // Verify sentinel type is "trait" - let write_node = ext_nodes.iter().find(|n| n.name == "Write").unwrap(); - assert_eq!(write_node.node_type, "trait", "sentinel should be type 'trait'"); - - // Verify implements edges exist: MyWriter → Write, MyWriter → Display - let edges: Vec<(String, String)> = db.conn().prepare( - "SELECT ns.name, nt.name FROM edges e - JOIN nodes ns ON ns.id = e.source_id - JOIN nodes nt ON nt.id = e.target_id - WHERE e.relation = 'implements'" - ).unwrap() - .query_map([], |row| Ok((row.get(0)?, row.get(1)?))) - .unwrap() - .collect::, _>>().unwrap(); - - assert!(edges.contains(&("MyWriter".into(), "Write".into())), - "should have MyWriter→Write implements edge, got: {:?}", edges); - assert!(edges.contains(&("MyWriter".into(), "fmt::Display".into())), - "should have MyWriter→fmt::Display implements edge, got: {:?}", edges); - } - - /// ensure_file_indexed must (a) be a no-op when on-disk hash matches the - /// stored hash, and (b) actually pick up post-edit content when it doesn't. - /// This is the contract the MCP `ensure_file_fresh_opt` wrapper relies on - /// to close the post-Edit→pre-incremental-index window. 
- #[test] - fn test_ensure_file_indexed_picks_up_post_edit_changes() { - let project_dir = TempDir::new().unwrap(); - let db_dir = TempDir::new().unwrap(); - let db = Database::open(&db_dir.path().join("index.db")).unwrap(); - - // Initial state: file with `alpha` - fs::write(project_dir.path().join("a.ts"), "function alpha() {}\n").unwrap(); - run_full_index(&db, project_dir.path(), None, None).unwrap(); - let names_before: Vec = get_nodes_by_name(db.conn(), "alpha") - .unwrap().into_iter().map(|n| n.name).collect(); - assert_eq!(names_before, vec!["alpha".to_string()]); - - // No-op when hashes match - let did = ensure_file_indexed(&db, project_dir.path(), "a.ts", None).unwrap(); - assert!(!did, "matching hash must be a no-op (got reindex)"); - - // Edit on disk; old `alpha` removed, new `beta` added - fs::write(project_dir.path().join("a.ts"), "function beta() {}\n").unwrap(); - let did2 = ensure_file_indexed(&db, project_dir.path(), "a.ts", None).unwrap(); - assert!(did2, "hash mismatch must trigger a reindex"); - - // alpha gone, beta present — post-Edit query would now see fresh state - assert!(get_nodes_by_name(db.conn(), "alpha").unwrap().is_empty(), - "old alpha must be evicted by single-file reindex"); - let beta = get_nodes_by_name(db.conn(), "beta").unwrap(); - assert_eq!(beta.len(), 1, "new beta must appear after single-file reindex"); - assert_eq!(beta[0].name, "beta"); - - // Calling again with no on-disk change is a no-op - let did3 = ensure_file_indexed(&db, project_dir.path(), "a.ts", None).unwrap(); - assert!(!did3, "second call with no edit must no-op"); - - // Deleting the file from disk drops the row - fs::remove_file(project_dir.path().join("a.ts")).unwrap(); - let did4 = ensure_file_indexed(&db, project_dir.path(), "a.ts", None).unwrap(); - assert!(did4, "missing file must trigger row cleanup"); - assert!(get_nodes_by_name(db.conn(), "beta").unwrap().is_empty(), - "beta must be cascade-deleted with its file"); - } - - /// Root-cause test for 
`feedback_incremental_edge_timing.md`: file B - /// (existing, unchanged) bare-name calls `foo()`. file A is added later - /// with `function foo() {}`. Phase 2 of B's first index pass dropped the - /// edge because `foo` was unresolvable; before this fix, A's later index - /// never re-resolved B's call → permanently missing edge in incremental - /// mode (only `rebuild-index` recovered it). - /// - /// New behavior: B's drop becomes a `pending_unresolved_calls` row; A's - /// index pass sweeps pending and promotes the row into a real edge. - #[test] - fn test_pending_unresolved_call_resolves_when_callee_added_later() { - use crate::storage::queries::{count_pending_unresolved_calls, get_node_ids_by_name}; - use crate::domain::REL_CALLS; - - let project_dir = TempDir::new().unwrap(); - let db_dir = TempDir::new().unwrap(); - let db = Database::open(&db_dir.path().join("index.db")).unwrap(); - - // Step 1: B exists alone with bare-name call to foo (foo undefined). - fs::write(project_dir.path().join("b.ts"), - "function caller_b() { foo(); }\n").unwrap(); - run_full_index(&db, project_dir.path(), None, None).unwrap(); - - // Phase 2 dropped the edge (no same-file/same-language target) and - // buffered the row instead. - assert_eq!(count_pending_unresolved_calls(db.conn()).unwrap(), 1, - "B's call to undefined foo must land in pending_unresolved_calls"); - - let caller_b_id = get_node_ids_by_name(db.conn(), "caller_b").unwrap() - .into_iter().next().expect("caller_b must exist").0; - - // Verify NO edge yet (foo doesn't exist in DB). - let pre_edges = crate::storage::queries::get_edges_from(db.conn(), caller_b_id).unwrap(); - assert!(pre_edges.iter().all(|e| e.relation != REL_CALLS), - "no calls edge should exist yet — foo is undefined"); - - // Step 2: A is added with foo(). Incremental index picks it up; the - // pending sweep at end of index_files promotes B's buffered call into - // a real edge. 
- fs::write(project_dir.path().join("a.ts"), - "export function foo() {}\n").unwrap(); - run_incremental_index(&db, project_dir.path(), None, None).unwrap(); - - let foo_id = get_node_ids_by_name(db.conn(), "foo").unwrap() - .into_iter().next().expect("foo must exist after A indexed").0; - - let post_edges = crate::storage::queries::get_edges_from(db.conn(), caller_b_id).unwrap(); - let calls_to_foo: Vec<_> = post_edges.iter() - .filter(|e| e.relation == REL_CALLS && e.target_id == foo_id) - .collect(); - assert_eq!(calls_to_foo.len(), 1, - "incremental index must promote pending call → calls edge caller_b → foo; \ - got edges: {:?}", post_edges.iter().map(|e| (&e.relation, e.target_id)).collect::>()); - - // Pending row must be drained after successful resolution. - assert_eq!(count_pending_unresolved_calls(db.conn()).unwrap(), 0, - "resolved pending row must be deleted after edge insertion"); - } - - /// Cross-language pending must NOT resolve cross-language. If B (TS) - /// calls `update()` and a later-indexed Rust file defines `fn update()`, - /// the pending row must stay buffered, not silently bind cross-language - /// (memory `feedback_edge_resolution_same_language.md`'s canonical - /// false-positive class). - #[test] - fn test_pending_unresolved_call_does_not_cross_language() { - use crate::storage::queries::count_pending_unresolved_calls; - - let project_dir = TempDir::new().unwrap(); - let db_dir = TempDir::new().unwrap(); - let db = Database::open(&db_dir.path().join("index.db")).unwrap(); - - // TS file with bare-name call to `update` - fs::write(project_dir.path().join("client.ts"), - "function caller_ts() { update(); }\n").unwrap(); - run_full_index(&db, project_dir.path(), None, None).unwrap(); - assert_eq!(count_pending_unresolved_calls(db.conn()).unwrap(), 1); - - // Rust file with `update` — different language, must NOT match. 
- fs::write(project_dir.path().join("hasher.rs"), - "fn update() {}\n").unwrap(); - run_incremental_index(&db, project_dir.path(), None, None).unwrap(); - - // Pending row stays — sweep refused cross-language resolution. - assert_eq!(count_pending_unresolved_calls(db.conn()).unwrap(), 1, - "cross-language target must NOT resolve a TS pending call to a Rust fn"); - } - - /// One caller with N undefined references must produce N pending rows; - /// when a single later-added file defines all N, all rows must resolve in - /// a single sweep. Real codebases hit this whenever a "barrel" or shared - /// utility module gets added after its consumers. - #[test] - fn test_pending_resolves_multiple_calls_in_same_caller() { - use crate::storage::queries::{count_pending_unresolved_calls, get_node_ids_by_name}; - - let project_dir = TempDir::new().unwrap(); - let db_dir = TempDir::new().unwrap(); - let db = Database::open(&db_dir.path().join("index.db")).unwrap(); - - // B has three undefined call targets — foo, bar, baz. - fs::write(project_dir.path().join("b.ts"), - "function caller_b() { foo(); bar(); baz(); }\n").unwrap(); - run_full_index(&db, project_dir.path(), None, None).unwrap(); - - assert_eq!(count_pending_unresolved_calls(db.conn()).unwrap(), 3, - "three bare-name calls must produce three pending rows"); - - // A defines all three. - fs::write(project_dir.path().join("a.ts"), - "export function foo() {}\nexport function bar() {}\nexport function baz() {}\n").unwrap(); - run_incremental_index(&db, project_dir.path(), None, None).unwrap(); - - assert_eq!(count_pending_unresolved_calls(db.conn()).unwrap(), 0, - "all three pending rows must drain once their targets exist"); - - // All three resolved into real edges. 
- let caller_b_id = get_node_ids_by_name(db.conn(), "caller_b").unwrap() - .into_iter().next().unwrap().0; - let edges = crate::storage::queries::get_edges_from(db.conn(), caller_b_id).unwrap(); - let calls_count = edges.iter().filter(|e| e.relation == REL_CALLS).count(); - assert_eq!(calls_count, 3, - "caller_b must have exactly three calls edges (foo, bar, baz); got {} edges total: {:?}", - calls_count, edges.iter().map(|e| (&e.relation, e.target_id)).collect::>()); - } - - /// When the caller's source file is reindexed (e.g. user edits B), the - /// cascade FK on pending_unresolved_calls(source_id) must drop B's pending - /// rows so a fresh Phase 2 can re-buffer them with the current source IDs. - /// This is the schema's load-bearing self-cleaning property — we test it - /// explicitly so a future migration that drops or weakens the FK fails - /// loudly here rather than leaking pending rows for ever-removed callers. - #[test] - fn test_pending_cascade_deletes_when_caller_file_reindexed() { - use crate::storage::queries::count_pending_unresolved_calls; - - let project_dir = TempDir::new().unwrap(); - let db_dir = TempDir::new().unwrap(); - let db = Database::open(&db_dir.path().join("index.db")).unwrap(); - - // B with undefined target → pending row created. - fs::write(project_dir.path().join("b.ts"), - "function caller_b() { undefined_target(); }\n").unwrap(); - run_full_index(&db, project_dir.path(), None, None).unwrap(); - assert_eq!(count_pending_unresolved_calls(db.conn()).unwrap(), 1); - - // Edit B to remove the call entirely. caller_b's old node gets - // cascade-deleted on reindex (Phase 1 deletes prior rows), and its - // pending row must follow it via ON DELETE CASCADE on source_id. 
- fs::write(project_dir.path().join("b.ts"), - "function caller_b() { /* call removed */ }\n").unwrap(); - run_incremental_index(&db, project_dir.path(), None, None).unwrap(); - - assert_eq!(count_pending_unresolved_calls(db.conn()).unwrap(), 0, - "pending row must be cascade-deleted when its source caller is removed/reindexed"); - } - - /// Inverse-direction symmetry test for `feedback_incremental_edge_timing.md`: - /// existing edge B → A.foo gets cascade-deleted when A is removed, and B - /// is NOT in changed_paths (deletion doesn't re-extract B). Without Phase 0 - /// pre-cascade buffering, B has neither edge nor pending row — a permanent - /// silent edge loss until full rebuild. The Phase 0 buffer (added by this - /// fix) must capture B's call as a pending row before cascade fires. - #[test] - fn test_pending_buffers_on_callee_file_deletion() { - use crate::storage::queries::{count_pending_unresolved_calls, get_node_ids_by_name}; - - let project_dir = TempDir::new().unwrap(); - let db_dir = TempDir::new().unwrap(); - let db = Database::open(&db_dir.path().join("index.db")).unwrap(); - - // Initial: A defines foo, B calls foo — edge B.caller_b → A.foo exists. - fs::write(project_dir.path().join("a.ts"), - "export function foo() {}\n").unwrap(); - fs::write(project_dir.path().join("b.ts"), - "function caller_b() { foo(); }\n").unwrap(); - run_full_index(&db, project_dir.path(), None, None).unwrap(); - - // No pending rows yet — call resolved at index time. 
- assert_eq!(count_pending_unresolved_calls(db.conn()).unwrap(), 0, - "fully-resolvable call must not produce a pending row"); - - let caller_b_id = get_node_ids_by_name(db.conn(), "caller_b").unwrap() - .into_iter().next().unwrap().0; - let foo_id_pre = get_node_ids_by_name(db.conn(), "foo").unwrap() - .into_iter().next().unwrap().0; - let edges_pre = crate::storage::queries::get_edges_from(db.conn(), caller_b_id).unwrap(); - assert!(edges_pre.iter().any(|e| e.relation == REL_CALLS && e.target_id == foo_id_pre), - "edge caller_b → foo must exist pre-deletion"); - - // Delete A. Phase 0 must buffer B's now-orphaned call into pending - // BEFORE cascade strips the edge. - fs::remove_file(project_dir.path().join("a.ts")).unwrap(); - run_incremental_index(&db, project_dir.path(), None, None).unwrap(); - - // foo is gone. - assert!(get_node_ids_by_name(db.conn(), "foo").unwrap().is_empty(), - "foo must be cascade-deleted with file a.ts"); - - // B's edge to old foo is gone, but pending row holds the call. - assert_eq!(count_pending_unresolved_calls(db.conn()).unwrap(), 1, - "Phase 0 must buffer the orphaned inbound call into pending"); - - // Re-add A — pending sweep promotes the buffered call to a fresh edge. 
- fs::write(project_dir.path().join("a.ts"), - "export function foo() {}\n").unwrap(); - run_incremental_index(&db, project_dir.path(), None, None).unwrap(); - - assert_eq!(count_pending_unresolved_calls(db.conn()).unwrap(), 0, - "pending must drain once foo reappears"); - - let foo_id_post = get_node_ids_by_name(db.conn(), "foo").unwrap() - .into_iter().next().unwrap().0; - let edges_post = crate::storage::queries::get_edges_from(db.conn(), caller_b_id).unwrap(); - assert!(edges_post.iter().any(|e| e.relation == REL_CALLS && e.target_id == foo_id_post), - "edge caller_b → foo must reappear post re-add via pending sweep"); - } -} diff --git a/src/indexer/pipeline/context.rs b/src/indexer/pipeline/context.rs new file mode 100644 index 0000000..485f4c4 --- /dev/null +++ b/src/indexer/pipeline/context.rs @@ -0,0 +1,197 @@ +//! Context-string assembly for a node + edge bundle, plus the two recovery +//! paths that re-run that assembly outside the main indexer: +//! - `regenerate_context_strings`: incremental dirty propagation (rebuilds +//! ctx for nodes whose cross-file edges flipped during a re-index). +//! - `repair_null_context_strings`: startup repair when a prior Phase 3 +//! transaction failed and left rows with NULL context_string. +//! +//! `categorize_edges` and `format_route_from_metadata` are also used by the +//! main `index_files` Phase 3 builder, so they live here as `pub(super)`. 
+ +use anyhow::Result; +use std::collections::{HashMap, HashSet}; + +use crate::embedding::context::{build_context_string, NodeContext}; +use crate::embedding::model::EmbeddingModel; +use crate::storage::db::Database; +use crate::storage::queries::{ + get_edges_batch, get_nodes_missing_context, get_nodes_with_files_by_ids, + update_context_strings_batch, EdgeInfo, NodeResult, +}; +use crate::domain::{REL_CALLS, REL_IMPORTS, REL_INHERITS, REL_ROUTES_TO, REL_IMPLEMENTS, REL_EXPORTS}; + +use super::embed::embed_and_store_batch; + +/// Extract "METHOD path" from route edge metadata JSON, falling back to the edge name. +pub(super) fn format_route_from_metadata(metadata: Option<&str>, name: &str) -> String { + if let Some(meta) = metadata { + if let Ok(v) = serde_json::from_str::(meta) { + let method = v["method"].as_str().unwrap_or("ALL"); + if let Some(path) = v["path"].as_str() { + return format!("{} {}", method, path); + } + } + } + name.to_string() +} + +pub(super) struct CategorizedEdges { + pub callees: Vec, + pub callers: Vec, + pub inherits: Vec, + pub routes: Vec, + pub imports: Vec, + pub implements: Vec, + pub exports: Vec, +} + +pub(super) fn categorize_edges(edges: Option<&Vec>, format_route: impl Fn(Option<&str>, &str) -> String) -> CategorizedEdges { + let mut result = CategorizedEdges { + callees: Vec::new(), + callers: Vec::new(), + inherits: Vec::new(), + routes: Vec::new(), + imports: Vec::new(), + implements: Vec::new(), + exports: Vec::new(), + }; + if let Some(edge_list) = edges { + for (relation, direction, name, metadata) in edge_list { + match (relation.as_str(), direction.as_str()) { + (rel, "out") if rel == REL_CALLS => result.callees.push(name.clone()), + (rel, "in") if rel == REL_CALLS => result.callers.push(name.clone()), + (rel, "out") if rel == REL_INHERITS => result.inherits.push(name.clone()), + (rel, "out") if rel == REL_ROUTES_TO => { + result.routes.push(format_route(metadata.as_deref(), name)); + } + (rel, "out") if rel == 
REL_IMPORTS => result.imports.push(name.clone()), + (rel, "out") if rel == REL_IMPLEMENTS => result.implements.push(name.clone()), + (rel, "out") if rel == REL_EXPORTS => result.exports.push(name.clone()), + _ => {} + } + } + } + result +} + +/// Regenerate context strings (and embeddings) for the given set of dirty nodes. +pub(super) fn regenerate_context_strings(db: &Database, dirty_ids: &HashSet, model: Option<&EmbeddingModel>) -> Result<()> { + let tx = db.conn().unchecked_transaction()?; + let id_vec: Vec = dirty_ids.iter().copied().collect(); + let all_edges = get_edges_batch(db.conn(), &id_vec)?; + let all_nodes: HashMap)> = { + let nwfs = get_nodes_with_files_by_ids(db.conn(), &id_vec)?; + nwfs.into_iter().map(|nwf| (nwf.node.id, (nwf.node, nwf.file_path, nwf.language))).collect() + }; + + // Build all context strings first + let mut context_updates: Vec<(i64, String)> = Vec::with_capacity(dirty_ids.len()); + for &node_id in dirty_ids { + if let Some((node, file_path, language)) = all_nodes.get(&node_id) { + let edges = all_edges.get(&node_id); + let cat = categorize_edges(edges, format_route_from_metadata); + + let ctx = build_context_string(&NodeContext { + node_type: node.node_type.clone(), + name: node.name.clone(), + qualified_name: node.qualified_name.clone(), + file_path: file_path.clone(), + language: language.clone(), + signature: node.signature.clone(), + return_type: node.return_type.clone(), + param_types: node.param_types.clone(), + code_content: Some(node.code_content.clone()), + routes: cat.routes, + callees: cat.callees, + callers: cat.callers, + inherits: cat.inherits, + imports: cat.imports, + implements: cat.implements, + exports: cat.exports, + doc_comment: node.doc_comment.clone(), + }); + + context_updates.push((node_id, ctx)); + } + } + + // Batch update context strings + update_context_strings_batch(db.conn(), &context_updates)?; + tx.commit()?; + + // Embed outside the committed tx — recoverable on failure + if let Some(m) = model { 
+ if db.vec_enabled() { + embed_and_store_batch(db, m, &context_updates)?; + } + } + Ok(()) +} + +/// Repair nodes that have NULL context_string (likely from a failed Phase 3). +/// This is called at startup after index verification. +pub fn repair_null_context_strings( + db: &Database, + model: Option<&EmbeddingModel>, +) -> Result { + let missing_ids = get_nodes_missing_context(db.conn())?; + if missing_ids.is_empty() { + return Ok(0); + } + + tracing::info!("[repair] Found {} nodes with NULL context_string, rebuilding...", missing_ids.len()); + + // Load node details with file paths + let nodes_with_files = get_nodes_with_files_by_ids(db.conn(), &missing_ids)?; + + // Load edges for all affected nodes in one batch + let all_edges = get_edges_batch(db.conn(), &missing_ids)?; + + // Build context strings + let mut context_updates: Vec<(i64, String)> = Vec::new(); + for nwf in &nodes_with_files { + let node = &nwf.node; + let edges = all_edges.get(&node.id); + let cat = categorize_edges(edges, format_route_from_metadata); + + let ctx = build_context_string(&NodeContext { + node_type: node.node_type.clone(), + name: node.name.clone(), + qualified_name: node.qualified_name.clone(), + file_path: nwf.file_path.clone(), + language: nwf.language.clone(), + signature: node.signature.clone(), + return_type: node.return_type.clone(), + param_types: node.param_types.clone(), + code_content: Some(node.code_content.clone()), + routes: cat.routes, + callees: cat.callees, + callers: cat.callers, + inherits: cat.inherits, + imports: cat.imports, + implements: cat.implements, + exports: cat.exports, + doc_comment: node.doc_comment.clone(), + }); + + context_updates.push((node.id, ctx)); + } + + // Update in DB within a transaction (avoids per-row fsync under autocommit) + if !context_updates.is_empty() { + let tx = db.conn().unchecked_transaction()?; + update_context_strings_batch(db.conn(), &context_updates)?; + tx.commit()?; + + // Re-embed if model available + if let Some(m) = 
model { + if db.vec_enabled() { + embed_and_store_batch(db, m, &context_updates)?; + } + } + } + + let count = context_updates.len(); + tracing::info!("[repair] Repaired context strings for {} nodes", count); + Ok(count) +} diff --git a/src/indexer/pipeline/embed.rs b/src/indexer/pipeline/embed.rs new file mode 100644 index 0000000..4dbfce0 --- /dev/null +++ b/src/indexer/pipeline/embed.rs @@ -0,0 +1,71 @@ +//! Batch embedding + vector store. Wraps `EmbeddingModel::embed_batch` with +//! a per-batch DB transaction; on batch failure falls back to per-row embed +//! so a single malformed input doesn't tank the whole sweep. +//! +//! Public so `mcp::server` can call it from the background embedding thread +//! (separate from the indexer's foreground Phase 3 path). + +use anyhow::Result; + +use crate::embedding::model::EmbeddingModel; +use crate::storage::db::Database; +use crate::storage::queries::insert_node_vectors_batch; + +/// Embed context strings using batched inference and batch-insert vectors. +/// Public so the background embedding thread in server.rs can call it. +/// Wraps vector inserts in a transaction for atomicity and performance. 
+pub fn embed_and_store_batch(db: &Database, model: &EmbeddingModel, context_updates: &[(i64, String)]) -> Result<()> { + if context_updates.is_empty() { + return Ok(()); + } + + let t0 = std::time::Instant::now(); + let texts: Vec<&str> = context_updates.iter().map(|(_, ctx)| ctx.as_str()).collect(); + let ids: Vec = context_updates.iter().map(|(id, _)| *id).collect(); + + let embeddings = match model.embed_batch(&texts) { + Ok(embs) => embs, + Err(e) => { + tracing::warn!("Batch embed failed, falling back to sequential: {}", e); + // Fallback: sequential embed + let mut embs = Vec::new(); + for (i, text) in texts.iter().enumerate() { + match model.embed(text) { + Ok(emb) => embs.push(Some(emb)), + Err(e2) => { + tracing::warn!("Failed to embed node {}: {}", ids[i], e2); + embs.push(None); + } + } + } + let vectors: Vec<(i64, Vec)> = ids.iter().zip(embs) + .filter_map(|(&id, emb)| emb.map(|e| (id, e))) + .collect(); + if !vectors.is_empty() { + let tx = db.conn().unchecked_transaction()?; + insert_node_vectors_batch(db.conn(), &vectors)?; + tx.commit()?; + } + tracing::info!("[embed] {} nodes (sequential fallback) in {:.1}s", + context_updates.len(), t0.elapsed().as_secs_f64()); + return Ok(()); + } + }; + + let vectors: Vec<(i64, Vec)> = ids.into_iter().zip(embeddings).collect(); + let t_embed = t0.elapsed(); + + if !vectors.is_empty() { + let tx = db.conn().unchecked_transaction()?; + insert_node_vectors_batch(db.conn(), &vectors)?; + tx.commit()?; + } + + tracing::info!("[embed] {} nodes in {:.1}s (embed {:.1}s, store {:.1}s)", + context_updates.len(), + t0.elapsed().as_secs_f64(), + t_embed.as_secs_f64(), + (t0.elapsed() - t_embed).as_secs_f64(), + ); + Ok(()) +} diff --git a/src/indexer/pipeline/index_files.rs b/src/indexer/pipeline/index_files.rs new file mode 100644 index 0000000..e4ea8db --- /dev/null +++ b/src/indexer/pipeline/index_files.rs @@ -0,0 +1,827 @@ +//! Single-pass batched indexer. Phases share local state (transaction, +//! 
atomics, batch_parsed, name_to_ids, global_name_map) so the function +//! itself stays whole — the *helpers* that feed it (context, embedding, +//! Python module map, ambiguity refinement, pending-call sweep) live in +//! sibling modules. +//! +//! Phase outline: +//! - 0: delete files; pre-cascade-buffer inbound calls into pending so +//! B → A.foo doesn't silently vanish when only A is in `delete_paths`. +//! - 1a: parallel CPU work (read + parse + extract nodes) via rayon. +//! - 1b: sequential DB inserts (file row, node rows; cascades old nodes). +//! - 2: extract relations, resolve to edges with same-file → same-language +//! → drop/global tier order; buffer unresolved bare-name same-language +//! calls into pending instead of dropping; track external imports/symbols. +//! - 2b / 2b-ext: virtual external-module and sentinel nodes for unresolved imports/traits. +//! - 2c: restore cross-file inbound edges that cascade-delete just stripped. +//! - 3: build context strings (parallel), batch-update, then embed outside tx. +//! - final sweep (logged as "Phase 2c"): drain `pending_unresolved_calls` against the new node state.
+ +use anyhow::Result; +use std::collections::{HashMap, HashSet}; +use std::path::Path; + +use rayon::prelude::*; + +use crate::embedding::context::{build_context_string, NodeContext}; +use crate::embedding::model::EmbeddingModel; +use crate::indexer::merkle::hash_file; +use crate::parser::relations::extract_relations_from_tree; +use crate::parser::treesitter::{parse_tree, extract_nodes_from_tree}; +use crate::search::tokenizer::split_identifier; +use crate::storage::db::Database; +use crate::storage::queries::{ + delete_files_by_paths, delete_nodes_by_file, + get_all_node_names_with_ids, get_edges_batch, + get_inbound_cross_file_edges, + get_nodes_by_file_path, + get_nodes_with_files_by_ids, + insert_edge_cached, insert_node_cached, + update_context_strings_batch, upsert_file, + FileRecord, NodeRecord, NodeResult, +}; +use crate::domain::{REL_CALLS, REL_IMPORTS, REL_ROUTES_TO, REL_IMPLEMENTS, max_file_size, CROSS_FILE_CALL_NOISE}; +use crate::utils::config::detect_language; + +use super::{IndexResult, IndexStats, ProgressFn}; +use super::context::{categorize_edges, format_route_from_metadata}; +use super::embed::embed_and_store_batch; +use super::python_modules::{build_python_module_map, resolve_python_module_targets}; +use super::resolve::{refine_ambiguous_targets, resolve_pending_calls}; + +/// Batch size for streaming indexing. Each batch processes Phase 1+2 +/// then drops heavyweight data (ASTs, source strings) before the next batch. +const BATCH_SIZE: usize = 500; + +/// Lightweight post-batch record — no Tree or source string. 
+pub(super) struct FileIndexed { + pub rel_path: String, + pub node_ids: Vec, + pub node_names: Vec, +} + +pub(super) fn index_files( + db: &Database, + root: &Path, + files: &[String], + hashes: &HashMap, + model: Option<&EmbeddingModel>, + delete_paths: &[String], + progress: Option, +) -> Result { + // SAFETY: unchecked_transaction is used because rusqlite's Transaction borrows + // &mut Connection, preventing other borrows during the transaction. Here we need + // both the transaction and read access via db.conn() (which returns &Connection + // to the same underlying connection). This is safe because: + // (1) db.conn() returns the same Connection the tx was opened on, + // (2) we never open nested transactions, + // (3) concurrent access (e.g. background embedding thread) uses separate + // DB connections; safety relies on SQLite WAL mode + busy_timeout(5000), + // not single-threadedness. + + use std::sync::atomic::{AtomicUsize, Ordering as AtomicOrdering}; + let skipped_size = AtomicUsize::new(0); + let skipped_parse = AtomicUsize::new(0); + let skipped_read = AtomicUsize::new(0); + let skipped_hash = AtomicUsize::new(0); + let skipped_language = AtomicUsize::new(0); + + let mut total_nodes_created = 0usize; + let mut total_edges_created = 0usize; + let mut all_indexed: Vec = Vec::new(); + + // Phase 0: Delete removed files in own transaction. + // + // Before cascade strips inbound REL_CALLS edges, capture them as pending + // rows. Without this, deleting file A wipes B's edge to A.foo and B is + // not in `delete_paths` (so Phase 2 won't re-extract it), leaving B with + // neither an edge nor a pending row — the same staleness window the + // "callee added later" buffering closes, just from the deletion side. + // Both directions need to round-trip through pending or the v0.18.2 fix + // is only half-complete. 
+ if !delete_paths.is_empty() { + let tx = db.conn().unchecked_transaction()?; + + // Resolve file IDs once (delete_files_by_paths drops them) so we can + // query inbound calls before cascade fires. + let mut deleted_file_ids: Vec = Vec::with_capacity(delete_paths.len()); + for path in delete_paths { + if let Ok(Some(fid)) = db.conn().query_row( + "SELECT id FROM files WHERE path = ?1", + [path], + |row| row.get::<_, Option>(0), + ) { + deleted_file_ids.push(fid); + } + } + + let mut buffered = 0usize; + for fid in &deleted_file_ids { + let inbound = crate::storage::queries::get_inbound_calls_for_pending(db.conn(), *fid)?; + for (source_id, target_name, source_language, metadata) in inbound { + crate::storage::queries::insert_pending_unresolved_call( + db.conn(), + source_id, + &target_name, + &source_language, + metadata.as_deref(), + )?; + buffered += 1; + } + } + if buffered > 0 { + tracing::info!( + "[index] Phase 0: buffered {} inbound calls before cascade-deleting {} file(s)", + buffered, deleted_file_ids.len() + ); + } + + delete_files_by_paths(db.conn(), delete_paths)?; + tx.commit()?; + } + + // CPU-bound parse result — produced in parallel, consumed sequentially for DB insert + struct FilePreParsed { + rel_path: String, + source: String, + language: String, + tree: tree_sitter::Tree, + hash: String, + last_modified: i64, + parsed_nodes: Vec, + } + + // Pre-build Python module map once (used in all batches for import resolution) + let mut all_python_paths: HashSet = files.iter() + .filter(|f| f.ends_with(".py")) + .cloned() + .collect(); + { + let mut stmt = db.conn().prepare("SELECT path FROM files WHERE path LIKE '%.py'")?; + let rows = stmt.query_map([], |row| row.get::<_, String>(0))?; + for row in rows { + all_python_paths.insert(row?); + } + } + let python_module_map = build_python_module_map(&all_python_paths); + + // Pre-load global name->[(id, path, language)] map once before the batch loop. 
+ // This avoids a full table scan per batch in Phase 2 relation resolution. + // The map is updated incrementally as each batch commits new nodes. + // `language` drives same-language-preferred resolution to avoid cross-language + // bare-name collisions (e.g. Rust `hasher.update()` resolving to JS `function update`). + let mut global_name_map: HashMap> = + get_all_node_names_with_ids(db.conn())?; + + // Heavyweight per-file data used during Phase 1+2, dropped after each batch + #[allow(dead_code)] + struct FileParsed { + rel_path: String, + source: String, + language: String, + tree: tree_sitter::Tree, + file_id: i64, + node_ids: Vec, + node_names: Vec, + } + + // Process files in batches — each batch does Phase 1 + Phase 2 + for batch in files.chunks(BATCH_SIZE) { + let tx = db.conn().unchecked_transaction()?; + + // --- Phase 1a: Parallel CPU-bound work (read + parse + extract nodes) --- + let pre_parsed: Vec = batch + .par_iter() + .filter_map(|rel_path| { + let language = match detect_language(rel_path) { + Some(l) => l, + None => { + skipped_language.fetch_add(1, AtomicOrdering::Relaxed); + return None; + } + }; + let abs_path = root.join(rel_path); + + let file_meta = std::fs::metadata(&abs_path).ok(); + if let Some(ref meta) = file_meta { + if meta.len() > max_file_size() { + tracing::debug!("Skipping large file ({} bytes): {}", meta.len(), rel_path); + skipped_size.fetch_add(1, AtomicOrdering::Relaxed); + return None; + } + } + + let source = match std::fs::read_to_string(&abs_path) { + Ok(s) => s, + Err(e) => { + tracing::warn!("Skipping file {}: {}", rel_path, e); + skipped_read.fetch_add(1, AtomicOrdering::Relaxed); + return None; + } + }; + + let hash = match hashes.get(rel_path.as_str()) { + Some(h) => h.clone(), + None => match hash_file(&abs_path) { + Ok(h) => h, + Err(e) => { + tracing::warn!("Skipping file (hash error): {}: {}", rel_path, e); + skipped_hash.fetch_add(1, AtomicOrdering::Relaxed); + return None; + } + }, + }; + + let tree = match 
parse_tree(&source, language) { + Ok(t) => t, + Err(e) => { + tracing::warn!("Parse failed for {}: {}", rel_path, e); + skipped_parse.fetch_add(1, AtomicOrdering::Relaxed); + return None; + } + }; + + let last_modified = file_meta + .and_then(|m| m.modified().ok()) + .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok()) + .map(|d| d.as_secs() as i64) + .unwrap_or(0); + + let parsed_nodes = extract_nodes_from_tree(&tree, &source, language); + + Some(FilePreParsed { + rel_path: rel_path.clone(), + source, + language: language.to_string(), + tree, + hash, + last_modified, + parsed_nodes, + }) + }) + .collect(); + + let mut batch_parsed: Vec = Vec::new(); + // Saved inbound edges from other files → batch files (to restore after cascade delete) + // Tuple: (source_id, source_file_id, target_name, relation, metadata) + let mut saved_inbound_edges: Vec<(i64, i64, String, String, Option)> = Vec::new(); + // Track file_ids in this batch to filter intra-batch edges in Phase 2c + let mut batch_file_ids: HashSet = HashSet::new(); + + // --- Phase 1b: Sequential DB inserts --- + for pp in pre_parsed { + let file_id = upsert_file(db.conn(), &FileRecord { + path: pp.rel_path.clone(), + blake3_hash: pp.hash, + last_modified: pp.last_modified, + language: Some(pp.language.clone()), + })?; + + // Save cross-file inbound edges before cascade delete destroys them + saved_inbound_edges.extend(get_inbound_cross_file_edges(db.conn(), file_id)?); + batch_file_ids.insert(file_id); + + delete_nodes_by_file(db.conn(), file_id)?; + + let mut node_ids = Vec::new(); + let mut node_names = Vec::new(); + + let module_node_id = insert_node_cached(db.conn(), &NodeRecord { + file_id, + node_type: "module".into(), + name: "".into(), + qualified_name: Some(pp.rel_path.clone()), + start_line: 1, + end_line: pp.source.lines().count() as i64, + code_content: String::new(), + signature: None, + doc_comment: None, + context_string: None, + name_tokens: None, + return_type: None, + param_types: None, 
+ is_test: false, + })?; + node_ids.push(module_node_id); + node_names.push("".into()); + total_nodes_created += 1; + + for pn in &pp.parsed_nodes { + let name_tokens = split_identifier(&pn.name); + let node_id = insert_node_cached(db.conn(), &NodeRecord { + file_id, + node_type: pn.node_type.clone(), + name: pn.name.clone(), + qualified_name: pn.qualified_name.clone(), + start_line: pn.start_line as i64, + end_line: pn.end_line as i64, + code_content: pn.code_content.clone(), + signature: pn.signature.clone(), + doc_comment: pn.doc_comment.clone(), + context_string: None, + name_tokens: Some(name_tokens), + return_type: pn.return_type.clone(), + param_types: pn.param_types.clone(), + is_test: pn.is_test, + })?; + node_ids.push(node_id); + node_names.push(pn.name.clone()); + total_nodes_created += 1; + } + + batch_parsed.push(FileParsed { + rel_path: pp.rel_path, + source: pp.source, + language: pp.language, + tree: pp.tree, + file_id, + node_ids, + node_names, + }); + } + + // --- Phase 2: Extract relations + insert edges --- + // Build per-batch name_to_ids and node_id_to_path from the pre-loaded global map, + // excluding files in the current batch (their old nodes were deleted in Phase 1b). + let batch_file_paths: HashSet<&str> = batch_parsed.iter() + .map(|pf| pf.rel_path.as_str()).collect(); + + let mut name_to_ids: HashMap> = HashMap::new(); + let mut node_id_to_path: HashMap = HashMap::new(); + // Per-node language for same-language-preferred edge resolution (§ cross-lang collision). 
+ let mut node_id_to_language: HashMap> = HashMap::new(); + + // Add current batch's newly inserted nodes + for pf in &batch_parsed { + for (id, name) in pf.node_ids.iter().zip(pf.node_names.iter()) { + name_to_ids.entry(name.clone()).or_default().push(*id); + node_id_to_path.insert(*id, pf.rel_path.clone()); + node_id_to_language.insert(*id, Some(pf.language.clone())); + } + } + + // Add nodes from the global map, excluding those in current batch's files + // (their old nodes were deleted and replaced by new ones above) + for (name, entries) in &global_name_map { + for (id, path, language) in entries { + if !batch_file_paths.contains(path.as_str()) { + name_to_ids.entry(name.clone()).or_default().push(*id); + node_id_to_path.insert(*id, path.clone()); + node_id_to_language.insert(*id, language.clone()); + } + } + } + + for ids in name_to_ids.values_mut() { + ids.sort(); + ids.dedup(); + } + + // Track unresolved external Python imports: (source_module_node_id, module_name) + let mut external_python_imports: Vec<(i64, String)> = Vec::new(); + // Track unresolved external symbols for sentinel node creation: + // (source_id, target_name, relation) — e.g., implements edges to external traits + let mut unresolved_externals: Vec<(i64, String, String)> = Vec::new(); + + for pf in &batch_parsed { + let relations = extract_relations_from_tree(&pf.tree, &pf.source, &pf.language); + let local_ids: HashSet = pf.node_ids.iter().copied().collect(); + + for rel in &relations { + // Contract: extract_relations_from_tree stamps every relation with + // source_language equal to the language argument. The + // same-language resolution at line 811+ depends on it. Hard + // error instead of debug_assert so a parser regression fails + // loudly in release builds too (one string compare per + // relation is negligible against the SQL writes below). 
+ if rel.source_language != pf.language { + anyhow::bail!( + "ParsedRelation.source_language ({}) does not match file language ({}); \ + parser regressed the source_language contract", + rel.source_language, pf.language + ); + } + + let source_ids = pf.node_names.iter() + .zip(pf.node_ids.iter()) + .filter(|(name, _)| *name == &rel.source_name) + .map(|(_, id)| *id) + .collect::>(); + + // Try Python module-constrained resolution for import edges + if rel.relation == REL_IMPORTS { + if let Some(ref meta_str) = rel.metadata { + if let Ok(meta) = serde_json::from_str::(meta_str) { + if let Some(python_module) = meta.get("python_module").and_then(|v| v.as_str()) { + let is_module_import = meta.get("is_module_import") + .and_then(|v| v.as_bool()).unwrap_or(false); + if python_module_map.contains_key(python_module) { + // Internal module — try constrained resolution + if let Some(module_targets) = resolve_python_module_targets( + python_module, is_module_import, &rel.target_name, + &python_module_map, &node_id_to_path, &name_to_ids, + ) { + for &src_id in &source_ids { + for &tgt_id in &module_targets { + if src_id != tgt_id + && insert_edge_cached(db.conn(), src_id, tgt_id, &rel.relation, rel.metadata.as_deref())? { + total_edges_created += 1; + } + } + } + continue; + } + // Module found but symbol not found — fall through to default + } else { + // External module — track for virtual node creation. + // For `from X import Y`, we track the module-level dependency (X), + // not the individual symbol (Y), since we can't index external code. + for &src_id in &source_ids { + external_python_imports.push((src_id, python_module.to_string())); + } + continue; // No point in default resolution for external imports + } + } + } + } + } + + // Default resolution: global name-based lookup with language-aware layering. + // Tier order: same-file → same-language → (calls: drop) / (other: global). 
+ // Dropping calls without a same-language match prevents Rust `hasher.update()` + // binding to an unrelated JS `function update()` via bare-name collision. + let all_target_ids = name_to_ids.get(&rel.target_name) + .cloned() + .unwrap_or_default(); + + let same_file_targets: Vec = all_target_ids.iter() + .filter(|id| local_ids.contains(id)) + .copied() + .collect(); + + let source_lang = pf.language.as_str(); + let same_language_targets: Vec = all_target_ids.iter() + .filter(|id| !local_ids.contains(id)) + .filter(|id| matches!( + node_id_to_language.get(id).and_then(|l| l.as_deref()), + Some(l) if l == source_lang + )) + .copied() + .collect(); + + let target_ids = if !same_file_targets.is_empty() { + same_file_targets + } else if rel.relation == REL_CALLS + && CROSS_FILE_CALL_NOISE.contains(&rel.target_name.as_str()) + { + // Stdlib method names (new/default/from) — drop regardless of language. + continue; + } else if !same_language_targets.is_empty() { + // Ambiguous cross-file same-language candidates (e.g. a helper + // name like `readJson` defined in multiple JS files) used to + // fan out — every same-name target got an edge, producing + // phantom callers across unrelated modules. Refine by + // non-test preference + longest common path prefix with the + // caller file. See `refine_ambiguous_targets` for fallback + // policy (keeps remaining pool on ambiguity to avoid + // regressing dead-code on bare-name Rust scoped calls). + refine_ambiguous_targets( + &same_language_targets, + &pf.rel_path, + &node_id_to_path, + ) + } else if rel.relation == REL_CALLS { + // No same-file, no same-language candidate → buffer in + // pending_unresolved_calls instead of silently dropping. + // The post-Phase-2 sweep below promotes the row to a real + // edge as soon as a same-language target appears (e.g. + // sibling file added in a later incremental pass). 
Memory + // `feedback_incremental_edge_timing.md` documented the bug + // this closes: B's bare-name call to `foo()` got dropped + // when foo didn't exist yet, and never re-resolved when A + // later added `foo`. Schema cascade on source_id self-cleans + // when callers are removed/reindexed. + for &src_id in &source_ids { + crate::storage::queries::insert_pending_unresolved_call( + db.conn(), + src_id, + &rel.target_name, + &pf.language, + rel.metadata.as_deref(), + )?; + } + continue; + } else { + all_target_ids + }; + + if target_ids.is_empty() + && (rel.relation == REL_IMPLEMENTS || rel.relation == REL_IMPORTS) + { + // Unresolved implements target (external trait like Write, Default) + // OR unresolved import target (JS `require('fs')`, unresolved JS + // ES-import binding). Phase 2b-ext creates `/` + // sentinel nodes so the dependency graph shows the link. + for &src_id in &source_ids { + unresolved_externals.push((src_id, rel.target_name.clone(), rel.relation.clone())); + } + } else { + for &src_id in &source_ids { + for &tgt_id in &target_ids { + if (src_id != tgt_id || rel.relation == REL_ROUTES_TO) + && insert_edge_cached(db.conn(), src_id, tgt_id, &rel.relation, rel.metadata.as_deref())? { + total_edges_created += 1; + } + } + } + } + } + } + + // Phase 2b: Create virtual nodes for external Python imports + if !external_python_imports.is_empty() { + let ext_file_id = upsert_file(db.conn(), &FileRecord { + path: "".into(), + blake3_hash: "external".into(), + last_modified: 0, + language: Some("external".into()), + })?; + + // Load existing external module nodes to avoid duplicates + let existing_ext_nodes: HashMap = + get_nodes_by_file_path(db.conn(), "")? 
+ .into_iter() + .map(|n| (n.name.clone(), n.id)) + .collect(); + + let unique_modules: HashSet = external_python_imports.iter() + .map(|(_, m)| m.clone()).collect(); + + let mut ext_node_ids: HashMap = existing_ext_nodes; + for module_name in &unique_modules { + if !ext_node_ids.contains_key(module_name) { + let node_id = insert_node_cached(db.conn(), &NodeRecord { + file_id: ext_file_id, + node_type: "external_module".into(), + name: module_name.clone(), + qualified_name: Some(format!("/{}", module_name)), + start_line: 0, + end_line: 0, + code_content: String::new(), + signature: None, + doc_comment: None, + context_string: None, + name_tokens: None, + return_type: None, + param_types: None, + is_test: false, + })?; + ext_node_ids.insert(module_name.clone(), node_id); + total_nodes_created += 1; + } + } + + for (source_id, module_name) in &external_python_imports { + if let Some(&ext_id) = ext_node_ids.get(module_name) { + if insert_edge_cached(db.conn(), *source_id, ext_id, REL_IMPORTS, None)? { + total_edges_created += 1; + } + } + } + } + + // Phase 2b-ext: Create sentinel nodes for unresolved external symbols + // (e.g., Rust `impl Write for SharedStdout` where Write is from std::io) + if !unresolved_externals.is_empty() { + let ext_file_id = upsert_file(db.conn(), &FileRecord { + path: "".into(), + blake3_hash: "external".into(), + last_modified: 0, + language: Some("external".into()), + })?; + + let existing_ext_nodes: HashMap = + get_nodes_by_file_path(db.conn(), "")? 
+ .into_iter() + .map(|n| (n.name.clone(), n.id)) + .collect(); + + let mut ext_node_ids: HashMap = existing_ext_nodes; + + // Collect unique targets with inferred type + let unique_targets: HashMap<&str, &str> = unresolved_externals.iter() + .map(|(_, name, rel)| { + let node_type = if rel == REL_IMPLEMENTS { "trait" } else { "module" }; + (name.as_str(), node_type) + }) + .collect(); + + for (&name, &node_type) in &unique_targets { + if !ext_node_ids.contains_key(name) { + let node_id = insert_node_cached(db.conn(), &NodeRecord { + file_id: ext_file_id, + node_type: node_type.into(), + name: name.into(), + qualified_name: Some(format!("/{}", name)), + start_line: 0, + end_line: 0, + code_content: String::new(), + signature: None, + doc_comment: None, + context_string: None, + name_tokens: None, + return_type: None, + param_types: None, + is_test: false, + })?; + ext_node_ids.insert(name.into(), node_id); + total_nodes_created += 1; + } + } + + for (source_id, target_name, relation) in &unresolved_externals { + if let Some(&ext_id) = ext_node_ids.get(target_name.as_str()) { + if insert_edge_cached(db.conn(), *source_id, ext_id, relation, None)? { + total_edges_created += 1; + } + } + } + } + + // Phase 2c: Restore cross-file inbound edges lost to cascade delete. + // When a file is re-indexed, its old nodes are deleted (cascade-deleting edges). + // Edges from OTHER files into the re-indexed file must be rebuilt using new node IDs. 
+ if !saved_inbound_edges.is_empty() { + // Build name → new_node_id map for batch files only + let mut batch_name_to_ids: HashMap<&str, Vec> = HashMap::new(); + for pf in &batch_parsed { + for (id, name) in pf.node_ids.iter().zip(pf.node_names.iter()) { + batch_name_to_ids.entry(name.as_str()).or_default().push(*id); + } + } + + let mut restored = 0usize; + let mut skipped_intra_batch = 0usize; + for (source_id, source_file_id, target_name, relation, metadata) in &saved_inbound_edges { + // Source file is also in this batch — source_id is stale (deleted + re-created). + // Phase 2 already resolves cross-file edges for intra-batch files. + if batch_file_ids.contains(source_file_id) { + skipped_intra_batch += 1; + continue; + } + if let Some(new_target_ids) = batch_name_to_ids.get(target_name.as_str()) { + for &new_tgt_id in new_target_ids { + if *source_id != new_tgt_id + && insert_edge_cached(db.conn(), *source_id, new_tgt_id, relation, metadata.as_deref())? { + total_edges_created += 1; + restored += 1; + } + } + } + } + if restored > 0 || skipped_intra_batch > 0 { + tracing::debug!("[index] Restored {} cross-file inbound edges, skipped {} intra-batch", restored, skipped_intra_batch); + } + } + + tx.commit()?; + + let batch_file_count = batch_parsed.len(); + + // Update global_name_map: remove old entries for batch files, add new ones + for (_, entries) in global_name_map.iter_mut() { + entries.retain(|(_id, path, _lang)| !batch_file_paths.contains(path.as_str())); + } + global_name_map.retain(|_, entries| !entries.is_empty()); + + // Convert to lightweight records — drops Tree and source string + for pf in batch_parsed { + // Add newly committed nodes to the global map + let pf_lang = Some(pf.language.clone()); + for (id, name) in pf.node_ids.iter().zip(pf.node_names.iter()) { + global_name_map.entry(name.clone()) + .or_default() + .push((*id, pf.rel_path.clone(), pf_lang.clone())); + } + all_indexed.push(FileIndexed { + rel_path: pf.rel_path, + node_ids: 
pf.node_ids, + node_names: pf.node_names, + }); + // pf.tree and pf.source are dropped here — memory freed + } + + // Report progress after each batch + if let Some(cb) = progress { + cb(all_indexed.len(), files.len()); + } + + if files.len() > BATCH_SIZE { + tracing::info!( + "[index] batch {}/{}: {} files ({} nodes, {} edges)", + all_indexed.len(), files.len(), + batch_file_count, total_nodes_created, total_edges_created + ); + } + } + + // Phase 3: Build context strings + embeddings (single transaction, lightweight) + if !all_indexed.is_empty() { + let tx = db.conn().unchecked_transaction()?; + let all_node_ids: Vec = all_indexed.iter() + .flat_map(|fi| fi.node_ids.iter().copied()).collect(); + let all_edges = get_edges_batch(db.conn(), &all_node_ids)?; + let all_node_details: HashMap)> = { + let nodes = get_nodes_with_files_by_ids(db.conn(), &all_node_ids)?; + nodes.into_iter().map(|nwf| (nwf.node.id, (nwf.node, nwf.language))).collect() + }; + + // Phase 3a: Build all context strings (CPU-bound, parallelized with rayon) + // Flatten to (node_id, node_name, file_path) tuples for parallel iteration + let node_tasks: Vec<(i64, &str, &str)> = all_indexed.iter() + .flat_map(|fi| { + fi.node_ids.iter().enumerate().map(move |(idx, &node_id)| { + (node_id, fi.node_names[idx].as_str(), fi.rel_path.as_str()) + }) + }) + .collect(); + + let context_updates: Vec<(i64, String)> = node_tasks.par_iter() + .map(|&(node_id, node_name, file_path)| { + let edges = all_edges.get(&node_id); + let cat = categorize_edges(edges, format_route_from_metadata); + let node_detail = all_node_details.get(&node_id); + + let ctx = build_context_string(&NodeContext { + node_type: node_detail.map(|(n, _)| n.node_type.clone()).unwrap_or_default(), + name: node_name.to_string(), + qualified_name: node_detail.and_then(|(n, _)| n.qualified_name.clone()), + file_path: file_path.to_string(), + language: node_detail.and_then(|(_, lang)| lang.clone()), + signature: node_detail.and_then(|(n, _)| 
n.signature.clone()), + return_type: node_detail.and_then(|(n, _)| n.return_type.clone()), + param_types: node_detail.and_then(|(n, _)| n.param_types.clone()), + code_content: node_detail.map(|(n, _)| n.code_content.clone()), + routes: cat.routes, + callees: cat.callees, + callers: cat.callers, + inherits: cat.inherits, + imports: cat.imports, + implements: cat.implements, + exports: cat.exports, + doc_comment: node_detail.and_then(|(n, _)| n.doc_comment.clone()), + }); + + (node_id, ctx) + }) + .collect(); + + // Phase 3b: Batch update context strings in DB + update_context_strings_batch(db.conn(), &context_updates)?; + tx.commit()?; + + tracing::info!( + "[index] Phase 3: context strings built for {} nodes", + all_node_ids.len() + ); + + // Phase 3c: Embed outside the committed tx — recoverable on failure via repair_null_context_strings + if let Some(m) = model { + if db.vec_enabled() { + embed_and_store_batch(db, m, &context_updates)?; + } + } + } + + // Phase 2c: sweep pending_unresolved_calls — promote any rows whose + // target_name now resolves against a same-language node. Cheap when the + // table is empty (typical after a full index of a self-contained codebase). 
+ let pending_resolved = resolve_pending_calls(db)?; + total_edges_created += pending_resolved; + if pending_resolved > 0 { + tracing::info!( + "[index] Phase 2c: resolved {} pending unresolved calls", + pending_resolved + ); + } + + // Optimize query planner statistics after bulk writes + if !all_indexed.is_empty() { + let _ = db.run_optimize(); + } + + let stats = IndexStats { + files_skipped_size: skipped_size.load(AtomicOrdering::Relaxed), + files_skipped_parse: skipped_parse.load(AtomicOrdering::Relaxed), + files_skipped_read: skipped_read.load(AtomicOrdering::Relaxed), + files_skipped_hash: skipped_hash.load(AtomicOrdering::Relaxed), + files_skipped_language: skipped_language.load(AtomicOrdering::Relaxed), + }; + + Ok(IndexResult { + files_indexed: all_indexed.len(), + nodes_created: total_nodes_created, + edges_created: total_edges_created, + stats, + }) +} diff --git a/src/indexer/pipeline/mod.rs b/src/indexer/pipeline/mod.rs new file mode 100644 index 0000000..d2dec7d --- /dev/null +++ b/src/indexer/pipeline/mod.rs @@ -0,0 +1,237 @@ +//! Indexer pipeline. Public entry points + per-concern submodules: +//! - `embed`: batch embedding store +//! - `context`: context-string assembly + recovery paths +//! - `python_modules`: dotted-path → file-path resolution map +//! - `resolve`: ambiguous-target refinement + pending-call sweep +//! - `index_files`: the giant Phase-0..3 orchestrator (kept whole — its +//! 
phases share local transaction/atomics/batch state) + +use anyhow::Result; +use std::collections::{HashMap, HashSet}; +use std::path::Path; + +use crate::embedding::model::EmbeddingModel; +use crate::indexer::merkle::{compute_diff, scan_directory, scan_directory_cached, DirectoryCache}; +use crate::storage::db::Database; +use crate::storage::queries::{ + delete_files_by_paths, get_all_file_hashes, get_dirty_node_ids, +}; + +mod embed; +mod context; +mod python_modules; +mod resolve; +mod index_files; + +#[cfg(test)] +mod tests; + +pub use embed::embed_and_store_batch; +pub use context::repair_null_context_strings; + +use context::regenerate_context_strings; +use index_files::index_files; + +/// Counters for indexing observability — tracks skipped items. +#[derive(Debug, Clone, Default)] +pub struct IndexStats { + pub files_skipped_size: usize, + pub files_skipped_parse: usize, + pub files_skipped_read: usize, + pub files_skipped_hash: usize, + pub files_skipped_language: usize, +} + +pub struct IndexResult { + pub files_indexed: usize, + pub nodes_created: usize, + pub edges_created: usize, + pub stats: IndexStats, +} + +/// Progress callback: called with (files_done, files_total) after each batch. +pub type ProgressFn<'a> = &'a dyn Fn(usize, usize); + +pub fn run_full_index(db: &Database, project_root: &Path, model: Option<&EmbeddingModel>, progress: Option) -> Result { + let current_hashes = scan_directory(project_root)?; + let files: Vec = current_hashes.keys().cloned().collect(); + index_files(db, project_root, &files, ¤t_hashes, model, &[], progress) +} + +/// Reindex a single file when its on-disk hash differs from the stored hash. +/// No-op when the hashes match (or `rel_path` was never indexed in a way that +/// would currently reindex it). Returns true when a reindex (or stale-row +/// cleanup) actually fired. 
+/// +/// Used by query-time freshness: when an MCP tool receives an explicit +/// `file_path` argument, the agent is signaling "I just edited this; please +/// answer against the current bytes." The 30s `last_incremental_check` +/// debounce in the server is too coarse for tight Edit→search loops. +/// +/// Cross-file dirty-edge handling mirrors `run_incremental_index`: collect +/// dirty node IDs **before** re-indexing (cascade delete strips old edges), +/// then regenerate context strings + embeddings once the new nodes exist. +pub fn ensure_file_indexed( + db: &Database, + project_root: &Path, + rel_path: &str, + model: Option<&EmbeddingModel>, +) -> Result { + let abs_path = project_root.join(rel_path); + + // Missing-file path: drop stale row so future queries don't return phantom nodes. + if !abs_path.is_file() { + let exists_in_db: Option = db.conn().query_row( + "SELECT id FROM files WHERE path = ?1", + [rel_path], + |row| row.get(0), + ).ok(); + if exists_in_db.is_some() { + let tx = db.conn().unchecked_transaction()?; + delete_files_by_paths(db.conn(), &[rel_path.to_string()])?; + tx.commit()?; + return Ok(true); + } + return Ok(false); + } + + // Skip files we wouldn't index in the first place (binary / wrong language). + if crate::utils::config::detect_language(rel_path).is_none() { + return Ok(false); + } + + let on_disk_hash = crate::indexer::merkle::hash_file(&abs_path)?; + let stored_hash: Option = db.conn().query_row( + "SELECT blake3_hash FROM files WHERE path = ?1", + [rel_path], + |row| row.get(0), + ).ok(); + + if stored_hash.as_deref() == Some(&on_disk_hash) { + return Ok(false); + } + + // Cross-file edges into this file's nodes need their context strings rebuilt + // *after* the node IDs are replaced — capture the dirty set BEFORE re-indexing. 
+ let dirty_node_ids = collect_dirty_node_ids(db, std::slice::from_ref(&rel_path.to_string()))?; + + let mut hashes: HashMap = HashMap::new(); + hashes.insert(rel_path.to_string(), on_disk_hash); + let files = vec![rel_path.to_string()]; + index_files(db, project_root, &files, &hashes, model, &[], None)?; + + if !dirty_node_ids.is_empty() { + regenerate_context_strings(db, &dirty_node_ids, model)?; + } + Ok(true) +} + +pub fn run_incremental_index(db: &Database, project_root: &Path, model: Option<&EmbeddingModel>, progress: Option) -> Result { + let start = std::time::Instant::now(); + let stored_hashes = get_all_file_hashes(db.conn())?; + let current_hashes = scan_directory(project_root)?; + let diff = compute_diff(&stored_hashes, ¤t_hashes); + + // Preserve pseudo-file across incremental indexes + let deleted_files: Vec = diff.deleted_files.into_iter() + .filter(|p| p != "") + .collect(); + let to_index: Vec = [diff.new_files, diff.changed_files].concat(); + + let dirty_node_ids = if !to_index.is_empty() { + collect_dirty_node_ids(db, &to_index)? + } else { + HashSet::new() + }; + + let result = index_files(db, project_root, &to_index, ¤t_hashes, model, &deleted_files, progress)?; + + if !dirty_node_ids.is_empty() { + regenerate_context_strings(db, &dirty_node_ids, model)?; + } + + if result.files_indexed > 0 || !deleted_files.is_empty() { + tracing::info!( + "[incremental] {} files changed, {} deleted, {} nodes, {} edges, {:.1}s", + result.files_indexed, deleted_files.len(), + result.nodes_created, result.edges_created, + start.elapsed().as_secs_f64() + ); + } + + Ok(result) +} + +/// Incremental index with directory mtime cache for faster scanning. +/// Files in unchanged directories are skipped entirely. 
+pub fn run_incremental_index_cached( + db: &Database, + project_root: &Path, + model: Option<&EmbeddingModel>, + dir_cache: Option<&DirectoryCache>, + progress: Option, +) -> Result<(IndexResult, DirectoryCache)> { + let start = std::time::Instant::now(); + let stored_hashes = get_all_file_hashes(db.conn())?; + let (mut current_hashes, new_cache) = scan_directory_cached(project_root, dir_cache)?; + + // Merge stored hashes for files in unchanged directories. + // scan_directory_cached skips files in unchanged dirs, so we need to + // carry forward their stored hashes to prevent false "deleted" diffs. + // Use new_cache.file_mtimes (populated for ALL walked files) to check existence + // without per-file stat calls. + for (path, hash) in &stored_hashes { + if !current_hashes.contains_key(path) && new_cache.file_exists(path) { + current_hashes.insert(path.clone(), hash.clone()); + } + } + + let diff = compute_diff(&stored_hashes, ¤t_hashes); + + // Preserve pseudo-file across incremental indexes + let deleted_files: Vec = diff.deleted_files.into_iter() + .filter(|p| p != "") + .collect(); + let to_index: Vec = [diff.new_files, diff.changed_files].concat(); + + let dirty_node_ids = if !to_index.is_empty() { + collect_dirty_node_ids(db, &to_index)? + } else { + HashSet::new() + }; + + let result = index_files(db, project_root, &to_index, ¤t_hashes, model, &deleted_files, progress)?; + + if !dirty_node_ids.is_empty() { + regenerate_context_strings(db, &dirty_node_ids, model)?; + } + + if result.files_indexed > 0 || !deleted_files.is_empty() { + tracing::info!( + "[incremental] {} files changed, {} deleted, {} nodes, {} edges, {:.1}s", + result.files_indexed, deleted_files.len(), + result.nodes_created, result.edges_created, + start.elapsed().as_secs_f64() + ); + } + + Ok((result, new_cache)) +} + +/// Collect node IDs in OTHER files that have edges pointing to nodes in the changed files. +/// Must be called BEFORE re-indexing (cascade delete removes old edges). 
+fn collect_dirty_node_ids(db: &Database, changed_paths: &[String]) -> Result> { + let mut changed_file_ids = Vec::new(); + for path in changed_paths { + let file_id: Option = db.conn().query_row( + "SELECT id FROM files WHERE path = ?1", + [path], + |row| row.get(0), + ).ok(); + if let Some(id) = file_id { + changed_file_ids.push(id); + } + } + let ids = get_dirty_node_ids(db.conn(), &changed_file_ids)?; + Ok(ids.into_iter().collect()) +} diff --git a/src/indexer/pipeline/python_modules.rs b/src/indexer/pipeline/python_modules.rs new file mode 100644 index 0000000..720a0d4 --- /dev/null +++ b/src/indexer/pipeline/python_modules.rs @@ -0,0 +1,73 @@ +//! Python module path resolution. `import myapp.utils` and `from myapp.utils +//! import helper` carry dotted module paths that don't directly map to file +//! names, so the indexer pre-builds a `dotted_path → file_paths` map and +//! consults it during Phase 2 import-edge resolution. +//! +//! Suffix matching deliberately fans out: `utils` matches every `*/utils.py` +//! we know about. Over-connecting is the safer failure mode for dependency +//! analysis without `sys.path` context — a missed dependency is harder to +//! debug than an extra one. + +use std::collections::{HashMap, HashSet}; + +/// Build mapping from Python dotted module paths to file paths. +/// Registers both full paths and suffix paths for flexible matching. +/// e.g., "src/myapp/utils.py" matches "src.myapp.utils", "myapp.utils", and "utils". 
+pub(super) fn build_python_module_map(python_paths: &HashSet) -> HashMap> { + let mut map: HashMap> = HashMap::new(); + for path in python_paths { + let stripped = if let Some(s) = path.strip_suffix("/__init__.py") { + s + } else if let Some(s) = path.strip_suffix(".py") { + s + } else { + continue; + }; + + // Register all suffix module paths for flexible matching + // e.g., "src/myapp/utils" -> "src.myapp.utils", "myapp.utils", "utils" + let parts: Vec<&str> = stripped.split('/').collect(); + for i in 0..parts.len() { + let dotted = parts[i..].join("."); + map.entry(dotted).or_default().push(path.clone()); + } + } + // Deduplicate + for paths in map.values_mut() { + paths.sort(); + paths.dedup(); + } + map +} + +/// Resolve Python import targets using pre-parsed module metadata. +/// For `import X` (is_module_import): finds `` nodes in resolved files. +/// For `from X import Y`: finds nodes named Y only in resolved files. +/// Returns None if module can't be resolved or no matching nodes found. +pub(super) fn resolve_python_module_targets( + python_module: &str, + is_module_import: bool, + target_name: &str, + python_module_map: &HashMap>, + node_id_to_path: &HashMap, + name_to_ids: &HashMap>, +) -> Option> { + // Resolve module path to file path(s). + // Note: suffix matching in python_module_map means `import utils` may match + // multiple files (e.g., "myapp/utils.py" and "other/utils.py"). This is an + // inherent ambiguity without sys.path context; over-connecting is safer for + // dependency analysis than missing real dependencies. 
+ let module_files = python_module_map.get(python_module)?; + + let lookup_name = if is_module_import { "" } else { target_name }; + let all_ids = name_to_ids.get(lookup_name)?; + let targets: Vec = all_ids.iter() + .filter(|nid| { + node_id_to_path.get(nid) + .map(|p| module_files.contains(p)) + .unwrap_or(false) + }) + .copied() + .collect(); + if targets.is_empty() { None } else { Some(targets) } +} diff --git a/src/indexer/pipeline/resolve.rs b/src/indexer/pipeline/resolve.rs new file mode 100644 index 0000000..f09cd3d --- /dev/null +++ b/src/indexer/pipeline/resolve.rs @@ -0,0 +1,204 @@ +//! Cross-file call resolution helpers shared by the main `index_files` walk +//! and the post-index `pending_unresolved_calls` sweep. +//! +//! - `refine_ambiguous_targets`: disambiguator — when a call's target name +//! matches N same-language nodes across files, prefer non-test paths and +//! the longest common path prefix with the caller. +//! - `resolve_pending_calls`: drains buffered same-language-but-callee-not-yet- +//! indexed rows once the callee appears (post-incremental sweep). + +use anyhow::Result; +use std::collections::HashMap; + +use crate::storage::db::Database; +use crate::storage::queries::{ + delete_pending_unresolved_call, insert_edge_cached, list_pending_unresolved_calls, +}; +use crate::domain::REL_CALLS; + +/// Disambiguate N same-language cross-file candidates for a single call/import +/// target. Returns a subset. A single-element result is the authoritative +/// winner; ties fall back to the full input so the caller does not +/// inadvertently drop legitimate edges. +/// +/// Heuristic: (1) prefer non-test-file candidates when the caller is not +/// itself a test file; (2) among the preferred pool, keep only those tied +/// for the longest byte-common path prefix with the caller. 
Previous +/// versions dropped on ambiguity, which regressed dead-code detection for +/// bare-name Rust calls like `crate::domain::foo()` where scoped_identifier +/// extraction keeps only `foo` and two `foo` definitions under `src/` tie +/// on prefix — better to keep both edges than to report `foo` as dead. +pub(super) fn refine_ambiguous_targets( + candidates: &[i64], + caller_rel_path: &str, + node_id_to_path: &HashMap, +) -> Vec { + if candidates.len() <= 1 { + return candidates.to_vec(); + } + + let is_test_path = |p: &str| { + p.contains(".test.") || p.contains("_test.") + || p.starts_with("tests/") || p.contains("/tests/") + || p.starts_with("test/") || p.contains("/test/") + || p.contains(".spec.") + }; + let caller_is_test = is_test_path(caller_rel_path); + + // Pass 1: prefer non-test candidates when the caller is non-test code. + let pool: Vec = if caller_is_test { + candidates.to_vec() + } else { + let non_test: Vec = candidates.iter().copied() + .filter(|id| { + let p = node_id_to_path.get(id).map(String::as_str).unwrap_or(""); + !is_test_path(p) + }) + .collect(); + if non_test.is_empty() { candidates.to_vec() } else { non_test } + }; + + if pool.len() == 1 { return pool; } + + // Pass 2: keep only candidates tied for the longest common path prefix + // with the caller. Byte-wise prefix is a rough proxy for module locality + // — e.g. `claude-plugin/scripts/session-init.js` shares 21 bytes with + // `claude-plugin/scripts/lifecycle.js` but 0 bytes with `scripts/*`. 
+ let prefix_len = |p: &str| -> usize { + caller_rel_path.bytes().zip(p.bytes()) + .take_while(|(a, b)| a == b) + .count() + }; + let max_prefix = pool.iter() + .map(|id| prefix_len(node_id_to_path.get(id).map(String::as_str).unwrap_or(""))) + .max() + .unwrap_or(0); + let closest: Vec = pool.iter().copied() + .filter(|id| prefix_len(node_id_to_path.get(id).map(String::as_str).unwrap_or("")) == max_prefix) + .collect(); + + if closest.len() == 1 { return closest; } + + // Still ambiguous — return the remaining pool rather than dropping. This + // keeps dead-code precision high for edges we cannot confidently prune + // (most notably Rust bare-name scoped calls) at the cost of leaving a + // small amount of fan-out; the single-winner fast path above handles + // the common case (unique non-test match, or unique closest path). + if !closest.is_empty() { closest } else { pool } +} + +/// Sweep `pending_unresolved_calls` against the current node state. Rows whose +/// `(target_name, source_language)` now match a real node become a `calls` +/// edge and the pending row is dropped; rows that still don't resolve stay +/// buffered for the next index pass. +/// +/// Resolution priority mirrors Phase 2: same-language candidates only (no +/// cross-language promotion — memory `feedback_edge_resolution_same_language.md` +/// flags that as the canonical false-positive class), with +/// `refine_ambiguous_targets` applied when multiple candidates share the name. +/// +/// Returns the number of edges inserted by this sweep. +pub(super) fn resolve_pending_calls(db: &Database) -> Result { + let pending = list_pending_unresolved_calls(db.conn())?; + if pending.is_empty() { + return Ok(0); + } + + // Build name → [(node_id, language)] map ONCE, then iterate pending rows + // in memory. 
Narrowed by `n.name IN (SELECT DISTINCT target_name ...)` so + // even a 1-row pending table doesn't trigger a full nodes-table scan on + // every incremental pass — for a 100K-node project the unfiltered SELECT + // was 100K rows × every index call, even with no work to do. + let mut name_to_lang_targets: HashMap> = HashMap::new(); + let mut node_id_to_path: HashMap = HashMap::new(); + { + let mut stmt = db.conn().prepare( + "SELECT n.id, n.name, COALESCE(f.language, ''), f.path + FROM nodes n JOIN files f ON f.id = n.file_id + WHERE f.language IS NOT NULL + AND n.name IN (SELECT DISTINCT target_name FROM pending_unresolved_calls)" + )?; + let rows = stmt.query_map([], |row| { + Ok(( + row.get::<_, i64>(0)?, + row.get::<_, String>(1)?, + row.get::<_, String>(2)?, + row.get::<_, String>(3)?, + )) + })?; + for row in rows { + let (id, name, lang, path) = row?; + if lang.is_empty() { + continue; + } + name_to_lang_targets.entry(name).or_default().push((id, lang)); + node_id_to_path.insert(id, path); + } + } + + // Map source_id → source file path so refine_ambiguous_targets gets the + // proximity hint it needs. 
+ let source_ids: Vec = pending.iter().map(|p| p.source_id).collect(); + let mut source_id_to_path: HashMap = HashMap::new(); + if !source_ids.is_empty() { + let placeholders = std::iter::repeat_n("?", source_ids.len()).collect::>().join(","); + let sql = format!( + "SELECT n.id, f.path FROM nodes n JOIN files f ON f.id = n.file_id WHERE n.id IN ({})", + placeholders + ); + let mut stmt = db.conn().prepare(&sql)?; + let params: Vec<&dyn rusqlite::ToSql> = source_ids.iter() + .map(|id| id as &dyn rusqlite::ToSql) + .collect(); + let rows = stmt.query_map(params.as_slice(), |row| { + Ok((row.get::<_, i64>(0)?, row.get::<_, String>(1)?)) + })?; + for row in rows { + let (id, path) = row?; + source_id_to_path.insert(id, path); + } + } + + let mut edges_added = 0usize; + let mut to_delete: Vec = Vec::new(); + + for row in &pending { + let candidates: Vec = name_to_lang_targets.get(&row.target_name) + .map(|entries| entries.iter() + .filter(|(_, lang)| *lang == row.source_language) + .map(|(id, _)| *id) + .filter(|id| *id != row.source_id) // self-call guard + .collect()) + .unwrap_or_default(); + + if candidates.is_empty() { + continue; // still unresolvable — leave buffered + } + + let refined = if candidates.len() > 1 { + let source_path = source_id_to_path.get(&row.source_id).cloned().unwrap_or_default(); + refine_ambiguous_targets(&candidates, &source_path, &node_id_to_path) + } else { + candidates + }; + + for tgt_id in &refined { + if insert_edge_cached( + db.conn(), + row.source_id, + *tgt_id, + REL_CALLS, + row.metadata.as_deref(), + )? 
{ + edges_added += 1; + } + } + to_delete.push(row.id); + } + + for id in to_delete { + delete_pending_unresolved_call(db.conn(), id)?; + } + + Ok(edges_added) +} diff --git a/src/indexer/pipeline/tests.rs b/src/indexer/pipeline/tests.rs new file mode 100644 index 0000000..0bf3668 --- /dev/null +++ b/src/indexer/pipeline/tests.rs @@ -0,0 +1,884 @@ +use super::*; +use super::python_modules::build_python_module_map; +use crate::storage::queries::{ + get_nodes_by_file_path, get_nodes_by_name, get_edges_from, get_import_tree, +}; +use crate::domain::REL_CALLS; +use tempfile::TempDir; +use std::fs; + +#[test] +fn test_full_index_pipeline() { + let project_dir = TempDir::new().unwrap(); + let db_dir = TempDir::new().unwrap(); + + fs::create_dir_all(project_dir.path().join("src")).unwrap(); + fs::write(project_dir.path().join("src/auth.ts"), r#" +function validateToken(token: string): boolean { + return jwt.verify(token); +} + +function handleLogin(req: Request) { + if (validateToken(req.token)) { + return createSession(req.userId); + } +} +"#).unwrap(); + + let db = Database::open(&db_dir.path().join("index.db")).unwrap(); + let result = run_full_index(&db, project_dir.path(), None, None).unwrap(); + + assert!(result.files_indexed > 0); + assert!(result.nodes_created > 0); + assert!(result.edges_created > 0); + + // Verify nodes are in DB + let nodes = get_nodes_by_name(db.conn(), "handleLogin").unwrap(); + assert_eq!(nodes.len(), 1); + + // Verify edges: handleLogin → calls → validateToken + let edges = get_edges_from(db.conn(), nodes[0].id).unwrap(); + assert!(edges.iter().any(|e| e.relation == REL_CALLS), "should have call edges"); + + // Verify context string was built + assert!(nodes[0].context_string.is_some(), "context string should be set after Phase 3"); +} + +#[test] +fn test_cross_language_bare_name_call_resolution() { + // Regression: Rust method call `hasher.update(...)` was resolving to + // JS `function update()` via global bare-name lookup, producing 
phantom + // Rust → JS call edges in mixed projects. Fix: same-file > same-language + // tiers; drop call edges with no same-language candidate. + let project_dir = TempDir::new().unwrap(); + let db_dir = TempDir::new().unwrap(); + fs::create_dir_all(project_dir.path().join("src")).unwrap(); + fs::create_dir_all(project_dir.path().join("scripts")).unwrap(); + + fs::write(project_dir.path().join("src/hasher.rs"), r#" +pub fn caller_rs() { + let mut h = Hasher::new(); + h.update(&[1, 2, 3]); + h.finalize(); +} +"#).unwrap(); + + fs::write(project_dir.path().join("scripts/helper.js"), r#" +function update() { return 1; } +function caller_js() { update(); } +"#).unwrap(); + + let db = Database::open(&db_dir.path().join("index.db")).unwrap(); + run_full_index(&db, project_dir.path(), None, None).unwrap(); + + let rust_caller = crate::storage::queries::get_nodes_with_files_by_name( + db.conn(), "caller_rs", + ).unwrap(); + let rust_caller = rust_caller.iter() + .find(|n| n.file_path == "src/hasher.rs") + .expect("Rust caller_rs should be indexed"); + let edges = get_edges_from(db.conn(), rust_caller.node.id).unwrap(); + for e in &edges { + if e.relation != REL_CALLS { continue; } + let tgt_path: Option = db.conn().query_row( + "SELECT f.path FROM nodes n JOIN files f ON n.file_id = f.id WHERE n.id = ?1", + [e.target_id], |row| row.get(0), + ).ok(); + assert!( + !tgt_path.as_deref().unwrap_or("").ends_with(".js"), + "Rust caller must not resolve calls into JS; got edge → {:?}", tgt_path, + ); + } + + let js_caller = crate::storage::queries::get_nodes_with_files_by_name( + db.conn(), "caller_js", + ).unwrap(); + let js_caller = js_caller.iter() + .find(|n| n.file_path == "scripts/helper.js") + .expect("JS caller_js should be indexed"); + let js_edges = get_edges_from(db.conn(), js_caller.node.id).unwrap(); + let js_call_targets: Vec = js_edges.iter() + .filter(|e| e.relation == REL_CALLS) + .map(|e| e.target_id) + .collect(); + assert!(!js_call_targets.is_empty(), + "JS 
caller_js → update edge within same file should still resolve"); +} + +#[test] +fn test_js_require_creates_external_import_edges() { + let project_dir = TempDir::new().unwrap(); + let db_dir = TempDir::new().unwrap(); + fs::write(project_dir.path().join("app.js"), r#" +const fs = require('fs'); +const path = require('path'); +const lifecycle = require('./lifecycle'); + +function main() { fs.readFileSync('x'); } +"#).unwrap(); + + let db = Database::open(&db_dir.path().join("index.db")).unwrap(); + run_full_index(&db, project_dir.path(), None, None).unwrap(); + + let imports: Vec = db.conn().prepare( + "SELECT DISTINCT n2.name FROM edges e + JOIN nodes n ON n.id = e.source_id + JOIN files f ON f.id = n.file_id + JOIN nodes n2 ON n2.id = e.target_id + WHERE f.path = 'app.js' AND e.relation = 'imports'" + ).unwrap() + .query_map([], |row| row.get::<_, String>(0)).unwrap() + .filter_map(Result::ok) + .collect(); + + assert!(imports.contains(&"fs".to_string()), "imports: {:?}", imports); + assert!(imports.contains(&"path".to_string()), "imports: {:?}", imports); + assert!(imports.contains(&"lifecycle".to_string()), "imports: {:?}", imports); +} + +#[test] +fn test_js_same_name_cross_file_prefers_closest_path() { + // Regression: when JS defines the same helper name in multiple files + // (e.g., `readJson` in both `claude-plugin/scripts/lifecycle.js` and + // `scripts/install-e2e.test.js`), a caller in `claude-plugin/scripts/*` + // used to fan out an edge to every same-language match, producing + // false-positive callers across unrelated modules. The resolver must + // pick the candidate with the longest common path prefix to the + // caller file (and prefer non-test files) rather than all. 
+ let project_dir = TempDir::new().unwrap(); + let db_dir = TempDir::new().unwrap(); + fs::create_dir_all(project_dir.path().join("pkg/scripts")).unwrap(); + fs::create_dir_all(project_dir.path().join("tests")).unwrap(); + + fs::write(project_dir.path().join("pkg/scripts/lifecycle.js"), r#" +function readJson(p) { return 1; } +module.exports = { readJson }; +"#).unwrap(); + + fs::write(project_dir.path().join("pkg/scripts/session-init.js"), r#" +function syncLifecycleConfig() { readJson('x'); } +"#).unwrap(); + + fs::write(project_dir.path().join("tests/helpers.test.js"), r#" +function readJson(p) { return 2; } +"#).unwrap(); + + let db = Database::open(&db_dir.path().join("index.db")).unwrap(); + run_full_index(&db, project_dir.path(), None, None).unwrap(); + + // Find the caller node + let caller = crate::storage::queries::get_nodes_with_files_by_name( + db.conn(), "syncLifecycleConfig", + ).unwrap(); + let caller = caller.iter() + .find(|n| n.file_path == "pkg/scripts/session-init.js") + .expect("syncLifecycleConfig should be indexed"); + + let edges = get_edges_from(db.conn(), caller.node.id).unwrap(); + let call_edges: Vec = edges.iter() + .filter(|e| e.relation == REL_CALLS) + .map(|e| e.target_id) + .collect(); + + // Resolve target paths + let target_paths: Vec = call_edges.iter().filter_map(|tid| { + db.conn().query_row( + "SELECT f.path FROM nodes n JOIN files f ON n.file_id = f.id WHERE n.id = ?1", + [*tid], |row| row.get(0) + ).ok() + }).collect(); + + // Must pick exactly the same-dir candidate, not fan out to the test file. 
+ assert!( + target_paths.iter().any(|p| p == "pkg/scripts/lifecycle.js"), + "should resolve to same-dir readJson; got {:?}", target_paths + ); + assert!( + !target_paths.iter().any(|p| p == "tests/helpers.test.js"), + "should NOT fan out to unrelated test-file readJson; got {:?}", target_paths + ); +} + +#[test] +fn test_js_module_level_test_callback_calls_resolve() { + // Regression: helpers defined in a JS test file that are called only + // from inside `test(() => {...})` / `describe(() => {...})` callbacks + // used to be reported as orphan by dead-code, because the anonymous + // arrow callback body attributed its calls to ``, a name + // that resolves to no node. Module-level call_expressions inside JS + // test files must attribute to `` so a same-file edge lands. + let project_dir = TempDir::new().unwrap(); + let db_dir = TempDir::new().unwrap(); + + fs::write(project_dir.path().join("helpers.test.js"), r#" +function mkHome() { return '/tmp/x'; } +function writeJson(p, v) { } + +test('uses helpers', () => { + const h = mkHome(); + writeJson(h, { a: 1 }); +}); +"#).unwrap(); + + let db = Database::open(&db_dir.path().join("index.db")).unwrap(); + run_full_index(&db, project_dir.path(), None, None).unwrap(); + + // Both helper names must have at least one incoming call edge. 
+ for helper in ["mkHome", "writeJson"] { + let cnt: i64 = db.conn().query_row( + "SELECT COUNT(*) FROM edges e + JOIN nodes tn ON tn.id = e.target_id + JOIN files tf ON tf.id = tn.file_id + WHERE tn.name = ?1 AND tf.path = 'helpers.test.js' AND e.relation = 'calls'", + [helper], |row| row.get(0), + ).unwrap(); + assert!(cnt >= 1, + "{} should have at least one incoming call edge from the test callback, got {}", + helper, cnt); + } +} + +#[test] +fn test_incremental_index() { + let project_dir = TempDir::new().unwrap(); + let db_dir = TempDir::new().unwrap(); + let db = Database::open(&db_dir.path().join("index.db")).unwrap(); + + // Initial index + fs::write(project_dir.path().join("a.ts"), "function foo() {}").unwrap(); + run_full_index(&db, project_dir.path(), None, None).unwrap(); + + // Modify file + fs::write(project_dir.path().join("a.ts"), "function bar() {}").unwrap(); + + // Incremental index + let result = run_incremental_index(&db, project_dir.path(), None, None).unwrap(); + assert_eq!(result.files_indexed, 1); + + let foo = get_nodes_by_name(db.conn(), "foo").unwrap(); + assert_eq!(foo.len(), 0); + let bar = get_nodes_by_name(db.conn(), "bar").unwrap(); + assert_eq!(bar.len(), 1); +} + +#[test] +fn test_incremental_propagates_dirty_context() { + let project_dir = TempDir::new().unwrap(); + let db_dir = TempDir::new().unwrap(); + let db = Database::open(&db_dir.path().join("index.db")).unwrap(); + + // Initial: B (in b.ts) calls A (in a.ts) + fs::write(project_dir.path().join("a.ts"), "function alpha() {}").unwrap(); + fs::write(project_dir.path().join("b.ts"), "function beta() { alpha(); }").unwrap(); + run_full_index(&db, project_dir.path(), None, None).unwrap(); + + let beta_nodes = get_nodes_by_name(db.conn(), "beta").unwrap(); + assert_eq!(beta_nodes.len(), 1); + let beta_ctx_before = beta_nodes[0].context_string.clone().unwrap_or_default(); + + // Change A: rename function (alpha -> alphaRenamed) + fs::write(project_dir.path().join("a.ts"), 
"function alphaRenamed() {}").unwrap(); + run_incremental_index(&db, project_dir.path(), None, None).unwrap(); + + // beta's context_string should be updated (calls list changed because + // the old alpha node is gone and edge was cascade-deleted) + let beta_nodes_after = get_nodes_by_name(db.conn(), "beta").unwrap(); + assert_eq!(beta_nodes_after.len(), 1); + let beta_ctx_after = beta_nodes_after[0].context_string.clone().unwrap_or_default(); + assert_ne!(beta_ctx_before, beta_ctx_after); +} + +#[test] +fn test_deleted_file_cleanup() { + let project_dir = TempDir::new().unwrap(); + let db_dir = TempDir::new().unwrap(); + let db = Database::open(&db_dir.path().join("index.db")).unwrap(); + + fs::write(project_dir.path().join("a.ts"), "function foo() {}").unwrap(); + run_full_index(&db, project_dir.path(), None, None).unwrap(); + + fs::remove_file(project_dir.path().join("a.ts")).unwrap(); + run_incremental_index(&db, project_dir.path(), None, None).unwrap(); + + let foo = get_nodes_by_name(db.conn(), "foo").unwrap(); + assert_eq!(foo.len(), 0); +} + +#[test] +fn test_build_python_module_map() { + let mut paths = HashSet::new(); + paths.insert("myapp/utils.py".into()); + paths.insert("myapp/__init__.py".into()); + paths.insert("src/myapp/models.py".into()); + + let map = build_python_module_map(&paths); + + // Full dotted path + assert!(map.get("myapp.utils").unwrap().contains(&"myapp/utils.py".to_string())); + // Suffix path + assert!(map.get("utils").unwrap().contains(&"myapp/utils.py".to_string())); + // __init__.py maps to package + assert!(map.get("myapp").unwrap().contains(&"myapp/__init__.py".to_string())); + // Nested with src/ prefix + assert!(map.get("myapp.models").unwrap().contains(&"src/myapp/models.py".to_string())); +} + +#[test] +fn test_python_from_import_resolution() { + // Test `from myapp.utils import helper` creates correct cross-file edge + let project_dir = TempDir::new().unwrap(); + let db_dir = TempDir::new().unwrap(); + let db = 
Database::open(&db_dir.path().join("index.db")).unwrap(); + + fs::create_dir_all(project_dir.path().join("myapp")).unwrap(); + fs::write( + project_dir.path().join("myapp/utils.py"), + "def helper():\n return 42\n", + ).unwrap(); + fs::write( + project_dir.path().join("myapp/main.py"), + "from myapp.utils import helper\n\ndef main():\n helper()\n", + ).unwrap(); + + let result = run_full_index(&db, project_dir.path(), None, None).unwrap(); + assert!(result.edges_created > 0, "should create import edges"); + + // Verify dependency: main.py -> utils.py + let deps = get_import_tree(db.conn(), "myapp/main.py", "outgoing", 1).unwrap(); + assert!( + deps.iter().any(|d| d.file_path == "myapp/utils.py"), + "main.py should depend on utils.py, got: {:?}", + deps.iter().map(|d| &d.file_path).collect::>() + ); +} + +#[test] +fn test_python_import_module_resolution() { + // Test `import myutils` creates correct cross-file edge + let project_dir = TempDir::new().unwrap(); + let db_dir = TempDir::new().unwrap(); + let db = Database::open(&db_dir.path().join("index.db")).unwrap(); + + fs::write( + project_dir.path().join("myutils.py"), + "def do_something():\n pass\n", + ).unwrap(); + fs::write( + project_dir.path().join("main.py"), + "import myutils\n\ndef main():\n myutils.do_something()\n", + ).unwrap(); + + let result = run_full_index(&db, project_dir.path(), None, None).unwrap(); + assert!(result.edges_created > 0, "should create import edges"); + + // Verify dependency: main.py -> myutils.py + let deps = get_import_tree(db.conn(), "main.py", "outgoing", 1).unwrap(); + assert!( + deps.iter().any(|d| d.file_path == "myutils.py"), + "main.py should depend on myutils.py, got: {:?}", + deps.iter().map(|d| &d.file_path).collect::>() + ); +} + +#[test] +fn test_python_external_import_creates_virtual_nodes() { + // Test that external imports create virtual nodes in file + let project_dir = TempDir::new().unwrap(); + let db_dir = TempDir::new().unwrap(); + let db = 
Database::open(&db_dir.path().join("index.db")).unwrap(); + + fs::write( + project_dir.path().join("app.py"), + "import os\nfrom collections import OrderedDict\nfrom flask import Flask\n\ndef main():\n pass\n", + ).unwrap(); + + let result = run_full_index(&db, project_dir.path(), None, None).unwrap(); + assert!(result.files_indexed > 0, "should index the file"); + + // Verify file was created with virtual nodes + let ext_nodes = get_nodes_by_file_path(db.conn(), "").unwrap(); + let ext_names: Vec<&str> = ext_nodes.iter().map(|n| n.name.as_str()).collect(); + assert!(ext_names.contains(&"os"), "should have virtual node for 'os', got: {:?}", ext_names); + assert!(ext_names.contains(&"collections"), "should have virtual node for 'collections', got: {:?}", ext_names); + assert!(ext_names.contains(&"flask"), "should have virtual node for 'flask', got: {:?}", ext_names); + + // Verify dependency_graph shows as a dependency + let deps = get_import_tree(db.conn(), "app.py", "outgoing", 1).unwrap(); + assert!( + deps.iter().any(|d| d.file_path == ""), + "app.py should show dependency, got: {:?}", + deps.iter().map(|d| &d.file_path).collect::>() + ); +} + +#[test] +fn test_python_mixed_internal_external_imports() { + // Test project with both internal and external imports + let project_dir = TempDir::new().unwrap(); + let db_dir = TempDir::new().unwrap(); + let db = Database::open(&db_dir.path().join("index.db")).unwrap(); + + fs::create_dir_all(project_dir.path().join("myapp")).unwrap(); + fs::write( + project_dir.path().join("myapp/utils.py"), + "def helper():\n return 42\n", + ).unwrap(); + fs::write( + project_dir.path().join("myapp/main.py"), + "import os\nfrom myapp.utils import helper\nfrom flask import Flask\n\ndef main():\n helper()\n", + ).unwrap(); + + let result = run_full_index(&db, project_dir.path(), None, None).unwrap(); + assert!(result.edges_created > 0); + + // Should have internal dependency + let deps = get_import_tree(db.conn(), "myapp/main.py", 
"outgoing", 1).unwrap(); + let dep_files: Vec<&str> = deps.iter().map(|d| d.file_path.as_str()).collect(); + assert!(dep_files.contains(&"myapp/utils.py"), "should depend on internal utils.py, got: {:?}", dep_files); + + // Should also have external dependency + assert!(dep_files.contains(&""), "should depend on , got: {:?}", dep_files); +} + +#[test] +fn test_index_stats_skipped_large_file() { + // Verify that IndexResult.stats tracks files skipped due to size + let project_dir = TempDir::new().unwrap(); + let db_dir = TempDir::new().unwrap(); + let db = Database::open(&db_dir.path().join("index.db")).unwrap(); + + // Create a normal file + fs::write(project_dir.path().join("small.ts"), "function ok() {}").unwrap(); + + // Create a file exceeding MAX_FILE_SIZE (10MB) + let big_content = "a".repeat(11 * 1024 * 1024); + fs::write(project_dir.path().join("huge.ts"), &big_content).unwrap(); + + let result = run_full_index(&db, project_dir.path(), None, None).unwrap(); + assert_eq!(result.files_indexed, 1, "should index the small file"); + assert_eq!(result.stats.files_skipped_size, 1, "should track the large file skip"); +} + +#[test] +fn test_index_stats_skipped_parse_error() { + // Verify that IndexResult.stats tracks files skipped due to parse errors + let project_dir = TempDir::new().unwrap(); + let db_dir = TempDir::new().unwrap(); + let db = Database::open(&db_dir.path().join("index.db")).unwrap(); + + // Create a valid file + fs::write(project_dir.path().join("good.ts"), "function ok() {}").unwrap(); + + // Create a file with an unsupported extension that detect_language returns None for + // (this is filtered by detect_language returning None, not a parse error) + // Instead, we just verify the default stats are zero for parse errors + let result = run_full_index(&db, project_dir.path(), None, None).unwrap(); + assert_eq!(result.stats.files_skipped_parse, 0); + assert_eq!(result.stats.files_skipped_read, 0); + assert_eq!(result.stats.files_skipped_hash, 0); +} 
+ +#[test] +fn test_index_stats_default() { + // IndexStats should implement Default + let stats = IndexStats::default(); + assert_eq!(stats.files_skipped_size, 0); + assert_eq!(stats.files_skipped_parse, 0); + assert_eq!(stats.files_skipped_read, 0); + assert_eq!(stats.files_skipped_hash, 0); + assert_eq!(stats.files_skipped_language, 0); +} + +#[test] +fn test_python_external_survives_incremental_index() { + // Test that pseudo-file persists across incremental re-indexes + let project_dir = TempDir::new().unwrap(); + let db_dir = TempDir::new().unwrap(); + let db = Database::open(&db_dir.path().join("index.db")).unwrap(); + + fs::write( + project_dir.path().join("app.py"), + "import os\n\ndef main():\n pass\n", + ).unwrap(); + + // Full index → creates with "os" node + run_full_index(&db, project_dir.path(), None, None).unwrap(); + let ext_before = get_nodes_by_file_path(db.conn(), "").unwrap(); + assert!(!ext_before.is_empty(), "should have external nodes after full index"); + + // Modify file slightly + fs::write( + project_dir.path().join("app.py"), + "import os\n\ndef main():\n return 1\n", + ).unwrap(); + + // Incremental index → should survive + run_incremental_index(&db, project_dir.path(), None, None).unwrap(); + let ext_after = get_nodes_by_file_path(db.conn(), "").unwrap(); + assert!(!ext_after.is_empty(), "external nodes should survive incremental index"); + + // Verify dependency still visible + let deps = get_import_tree(db.conn(), "app.py", "outgoing", 1).unwrap(); + assert!( + deps.iter().any(|d| d.file_path == ""), + "app.py should still show dependency after incremental index" + ); +} + +#[test] +fn test_repair_null_context_strings() { + let project_dir = TempDir::new().unwrap(); + let db_dir = TempDir::new().unwrap(); + let db = Database::open(&db_dir.path().join("index.db")).unwrap(); + + // Index a file so nodes get context strings + fs::write(project_dir.path().join("a.ts"), r#" +function alpha() { return 1; } +function beta() { alpha(); } 
+"#).unwrap(); + run_full_index(&db, project_dir.path(), None, None).unwrap(); + + // Verify context strings exist after index + let alpha_nodes = get_nodes_by_name(db.conn(), "alpha").unwrap(); + assert_eq!(alpha_nodes.len(), 1); + assert!(alpha_nodes[0].context_string.is_some(), "alpha should have context_string after index"); + + let beta_nodes = get_nodes_by_name(db.conn(), "beta").unwrap(); + assert_eq!(beta_nodes.len(), 1); + assert!(beta_nodes[0].context_string.is_some(), "beta should have context_string after index"); + + // Simulate Phase 3 failure: NULL out context_strings + db.conn().execute("UPDATE nodes SET context_string = NULL", []).unwrap(); + + // Verify they are now NULL + let alpha_after_null = get_nodes_by_name(db.conn(), "alpha").unwrap(); + assert!(alpha_after_null[0].context_string.is_none(), "alpha context_string should be NULL after simulated failure"); + + // Run repair + let repaired = repair_null_context_strings(&db, None).unwrap(); + assert!(repaired > 0, "should repair at least 1 node"); + + // Verify context strings were restored + let alpha_repaired = get_nodes_by_name(db.conn(), "alpha").unwrap(); + assert!(alpha_repaired[0].context_string.is_some(), "alpha should have context_string after repair"); + + let beta_repaired = get_nodes_by_name(db.conn(), "beta").unwrap(); + assert!(beta_repaired[0].context_string.is_some(), "beta should have context_string after repair"); +} + +#[test] +fn test_rust_implements_creates_sentinel_for_external_trait() { + let project_dir = TempDir::new().unwrap(); + let db_dir = TempDir::new().unwrap(); + let db = Database::open(&db_dir.path().join("index.db")).unwrap(); + + fs::write(project_dir.path().join("main.rs"), r#" +use std::io::{self, Write}; +use std::fmt; + +struct MyWriter; + +impl Write for MyWriter { + fn write(&mut self, buf: &[u8]) -> io::Result { Ok(buf.len()) } + fn flush(&mut self) -> io::Result<()> { Ok(()) } +} + +impl fmt::Display for MyWriter { + fn fmt(&self, f: &mut 
fmt::Formatter) -> fmt::Result { + write!(f, "MyWriter") + } +} +"#).unwrap(); + + let result = run_full_index(&db, project_dir.path(), None, None).unwrap(); + assert!(result.files_indexed > 0); + + // Verify sentinel nodes created for external traits + let ext_nodes = get_nodes_by_file_path(db.conn(), "").unwrap(); + let ext_names: Vec<&str> = ext_nodes.iter().map(|n| n.name.as_str()).collect(); + assert!(ext_names.contains(&"Write"), "should have sentinel for Write, got: {:?}", ext_names); + // fmt::Display keeps path prefix (as parsed by tree-sitter) + assert!(ext_names.contains(&"fmt::Display"), "should have sentinel for fmt::Display, got: {:?}", ext_names); + + // Verify sentinel type is "trait" + let write_node = ext_nodes.iter().find(|n| n.name == "Write").unwrap(); + assert_eq!(write_node.node_type, "trait", "sentinel should be type 'trait'"); + + // Verify implements edges exist: MyWriter → Write, MyWriter → Display + let edges: Vec<(String, String)> = db.conn().prepare( + "SELECT ns.name, nt.name FROM edges e + JOIN nodes ns ON ns.id = e.source_id + JOIN nodes nt ON nt.id = e.target_id + WHERE e.relation = 'implements'" + ).unwrap() + .query_map([], |row| Ok((row.get(0)?, row.get(1)?))) + .unwrap() + .collect::, _>>().unwrap(); + + assert!(edges.contains(&("MyWriter".into(), "Write".into())), + "should have MyWriter→Write implements edge, got: {:?}", edges); + assert!(edges.contains(&("MyWriter".into(), "fmt::Display".into())), + "should have MyWriter→fmt::Display implements edge, got: {:?}", edges); +} + +/// ensure_file_indexed must (a) be a no-op when on-disk hash matches the +/// stored hash, and (b) actually pick up post-edit content when it doesn't. +/// This is the contract the MCP `ensure_file_fresh_opt` wrapper relies on +/// to close the post-Edit→pre-incremental-index window. 
#[test]
fn test_ensure_file_indexed_picks_up_post_edit_changes() {
    let project_dir = TempDir::new().unwrap();
    let db_dir = TempDir::new().unwrap();
    let db = Database::open(&db_dir.path().join("index.db")).unwrap();

    // Initial state: file with `alpha`
    fs::write(project_dir.path().join("a.ts"), "function alpha() {}\n").unwrap();
    run_full_index(&db, project_dir.path(), None, None).unwrap();
    let names_before: Vec<String> = get_nodes_by_name(db.conn(), "alpha")
        .unwrap().into_iter().map(|n| n.name).collect();
    assert_eq!(names_before, vec!["alpha".to_string()]);

    // No-op when hashes match
    let did = ensure_file_indexed(&db, project_dir.path(), "a.ts", None).unwrap();
    assert!(!did, "matching hash must be a no-op (got reindex)");

    // Edit on disk; old `alpha` removed, new `beta` added
    fs::write(project_dir.path().join("a.ts"), "function beta() {}\n").unwrap();
    let did2 = ensure_file_indexed(&db, project_dir.path(), "a.ts", None).unwrap();
    assert!(did2, "hash mismatch must trigger a reindex");

    // alpha gone, beta present — post-Edit query would now see fresh state
    assert!(get_nodes_by_name(db.conn(), "alpha").unwrap().is_empty(),
        "old alpha must be evicted by single-file reindex");
    let beta = get_nodes_by_name(db.conn(), "beta").unwrap();
    assert_eq!(beta.len(), 1, "new beta must appear after single-file reindex");
    assert_eq!(beta[0].name, "beta");

    // Calling again with no on-disk change is a no-op
    let did3 = ensure_file_indexed(&db, project_dir.path(), "a.ts", None).unwrap();
    assert!(!did3, "second call with no edit must no-op");

    // Deleting the file from disk drops the row
    fs::remove_file(project_dir.path().join("a.ts")).unwrap();
    let did4 = ensure_file_indexed(&db, project_dir.path(), "a.ts", None).unwrap();
    assert!(did4, "missing file must trigger row cleanup");
    assert!(get_nodes_by_name(db.conn(), "beta").unwrap().is_empty(),
        "beta must be cascade-deleted with its file");
}

/// Root-cause test for `feedback_incremental_edge_timing.md`: file B
/// (existing, unchanged) bare-name calls `foo()`. file A is added later
/// with `function foo() {}`. Phase 2 of B's first index pass dropped the
/// edge because `foo` was unresolvable; before this fix, A's later index
/// never re-resolved B's call → permanently missing edge in incremental
/// mode (only `rebuild-index` recovered it).
///
/// New behavior: B's drop becomes a `pending_unresolved_calls` row; A's
/// index pass sweeps pending and promotes the row into a real edge.
#[test]
fn test_pending_unresolved_call_resolves_when_callee_added_later() {
    use crate::storage::queries::{count_pending_unresolved_calls, get_node_ids_by_name};

    let project_dir = TempDir::new().unwrap();
    let db_dir = TempDir::new().unwrap();
    let db = Database::open(&db_dir.path().join("index.db")).unwrap();

    // Step 1: B exists alone with bare-name call to foo (foo undefined).
    fs::write(project_dir.path().join("b.ts"),
        "function caller_b() { foo(); }\n").unwrap();
    run_full_index(&db, project_dir.path(), None, None).unwrap();

    // Phase 2 dropped the edge (no same-file/same-language target) and
    // buffered the row instead.
    assert_eq!(count_pending_unresolved_calls(db.conn()).unwrap(), 1,
        "B's call to undefined foo must land in pending_unresolved_calls");

    let caller_b_id = get_node_ids_by_name(db.conn(), "caller_b").unwrap()
        .into_iter().next().expect("caller_b must exist").0;

    // Verify NO edge yet (foo doesn't exist in DB).
    let pre_edges = crate::storage::queries::get_edges_from(db.conn(), caller_b_id).unwrap();
    assert!(pre_edges.iter().all(|e| e.relation != REL_CALLS),
        "no calls edge should exist yet — foo is undefined");

    // Step 2: A is added with foo(). Incremental index picks it up; the
    // pending sweep at end of index_files promotes B's buffered call into
    // a real edge.
    fs::write(project_dir.path().join("a.ts"),
        "export function foo() {}\n").unwrap();
    run_incremental_index(&db, project_dir.path(), None, None).unwrap();

    let foo_id = get_node_ids_by_name(db.conn(), "foo").unwrap()
        .into_iter().next().expect("foo must exist after A indexed").0;

    let post_edges = crate::storage::queries::get_edges_from(db.conn(), caller_b_id).unwrap();
    let calls_to_foo: Vec<_> = post_edges.iter()
        .filter(|e| e.relation == REL_CALLS && e.target_id == foo_id)
        .collect();
    assert_eq!(calls_to_foo.len(), 1,
        "incremental index must promote pending call → calls edge caller_b → foo; \
         got edges: {:?}", post_edges.iter().map(|e| (&e.relation, e.target_id)).collect::<Vec<_>>());

    // Pending row must be drained after successful resolution.
    assert_eq!(count_pending_unresolved_calls(db.conn()).unwrap(), 0,
        "resolved pending row must be deleted after edge insertion");
}

/// Cross-language pending must NOT resolve cross-language. If B (TS)
/// calls `update()` and a later-indexed Rust file defines `fn update()`,
/// the pending row must stay buffered, not silently bind cross-language
/// (memory `feedback_edge_resolution_same_language.md`'s canonical
/// false-positive class).
#[test]
fn test_pending_unresolved_call_does_not_cross_language() {
    use crate::storage::queries::count_pending_unresolved_calls;

    let project_dir = TempDir::new().unwrap();
    let db_dir = TempDir::new().unwrap();
    let db = Database::open(&db_dir.path().join("index.db")).unwrap();

    // TS file with bare-name call to `update`
    fs::write(project_dir.path().join("client.ts"),
        "function caller_ts() { update(); }\n").unwrap();
    run_full_index(&db, project_dir.path(), None, None).unwrap();
    assert_eq!(count_pending_unresolved_calls(db.conn()).unwrap(), 1);

    // Rust file with `update` — different language, must NOT match.
    fs::write(project_dir.path().join("hasher.rs"),
        "fn update() {}\n").unwrap();
    run_incremental_index(&db, project_dir.path(), None, None).unwrap();

    // Pending row stays — sweep refused cross-language resolution.
    assert_eq!(count_pending_unresolved_calls(db.conn()).unwrap(), 1,
        "cross-language target must NOT resolve a TS pending call to a Rust fn");
}

/// One caller with N undefined references must produce N pending rows;
/// when a single later-added file defines all N, all rows must resolve in
/// a single sweep. Real codebases hit this whenever a "barrel" or shared
/// utility module gets added after its consumers.
#[test]
fn test_pending_resolves_multiple_calls_in_same_caller() {
    use crate::storage::queries::{count_pending_unresolved_calls, get_node_ids_by_name};

    let project_dir = TempDir::new().unwrap();
    let db_dir = TempDir::new().unwrap();
    let db = Database::open(&db_dir.path().join("index.db")).unwrap();

    // B has three undefined call targets — foo, bar, baz.
    fs::write(project_dir.path().join("b.ts"),
        "function caller_b() { foo(); bar(); baz(); }\n").unwrap();
    run_full_index(&db, project_dir.path(), None, None).unwrap();

    assert_eq!(count_pending_unresolved_calls(db.conn()).unwrap(), 3,
        "three bare-name calls must produce three pending rows");

    // A defines all three.
    fs::write(project_dir.path().join("a.ts"),
        "export function foo() {}\nexport function bar() {}\nexport function baz() {}\n").unwrap();
    run_incremental_index(&db, project_dir.path(), None, None).unwrap();

    assert_eq!(count_pending_unresolved_calls(db.conn()).unwrap(), 0,
        "all three pending rows must drain once their targets exist");

    // All three resolved into real edges.
    let caller_b_id = get_node_ids_by_name(db.conn(), "caller_b").unwrap()
        .into_iter().next().unwrap().0;
    let edges = crate::storage::queries::get_edges_from(db.conn(), caller_b_id).unwrap();
    let calls_count = edges.iter().filter(|e| e.relation == REL_CALLS).count();
    assert_eq!(calls_count, 3,
        "caller_b must have exactly three calls edges (foo, bar, baz); got {} edges total: {:?}",
        calls_count, edges.iter().map(|e| (&e.relation, e.target_id)).collect::<Vec<_>>());
}

/// When the caller's source file is reindexed (e.g. user edits B), the
/// cascade FK on pending_unresolved_calls(source_id) must drop B's pending
/// rows so a fresh Phase 2 can re-buffer them with the current source IDs.
/// This is the schema's load-bearing self-cleaning property — we test it
/// explicitly so a future migration that drops or weakens the FK fails
/// loudly here rather than leaking pending rows for ever-removed callers.
#[test]
fn test_pending_cascade_deletes_when_caller_file_reindexed() {
    use crate::storage::queries::count_pending_unresolved_calls;

    let project_dir = TempDir::new().unwrap();
    let db_dir = TempDir::new().unwrap();
    let db = Database::open(&db_dir.path().join("index.db")).unwrap();

    // B with undefined target → pending row created.
    fs::write(project_dir.path().join("b.ts"),
        "function caller_b() { undefined_target(); }\n").unwrap();
    run_full_index(&db, project_dir.path(), None, None).unwrap();
    assert_eq!(count_pending_unresolved_calls(db.conn()).unwrap(), 1);

    // Edit B to remove the call entirely. caller_b's old node gets
    // cascade-deleted on reindex (Phase 1 deletes prior rows), and its
    // pending row must follow it via ON DELETE CASCADE on source_id.
    fs::write(project_dir.path().join("b.ts"),
        "function caller_b() { /* call removed */ }\n").unwrap();
    run_incremental_index(&db, project_dir.path(), None, None).unwrap();

    assert_eq!(count_pending_unresolved_calls(db.conn()).unwrap(), 0,
        "pending row must be cascade-deleted when its source caller is removed/reindexed");
}

/// Inverse-direction symmetry test for `feedback_incremental_edge_timing.md`:
/// existing edge B → A.foo gets cascade-deleted when A is removed, and B
/// is NOT in changed_paths (deletion doesn't re-extract B). Without Phase 0
/// pre-cascade buffering, B has neither edge nor pending row — a permanent
/// silent edge loss until full rebuild. The Phase 0 buffer (added by this
/// fix) must capture B's call as a pending row before cascade fires.
#[test]
fn test_pending_buffers_on_callee_file_deletion() {
    use crate::storage::queries::{count_pending_unresolved_calls, get_node_ids_by_name};

    let project_dir = TempDir::new().unwrap();
    let db_dir = TempDir::new().unwrap();
    let db = Database::open(&db_dir.path().join("index.db")).unwrap();

    // Initial: A defines foo, B calls foo — edge B.caller_b → A.foo exists.
    fs::write(project_dir.path().join("a.ts"),
        "export function foo() {}\n").unwrap();
    fs::write(project_dir.path().join("b.ts"),
        "function caller_b() { foo(); }\n").unwrap();
    run_full_index(&db, project_dir.path(), None, None).unwrap();

    // No pending rows yet — call resolved at index time.
    assert_eq!(count_pending_unresolved_calls(db.conn()).unwrap(), 0,
        "fully-resolvable call must not produce a pending row");

    let caller_b_id = get_node_ids_by_name(db.conn(), "caller_b").unwrap()
        .into_iter().next().unwrap().0;
    let foo_id_pre = get_node_ids_by_name(db.conn(), "foo").unwrap()
        .into_iter().next().unwrap().0;
    let edges_pre = crate::storage::queries::get_edges_from(db.conn(), caller_b_id).unwrap();
    assert!(edges_pre.iter().any(|e| e.relation == REL_CALLS && e.target_id == foo_id_pre),
        "edge caller_b → foo must exist pre-deletion");

    // Delete A. Phase 0 must buffer B's now-orphaned call into pending
    // BEFORE cascade strips the edge.
    fs::remove_file(project_dir.path().join("a.ts")).unwrap();
    run_incremental_index(&db, project_dir.path(), None, None).unwrap();

    // foo is gone.
    assert!(get_node_ids_by_name(db.conn(), "foo").unwrap().is_empty(),
        "foo must be cascade-deleted with file a.ts");

    // B's edge to old foo is gone, but pending row holds the call.
    assert_eq!(count_pending_unresolved_calls(db.conn()).unwrap(), 1,
        "Phase 0 must buffer the orphaned inbound call into pending");

    // Re-add A — pending sweep promotes the buffered call to a fresh edge.
    fs::write(project_dir.path().join("a.ts"),
        "export function foo() {}\n").unwrap();
    run_incremental_index(&db, project_dir.path(), None, None).unwrap();

    assert_eq!(count_pending_unresolved_calls(db.conn()).unwrap(), 0,
        "pending must drain once foo reappears");

    let foo_id_post = get_node_ids_by_name(db.conn(), "foo").unwrap()
        .into_iter().next().unwrap().0;
    let edges_post = crate::storage::queries::get_edges_from(db.conn(), caller_b_id).unwrap();
    assert!(edges_post.iter().any(|e| e.relation == REL_CALLS && e.target_id == foo_id_post),
        "edge caller_b → foo must reappear post re-add via pending sweep");
}