diff --git a/code_review_graph/graph.py b/code_review_graph/graph.py index 2fe2ccd..ad26630 100644 --- a/code_review_graph/graph.py +++ b/code_review_graph/graph.py @@ -306,24 +306,43 @@ def get_all_files(self) -> list[str]: return [r["file_path"] for r in rows] def search_nodes(self, query: str, limit: int = 20) -> list[GraphNode]: - """Keyword search across node names with multi-word AND logic. + """Keyword search across node names. - Each word in the query must match independently (case-insensitive) - against the node name or qualified name. For example, - ``"firebase auth"`` matches ``verify_firebase_token`` and - ``FirebaseAuth`` but not ``get_user``. + Tries FTS5 first (fast, tokenized matching), then falls back to + LIKE-based substring search when FTS5 returns no results. """ - words = query.lower().split() + words = query.split() if not words: return [] + # Phase 1: FTS5 search (uses the indexed nodes_fts table) + try: + if len(words) == 1: + fts_query = '"' + query.replace('"', '""') + '"' + else: + fts_query = " AND ".join( + '"' + w.replace('"', '""') + '"' for w in words + ) + rows = self._conn.execute( + "SELECT n.* FROM nodes_fts f " + "JOIN nodes n ON f.rowid = n.id " + "WHERE nodes_fts MATCH ? LIMIT ?", + (fts_query, limit), + ).fetchall() + if rows: + return [self._row_to_node(r) for r in rows] + except Exception: + pass # FTS5 table may not exist on older schemas + + # Phase 2: LIKE fallback (substring matching) conditions: list[str] = [] params: list[str | int] = [] for word in words: + w = word.lower() conditions.append( "(LOWER(name) LIKE ? OR LOWER(qualified_name) LIKE ?)" ) - params.extend([f"%{word}%", f"%{word}%"]) + params.extend([f"%{w}%", f"%{w}%"]) where = " AND ".join(conditions) sql = f"SELECT * FROM nodes WHERE {where} LIMIT ?" # nosec B608 diff --git a/code_review_graph/migrations.py b/code_review_graph/migrations.py index 0342a76..91ff287 100644 --- a/code_review_graph/migrations.py +++ b/code_review_graph/migrations.py @@ -156,6 +156,15 @@ def _migrate_v5(conn: sqlite3.Connection) -> None: logger.info("Migration v5: created nodes_fts FTS5 virtual table") +def _migrate_v6(conn: sqlite3.Connection) -> None: + """v6: Add composite index on edges for upsert_edge performance.""" + conn.execute(""" + CREATE INDEX IF NOT EXISTS idx_edges_composite + ON edges(kind, source_qualified, target_qualified, file_path, line) + """) + logger.info("Migration v6: created composite edge index") + + # --------------------------------------------------------------------------- # Migration registry # --------------------------------------------------------------------------- @@ -165,6 +174,7 @@ def _migrate_v5(conn: sqlite3.Connection) -> None: 3: _migrate_v3, 4: _migrate_v4, 5: _migrate_v5, + 6: _migrate_v6, } LATEST_VERSION = max(MIGRATIONS.keys()) diff --git a/code_review_graph/search.py b/code_review_graph/search.py index d2eb84e..59020ce 100644 --- a/code_review_graph/search.py +++ b/code_review_graph/search.py @@ -143,8 +143,14 @@ def _fts_search( Returns list of ``(node_id, bm25_score)`` tuples. The BM25 score is negated so higher = better (FTS5 returns negative BM25). """ - # Sanitize: wrap in double quotes to prevent FTS5 operator injection - safe_query = '"' + query.replace('"', '""') + '"' + # Split multi-word queries into AND-joined terms so "graph store" matches + # both "GraphStore" and nodes containing both words (not just exact phrase). + # Each term is quoted to prevent FTS5 operator injection. + terms = query.split() + if len(terms) <= 1: + safe_query = '"' + query.replace('"', '""') + '"' + else: + safe_query = " AND ".join('"' + t.replace('"', '""') + '"' for t in terms) try: rows = conn.execute( @@ -357,6 +363,8 @@ def hybrid_search( boost *= kind_boosts["_qualified"] if context_set and file_path in context_set: boost *= 1.5 + if row["is_test"]: + boost *= 0.5 boosted.append((node_id, score * boost)) diff --git a/code_review_graph/tools/query.py b/code_review_graph/tools/query.py index 70fe8f1..ff67784 100644 --- a/code_review_graph/tools/query.py +++ b/code_review_graph/tools/query.py @@ -174,14 +174,20 @@ def query_graph( node = candidates[0] target = node.qualified_name elif len(candidates) > 1: - return { - "status": "ambiguous", - "summary": ( - f"Multiple matches for '{target}'. " - "Please use a qualified name." - ), - "candidates": [node_to_dict(c) for c in candidates], - } + # Prefer non-test nodes when exactly one production candidate + non_test = [c for c in candidates if not c.is_test] + if len(non_test) == 1: + node = non_test[0] + target = node.qualified_name + else: + return { + "status": "ambiguous", + "summary": ( + f"Multiple matches for '{target}'. " + "Please use a qualified name." + ), + "candidates": [node_to_dict(c) for c in candidates], + } if not node and pattern != "file_summary": return { @@ -192,10 +198,12 @@ def query_graph( qn = node.qualified_name if node else target if pattern == "callers_of": + seen_qn: set[str] = set() for e in store.get_edges_by_target(qn): if e.kind == "CALLS": caller = store.get_node(e.source_qualified) - if caller: + if caller and caller.qualified_name not in seen_qn: + seen_qn.add(caller.qualified_name) results.append(node_to_dict(caller)) edges_out.append(edge_to_dict(e)) # Fallback: CALLS edges store unqualified target names @@ -204,15 +212,18 @@ def query_graph( if not results and node: for e in store.search_edges_by_target_name(node.name): caller = store.get_node(e.source_qualified) - if caller: + if caller and caller.qualified_name not in seen_qn: + seen_qn.add(caller.qualified_name) results.append(node_to_dict(caller)) edges_out.append(edge_to_dict(e)) elif pattern == "callees_of": + seen_qn: set[str] = set() for e in store.get_edges_by_source(qn): if e.kind == "CALLS": callee = store.get_node(e.target_qualified) - if callee: + if callee and callee.qualified_name not in seen_qn: + seen_qn.add(callee.qualified_name) results.append(node_to_dict(callee)) edges_out.append(edge_to_dict(e)) @@ -261,10 +272,12 @@ def query_graph( results.append(node_to_dict(t)) elif pattern == "inheritors_of": + seen_qn: set[str] = set() for e in store.get_edges_by_target(qn): if e.kind in ("INHERITS", "IMPLEMENTS"): child = store.get_node(e.source_qualified) - if child: + if child and child.qualified_name not in seen_qn: + seen_qn.add(child.qualified_name) results.append(node_to_dict(child)) edges_out.append(edge_to_dict(e))