mcp-cpp-project-indexer/cpp_index_utils.py at main · mef-programming/mcp-cpp-project-indexer · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
from __future__ import annotations

import hashlib
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Any


# ---------------------------------------------------------------------------
# Time / JSON / hashing
# ---------------------------------------------------------------------------

def utc_now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string ending in ``Z``.

    Microseconds are zeroed before formatting, so the result has whole-second
    precision, e.g. ``2024-01-31T12:34:56Z``.
    """
    now_utc = datetime.now(timezone.utc)
    whole_seconds = now_utc.replace(microsecond=0)
    stamp = whole_seconds.isoformat()

    # isoformat() spells the UTC offset as "+00:00"; use the "Z" suffix instead.
    return stamp.replace("+00:00", "Z")


def sha256_hex(data: bytes | str) -> str:
    """Return the SHA-256 digest of *data* as a lowercase hex string.

    Text input is encoded as UTF-8 before hashing; bytes are hashed as-is.
    """
    payload = data.encode("utf-8") if isinstance(data, str) else data
    digest = hashlib.sha256(payload)
    return digest.hexdigest()


def canonical_json(data: Any) -> str:
    """Serialize *data* to a compact JSON string with a stable key order.

    Keys are sorted, no whitespace is emitted between tokens, and non-ASCII
    characters are written literally rather than as ``\\u`` escapes.
    """
    compact_separators = (",", ":")
    return json.dumps(
        data,
        sort_keys=True,
        separators=compact_separators,
        ensure_ascii=False,
    )


def save_json(path: Path, data: Any) -> None:
    """Write *data* to *path* as indented UTF-8 JSON.

    Missing parent directories are created first. Output uses a two-space
    indent and keeps non-ASCII characters unescaped.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)

    rendered = json.dumps(data, indent=2, ensure_ascii=False)
    path.write_text(rendered, encoding="utf-8")


def load_json(path: Path) -> Any:
    """Read the UTF-8 file at *path* and return its parsed JSON content."""
    raw_text = path.read_text(encoding="utf-8")
    return json.loads(raw_text)


# ---------------------------------------------------------------------------
# Source loading / path identity
# ---------------------------------------------------------------------------

def detect_newline_kind(raw: bytes) -> str:
    """Classify the line-ending convention used in *raw*.

    Returns ``"crlf"``, ``"lf"`` or ``"cr"`` when exactly one convention
    occurs, ``"mixed"`` when more than one occurs, and ``"unknown"`` when the
    data contains no line ending at all.
    """
    crlf_count = raw.count(b"\r\n")

    # Every CRLF pair contributes one "\n" and one "\r" to the totals, so
    # subtracting the pair count leaves only the stand-alone terminators.
    lone_lf_count = raw.count(b"\n") - crlf_count
    lone_cr_count = raw.count(b"\r") - crlf_count

    tallies = (
        ("crlf", crlf_count),
        ("lf", lone_lf_count),
        ("cr", lone_cr_count),
    )
    present = [label for label, count in tallies if count > 0]

    if not present:
        return "unknown"

    if len(present) > 1:
        return "mixed"

    return present[0]


def decode_source(raw: bytes) -> tuple[str, str]:
    """Decode *raw* source bytes and report which decoding was used.

    Returns ``(text, label)``. The candidate encodings are tried in order and
    the first one that decodes without error wins; its name is the label. If
    every candidate fails, the bytes are decoded as UTF-8 with undecodable
    bytes replaced and the label is ``"utf-8-replace"``.
    """
    # Most project files should be UTF-8. cp1252 is a pragmatic fallback for
    # legacy Windows source files. The indexer is a routing tool, so preserving
    # line numbers is more important than perfect text recovery for rare bytes.
    #
    # NOTE(review): "utf-8-sig" also decodes BOM-less UTF-8, so valid UTF-8
    # input appears to always be labelled "utf-8-sig" - confirm callers expect
    # that label.
    candidates = ("utf-8-sig", "utf-8", "cp1252")

    for codec in candidates:
        try:
            text = raw.decode(codec)
        except UnicodeDecodeError:
            continue
        else:
            return text, codec

    lossy_text = raw.decode("utf-8", errors="replace")
    return lossy_text, "utf-8-replace"


def split_source_lines_preserve_count(text: str) -> list[str]:
    """Split *text* into lines (terminators removed) for 1-based line ranges.

    Only CRLF, LF and CR end a line. ``str.splitlines()`` is deliberately not
    used: it also breaks on form feed, vertical tab, U+0085, U+2028, U+2029
    and the FS/GS/RS control characters. Any of those inside a source line
    would add extra entries and shift every later line number.

    A trailing final newline does not create an extra logical source line,
    matching normal editor line numbering. Empty text yields an empty list.
    """
    # CRLF is listed first so a "\r\n" pair counts as a single terminator.
    parts = re.split(r"\r\n|\r|\n", text)

    # re.split always returns at least one item. The last item is empty when
    # the text is empty or ends with a terminator; it is not a real line.
    if parts[-1] == "":
        parts.pop()

    return parts


def normalized_relative_path(path: Path, project_root: Path | None) -> str:
    """Return *path* relative to *project_root* using forward slashes.

    Both paths are resolved before comparison. When no project root is given,
    or the resolved path does not lie under the resolved root, only the bare
    file name of *path* is returned.
    """
    absolute = path.resolve()
    name_only = Path(path.name)

    if project_root is None:
        return name_only.as_posix()

    try:
        inside_root = absolute.relative_to(project_root.resolve())
    except ValueError:
        # relative_to() raises ValueError when the path is outside the root.
        return name_only.as_posix()

    return inside_root.as_posix()


def normalize_path_for_hash(
    relative_path: str,
    *,
    case_insensitive_paths: bool = True,
) -> str:
    """Bring a relative path into the canonical form used for hashing.

    Backslashes become forward slashes, runs of slashes collapse to one, and
    leading/trailing slashes are removed. With *case_insensitive_paths* set
    (the default) the result is also case-folded.
    """
    forward_slashed = relative_path.replace("\\", "/")

    # Dropping empty segments both collapses repeated slashes and trims any
    # slash at either end.
    segments = [segment for segment in forward_slashed.split("/") if segment]
    cleaned = "/".join(segments)

    return cleaned.casefold() if case_insensitive_paths else cleaned


def make_path_hash(
    relative_path: str,
    *,
    case_insensitive_paths: bool = True,
) -> str:
    """Return the SHA-256 hex digest of the normalized form of *relative_path*.

    The path is first passed through ``normalize_path_for_hash`` with the same
    *case_insensitive_paths* setting, then hashed with ``sha256_hex``.
    """
    hash_input = normalize_path_for_hash(
        relative_path,
        case_insensitive_paths=case_insensitive_paths,
    )
    return sha256_hex(hash_input)


def make_content_hash(raw: bytes) -> str:
    """Return the ``sha256_hex`` digest of the raw file bytes."""
    content_digest = sha256_hex(raw)
    return content_digest


def make_file_id(path_hash: str, *, length: int = 24) -> str:
    """Build a file id: ``f_`` followed by the first *length* hash characters."""
    hash_prefix = path_hash[:length]
    return "f_" + hash_prefix


def safe_name(text: str) -> str:
    """Return *text* with each reserved character replaced by an underscore.

    The replaced characters are slash, backslash, colon, asterisk, question
    mark, double quote, both angle brackets and the pipe.
    """
    reserved_characters = '/\\:*?"<>|'
    replacement_table = str.maketrans(
        {character: "_" for character in reserved_characters}
    )
    return text.translate(replacement_table)


# ---------------------------------------------------------------------------
# Source text / signature normalization
# ---------------------------------------------------------------------------

def normalize_signature_spacing(signature: str) -> str:
    """Canonicalize the whitespace of a signature string.

    All whitespace runs are first collapsed to a single space and the ends are
    trimmed. A fixed sequence of punctuation rules is then applied, each one
    operating on the output of the previous rule.
    """
    # Order matters: the rules are applied top to bottom.
    punctuation_rules = (
        (r"\s+,", ","),
        (r"\(\s+", "("),
        (r"\s+\)", ")"),
        (r"\[\s+", "["),
        (r"\s+\]", "]"),
        (r"\s+;", ";"),
        (r"\s+:", " :"),
        (r"\s+<\s+", " <"),
        (r"\s+>", ">"),
        (r"\s+::\s+", "::"),
        (r"~\s+", "~"),
    )

    normalized = re.sub(r"\s+", " ", signature).strip()

    for pattern, replacement in punctuation_rules:
        normalized = re.sub(pattern, replacement, normalized)

    return normalized


def source_text_range(
    lines: list[str],
    start_line: int,
    end_line: int,
    end_col0_exclusive: int | None = None,
) -> str:
    """Return the text of a 1-based inclusive line range as one normalized line.

    The range is clamped to the available lines; an empty range yields ``""``.
    When *end_col0_exclusive* is given, the last selected line is cut at that
    0-based column (exclusive). The selected lines are joined with single
    spaces and passed through ``normalize_signature_spacing``.
    """
    first_line = max(start_line, 1)
    last_line = min(end_line, len(lines))

    if first_line > last_line:
        return ""

    # Slicing copies, so trimming the last entry leaves the caller's list intact.
    selected = lines[first_line - 1:last_line]

    if end_col0_exclusive is not None:
        selected[-1] = selected[-1][:end_col0_exclusive]

    return normalize_signature_spacing(" ".join(selected))


def strip_line_prefix(line: str) -> str:
    """Remove a leading numeric line prefix from *line*, if one is present.

    The prefix is four or more digits at the very start, followed by a colon
    and at most one whitespace character. Lines without it are returned
    unchanged.
    """
    prefix = re.match(r"\d{4,}:\s?", line)

    if prefix is None:
        return line

    return line[prefix.end():]


# ---------------------------------------------------------------------------
# Symbol identity
# ---------------------------------------------------------------------------

def make_signature_hash(signature_key: dict[str, Any]) -> str:
    """Hash a signature key dict via its ``canonical_json`` form and ``sha256_hex``."""
    serialized_key = canonical_json(signature_key)
    return sha256_hex(serialized_key)


def make_symbol_id(
    *,
    file_id: str,
    start_line: int,
    end_line: int,
    signature_hash: str,
    hash_length: int = 12,
) -> str:
    """Compose a symbol id from its file id, line span and signature hash.

    The result has the form ``s_f_<file>_<start>_<end>_<hash>``. ``<file>`` is
    *file_id* without a leading ``f_``, the line numbers are zero-padded to six
    digits, and ``<hash>`` is the first *hash_length* characters of
    *signature_hash*.
    """
    # The literal "s_f_" below already carries the "f_" marker, so drop it
    # from the incoming id to avoid doubling it.
    bare_file_id = file_id[2:] if file_id.startswith("f_") else file_id

    line_span = f"{start_line:06d}_{end_line:06d}"
    hash_prefix = signature_hash[:hash_length]

    return f"s_f_{bare_file_id}_{line_span}_{hash_prefix}"


def short_name_from_qualified_name(name: str) -> str:
    """Return the part of a ``::``-qualified name after the last separator.

    An empty name yields an empty string; a name without ``::`` is returned
    unchanged.
    """
    if name:
        # This is intentionally simple. Operator names may contain spaces, but
        # they are still after the final :: in qualified form.
        segments = name.split("::")
        return segments[-1]

    return ""


def container_from_qualified_name(name: str) -> str | None:
    """Return everything before the final ``::`` of a qualified name.

    Returns ``None`` when the name contains no ``::`` at all. A name that
    starts with ``::`` and has no further qualifier yields an empty string.
    """
    segments = name.split("::")

    # A single segment means the separator never occurred.
    if len(segments) == 1:
        return None

    enclosing_segments = segments[:-1]
    return "::".join(enclosing_segments)


# ---------------------------------------------------------------------------
# Range validation helpers
# ---------------------------------------------------------------------------

def is_valid_line_range(
    *,
    start_line: int,
    end_line: int,
    line_count: int,
) -> bool:
    """Tell whether a 1-based inclusive line range fits a file of *line_count* lines.

    The range is valid when it starts at line 1 or later, does not end before
    it starts, and does not end past the last line.
    """
    starts_in_file = start_line >= 1
    is_ordered = end_line >= start_line
    ends_in_file = line_count >= end_line

    return starts_in_file and is_ordered and ends_in_file


def require_valid_line_range(
    *,
    start_line: int,
    end_line: int,
    line_count: int,
    context: str,
) -> None:
    """Raise ``ValueError`` unless the line range passes ``is_valid_line_range``.

    *context* is included in the error message to identify what the range
    belongs to.
    """
    range_ok = is_valid_line_range(
        start_line=start_line,
        end_line=end_line,
        line_count=line_count,
    )

    if range_ok:
        return

    message = (
        f"Invalid source range for {context}: "
        f"{start_line}-{end_line}, line_count={line_count}"
    )
    raise ValueError(message)


def range_contains_line(
    *,
    start_line: int,
    end_line: int,
    line_no: int,
) -> bool:
    """Tell whether *line_no* lies within the inclusive range start..end."""
    before_range = line_no < start_line
    after_range = line_no > end_line

    return not (before_range or after_range)