Skip to content

Commit 3e1ca82

Browse files
committed
unified: Split corpus tests into source code and generated output
The corpus tests interleaved hand-written content (test cases) with generated content (printed ASTs). This made merge conflicts hard to resolve because you can't just regnerate the printed ASTs without potentially throwing away new test cases that came from either branch (or depending on whether the merge conflict markers appeared, the corpus test could be ruined completely). The old design did have one nice advantage: Reviewers could see the printed ASTs alongside the source code from which it was generated. To preserve this feature, the source code for the test case is itself included in the generated output file.
1 parent 3983e4d commit 3e1ca82

2 files changed

Lines changed: 137 additions & 162 deletions

File tree

unified/AGENTS.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,11 @@ This is a CodeQL extractor based on tree-sitter.
1919

2020
- To run tests for the parser and mapping, run `cargo test` in the `extractor` directory.
2121

22-
- Do not edit the printed ASTs in `extractor/test/corpus` directly. To regenerate the ASTs, run `scripts/update-corpus.sh`.
22+
- Extractor test cases are located at `extractor/test/corpus/swift/*/*.swift`.
23+
24+
- Each test case has a corresponding `.output` file containing its generated output along with a copy of the test case itself.
25+
26+
- Check the output files for correctness but do not edit them manually. Regenerate them with `scripts/update-corpus.sh`.
2327

2428
## CodeQL Testing
2529
- If you changed the extractor code, always rebuild it before running CodeQL tests.

unified/extractor/tests/corpus_tests.rs

Lines changed: 132 additions & 161 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ mod languages;
99

1010
#[derive(Debug)]
1111
struct CorpusCase {
12-
name: String,
1312
input: String,
1413
raw: String,
1514
expected: String,
@@ -21,129 +20,43 @@ fn update_mode_enabled() -> bool {
2120
.unwrap_or(false)
2221
}
2322

24-
fn is_header_rule(line: &str) -> bool {
25-
let trimmed = line.trim();
26-
trimmed.len() >= 3 && trimmed.chars().all(|c| c == '=')
27-
}
28-
29-
fn is_next_case_header(lines: &[&str], i: usize) -> bool {
30-
is_header_rule(lines[i])
31-
&& i + 2 < lines.len()
32-
&& !lines[i + 1].trim().is_empty()
33-
&& is_header_rule(lines[i + 2])
34-
}
35-
36-
fn parse_corpus(content: &str) -> Vec<CorpusCase> {
37-
let lines: Vec<&str> = content.lines().collect();
38-
let mut i = 0;
39-
let mut cases = Vec::new();
40-
41-
while i < lines.len() {
42-
while i < lines.len() && lines[i].trim().is_empty() {
43-
i += 1;
44-
}
45-
if i >= lines.len() {
46-
break;
47-
}
48-
49-
assert!(
50-
is_header_rule(lines[i]),
51-
"Expected header delimiter at line {}",
52-
i + 1
53-
);
54-
i += 1;
55-
56-
assert!(i < lines.len(), "Missing test name at line {}", i + 1);
57-
let name = lines[i].trim().to_string();
58-
i += 1;
59-
60-
assert!(
61-
i < lines.len() && is_header_rule(lines[i]),
62-
"Missing closing header delimiter for case {name}"
63-
);
64-
i += 1;
65-
66-
let input_start = i;
67-
while i < lines.len() && lines[i].trim() != "---" {
68-
if is_next_case_header(&lines, i) {
69-
break;
70-
}
71-
i += 1;
72-
}
73-
let input = lines[input_start..i].join("\n").trim_end().to_string();
74-
let raw;
75-
let expected;
76-
if i >= lines.len() || lines[i].trim() != "---" {
77-
// No `---` separator before next case (or EOF). Treat the
78-
// remaining sections as empty.
79-
raw = String::new();
80-
expected = String::new();
81-
} else {
82-
i += 1;
83-
84-
// Raw tree-sitter parse section. New-format files have a second
85-
// `---` separator between the raw tree and the mapped AST. Legacy
86-
// files (with only one separator) have no raw section — in that
87-
// case `raw` stays empty and update mode will populate it.
88-
let raw_start = i;
89-
let mut next_sep = i;
90-
while next_sep < lines.len() && lines[next_sep].trim() != "---" {
91-
if is_next_case_header(&lines, next_sep) {
92-
break;
93-
}
94-
next_sep += 1;
95-
}
96-
raw = if next_sep < lines.len() && lines[next_sep].trim() == "---" {
97-
let raw_text = lines[raw_start..next_sep].join("\n").trim().to_string();
98-
i = next_sep + 1;
99-
raw_text
100-
} else {
101-
String::new()
102-
};
103-
104-
let expected_start = i;
105-
while i < lines.len() {
106-
if is_next_case_header(&lines, i) {
107-
break;
108-
}
109-
i += 1;
110-
}
111-
expected = lines[expected_start..i].join("\n").trim().to_string();
112-
}
23+
/// Parse a corpus `.output` file. The file holds a single test case made of
24+
/// three sections separated by `---` delimiter lines:
25+
///
26+
/// ```text
27+
/// <test case source>
28+
///
29+
/// ---
30+
///
31+
/// <raw tree-sitter parse tree>
32+
///
33+
/// ---
34+
///
35+
/// <mapped AST>
36+
/// ```
37+
///
38+
/// The test name is the file name, so there is no header section. Missing
39+
/// trailing sections (e.g. a freshly added file) are treated as empty.
40+
fn parse_corpus(content: &str) -> CorpusCase {
41+
let lines: Vec<&str> = content.split('\n').collect();
42+
let mut sections = lines
43+
.split(|line| line.trim() == "---")
44+
.map(|chunk| chunk.join("\n").trim().to_string());
11345

114-
cases.push(CorpusCase {
115-
name,
116-
input,
117-
raw,
118-
expected,
119-
});
46+
CorpusCase {
47+
input: sections.next().unwrap_or_default(),
48+
raw: sections.next().unwrap_or_default(),
49+
expected: sections.next().unwrap_or_default(),
12050
}
121-
122-
cases
12351
}
12452

125-
fn render_corpus(cases: &[CorpusCase]) -> String {
126-
let mut out = String::new();
127-
128-
for (idx, case) in cases.iter().enumerate() {
129-
if idx > 0 {
130-
// Blank line between cases.
131-
out.push('\n');
132-
}
133-
out.push_str("===\n");
134-
out.push_str(case.name.trim());
135-
out.push_str("\n===\n\n");
136-
out.push_str(case.input.trim());
137-
out.push_str("\n\n---\n\n");
138-
out.push_str(case.raw.trim());
139-
out.push_str("\n\n---\n\n");
140-
out.push_str(case.expected.trim());
141-
// Single trailing newline per case; the inter-case blank line is
142-
// added by the prefix above, and the file ends with exactly one `\n`.
143-
out.push('\n');
144-
}
145-
146-
out
53+
fn render_corpus(case: &CorpusCase) -> String {
54+
format!(
55+
"{}\n\n---\n\n{}\n\n---\n\n{}\n",
56+
case.input.trim(),
57+
case.raw.trim(),
58+
case.expected.trim()
59+
)
14760
}
14861

14962
fn run_desugaring(lang: &simple::LanguageSpec, input: &str) -> Result<yeast::Ast, String> {
@@ -182,6 +95,26 @@ fn dump_raw_parse(lang: &simple::LanguageSpec, input: &str) -> Result<String, St
18295
Ok(dump_ast(&ast, ast.get_root(), input))
18396
}
18497

98+
/// Collect the set of corpus test "stems" (paths without an extension) under
99+
/// `dir`. A stem is discovered from either a `.swift` source file or a
100+
/// `.output` file, so that a `.swift` with no `.output` (a freshly added test)
101+
/// and an orphaned `.output` with no `.swift` are both surfaced.
102+
fn collect_corpus_stems(dir: &Path, out: &mut Vec<std::path::PathBuf>) {
103+
let entries = fs::read_dir(dir)
104+
.unwrap_or_else(|e| panic!("Failed to read corpus directory {}: {e}", dir.display()));
105+
for entry in entries {
106+
let path = entry.expect("Failed to read corpus entry").path();
107+
if path.is_dir() {
108+
collect_corpus_stems(&path, out);
109+
} else if path
110+
.extension()
111+
.is_some_and(|ext| ext == "swift" || ext == "output")
112+
{
113+
out.push(path.with_extension(""));
114+
}
115+
}
116+
}
117+
185118
#[test]
186119
fn test_corpus() {
187120
let update_mode = update_mode_enabled();
@@ -200,77 +133,115 @@ fn test_corpus() {
200133
continue;
201134
}
202135

203-
let mut corpus_files: Vec<_> = fs::read_dir(&lang_corpus_dir)
204-
.unwrap_or_else(|e| {
205-
panic!(
206-
"Failed to read corpus directory {}: {e}",
207-
lang_corpus_dir.display()
208-
)
209-
})
210-
.map(|entry| entry.expect("Failed to read corpus entry").path())
211-
.filter(|path| path.extension().is_some_and(|ext| ext == "txt"))
212-
.collect();
213-
corpus_files.sort();
136+
let mut stems = Vec::new();
137+
collect_corpus_stems(&lang_corpus_dir, &mut stems);
138+
stems.sort();
139+
stems.dedup();
214140

215-
for corpus_path in corpus_files {
216-
let content = fs::read_to_string(&corpus_path)
217-
.unwrap_or_else(|e| panic!("Failed to read {}: {e}", corpus_path.display()));
218-
let mut cases = parse_corpus(&content);
141+
for stem in stems {
142+
let swift_path = stem.with_extension("swift");
143+
let output_path = stem.with_extension("output");
219144
let mut failures = Vec::new();
220-
assert!(
221-
!cases.is_empty(),
222-
"No corpus cases found in {}",
223-
corpus_path.display()
224-
);
225145

226-
for case in &mut cases {
227-
match dump_raw_parse(&lang, &case.input) {
146+
// The canonical test case lives in the `.swift` file and is the
147+
// source of truth. An `.output` file without a `.swift` sibling is
148+
// an orphan: there is nothing to drive it from.
149+
if !swift_path.exists() {
150+
panic!(
151+
"Found {} with no corresponding test case; add {} or remove the output file",
152+
output_path.display(),
153+
swift_path.display()
154+
);
155+
}
156+
157+
let swift_input = fs::read_to_string(&swift_path)
158+
.unwrap_or_else(|e| panic!("Failed to read {}: {e}", swift_path.display()))
159+
.trim()
160+
.to_string();
161+
162+
// Build the case from the existing `.output` file when present.
163+
// When it is missing (a freshly added `.swift`), start from an
164+
// empty case: update mode will generate it, and a normal test run
165+
// reports the missing output.
166+
let mut case = if output_path.exists() {
167+
let content = fs::read_to_string(&output_path)
168+
.unwrap_or_else(|e| panic!("Failed to read {}: {e}", output_path.display()));
169+
parse_corpus(&content)
170+
} else {
171+
if !update_mode {
172+
failures.push(format!(
173+
"Missing output file {}; run scripts/update-corpus.sh to generate it",
174+
output_path.display()
175+
));
176+
}
177+
CorpusCase {
178+
input: String::new(),
179+
raw: String::new(),
180+
expected: String::new(),
181+
}
182+
};
183+
184+
{
185+
// The input section in the `.output` file is a generated copy
186+
// of the `.swift` source, kept only so reviewers can see the
187+
// source alongside its printed ASTs.
188+
if update_mode {
189+
case.input = swift_input.clone();
190+
} else if output_path.exists() && case.input.trim() != swift_input {
191+
failures.push(format!(
192+
"Test case copy out of date in {}; rerun update-corpus to regenerate from {}",
193+
output_path.display(),
194+
swift_path.display()
195+
));
196+
}
197+
// Ensure the AST passes below operate on the source of truth.
198+
let case_input = swift_input.clone();
199+
200+
match dump_raw_parse(&lang, &case_input) {
228201
Err(e) => {
229202
failures.push(format!(
230-
"Raw parse failed for {} in {}: {}",
231-
case.name,
232-
corpus_path.display(),
203+
"Raw parse failed in {}: {}",
204+
output_path.display(),
233205
e
234206
));
235207
}
236208
Ok(actual_raw) => {
237209
if update_mode {
238210
case.raw = actual_raw.trim().to_string();
239-
} else if case.raw.trim() != actual_raw.trim() {
211+
} else if output_path.exists() && case.raw.trim() != actual_raw.trim() {
240212
failures.push(format!(
241-
"Raw parse mismatch in {}: \"{}\"\nEXPECTED:\n\n{}\n\nACTUAL:\n\n{}",
242-
corpus_path.display(),
243-
case.name,
213+
"Raw parse mismatch in {}:\nEXPECTED:\n\n{}\n\nACTUAL:\n\n{}",
214+
output_path.display(),
244215
case.raw.trim(),
245216
actual_raw.trim()
246217
));
247218
}
248219
}
249220
}
250221

251-
match run_desugaring(&lang, &case.input) {
222+
match run_desugaring(&lang, &case_input) {
252223
Err(e) => {
253224
failures.push(format!(
254-
"Desugaring failed for {} in {}: {}",
255-
case.name,
256-
corpus_path.display(),
225+
"Desugaring failed in {}: {}",
226+
output_path.display(),
257227
e
258228
));
259229
}
260230
Ok(actual) => {
261231
let actual_dump = dump_ast_with_type_errors(
262232
&actual,
263233
actual.get_root(),
264-
&case.input,
234+
&case_input,
265235
&output_schema,
266236
);
267237
if update_mode {
268238
case.expected = actual_dump.trim().to_string();
269-
} else if case.expected.trim() != actual_dump.trim() {
239+
} else if output_path.exists()
240+
&& case.expected.trim() != actual_dump.trim()
241+
{
270242
failures.push(format!(
271-
"Test failed in {}: \"{}\"\nEXPECTED:\n\n{}\n\nACTUAL:\n\n{}",
272-
corpus_path.display(),
273-
case.name,
243+
"Test failed in {}:\nEXPECTED:\n\n{}\n\nACTUAL:\n\n{}",
244+
output_path.display(),
274245
case.expected.trim(),
275246
actual_dump.trim()
276247
));
@@ -282,12 +253,12 @@ fn test_corpus() {
282253
assert!(failures.is_empty(), "{}", failures.join("\n\n") + "\n\n");
283254

284255
if update_mode {
285-
let updated = render_corpus(&cases);
286-
let write_result = fs::write(&corpus_path, updated);
256+
let updated = render_corpus(&case);
257+
let write_result = fs::write(&output_path, updated);
287258
assert!(
288259
write_result.is_ok(),
289260
"Failed to update corpus file {}: {}",
290-
corpus_path.display(),
261+
output_path.display(),
291262
write_result
292263
.err()
293264
.map_or_else(String::new, |e| e.to_string())

0 commit comments

Comments
 (0)