Skip to content

Commit 79f337e

Browse files
committed
chore: update CLAUDE.md test counts (1152), add corpus_test.cpp, fix .gitignore
1 parent 427da91 commit 79f337e

3 files changed

Lines changed: 223 additions & 3 deletions

File tree

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@
3535
libsqlparser.a
3636
sqlengine
3737
run_tests
38+
run_tests_debug
39+
corpus_test
3840
run_bench
3941
run_bench_compare
4042
bench/sqlparser_rs_bench/target/

CLAUDE.md

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,13 @@ High-performance hand-written recursive descent SQL parser and query engine for
99
## Build Commands
1010

1111
```bash
12-
make all # Build library + run all 871 tests
12+
make all # Build library + run all 1152 tests
1313
make lib # Build only libsqlparser.a
1414
make test # Build + run tests
1515
make bench # Build + run benchmarks
1616
make bench-compare # Run comparison vs libpg_query (requires libpg_query built)
1717
make build-corpus-test # Build corpus test harness
18+
make build-sqlengine # Build interactive SQL engine CLI
1819
make clean # Remove all build artifacts
1920
```
2021

@@ -141,7 +142,7 @@ Everything is in `namespace sql_engine`. Templates are parameterized on `Dialect
141142
142143
## Tests
143144
144-
Google Test. 1,008 tests across 34 test files. Validated against 86K+ external queries (PostgreSQL regression, MySQL MTR, CockroachDB, Vitess, TiDB, sqlparser-rs, SQLGlot).
145+
Google Test. 1,152 tests across 48 test files. Validated against 86K+ external queries (PostgreSQL regression, MySQL MTR, CockroachDB, Vitess, TiDB, sqlparser-rs, SQLGlot).
145146
146147
Run a single test: `./run_tests --gtest_filter="*SetTest*"`
147148
@@ -151,7 +152,10 @@ Run a single test: `./run_tests --gtest_filter="*SetTest*"`
151152
`test_tokenizer.cpp`, `test_classifier.cpp`, `test_expression.cpp`, `test_select.cpp`, `test_insert.cpp`, `test_update.cpp`, `test_delete.cpp`, `test_set.cpp`, `test_compound.cpp`, `test_emitter.cpp`, `test_digest.cpp`, `test_stmt_cache.cpp`, `test_arena.cpp`, `test_misc_stmts.cpp`
152153
153154
**Engine:**
154-
`test_value.cpp`, `test_row.cpp`, `test_coercion.cpp`, `test_null_semantics.cpp`, `test_like.cpp`, `test_expression_eval.cpp`, `test_eval_integration.cpp`, `test_catalog.cpp`, `test_registry.cpp`, `test_arithmetic.cpp`, `test_comparison.cpp`, `test_cast.cpp`, `test_string_funcs.cpp`, `test_operators.cpp`, `test_plan_builder.cpp`, `test_plan_executor.cpp`
155+
`test_value.cpp`, `test_row.cpp`, `test_coercion.cpp`, `test_null_semantics.cpp`, `test_like.cpp`, `test_expression_eval.cpp`, `test_eval_integration.cpp`, `test_catalog.cpp`, `test_registry.cpp`, `test_arithmetic.cpp`, `test_comparison.cpp`, `test_cast.cpp`, `test_string_funcs.cpp`, `test_operators.cpp`, `test_plan_builder.cpp`, `test_plan_executor.cpp`, `test_result_set.cpp`, `test_datetime_format.cpp`, `test_datetime_funcs.cpp`, `test_cte.cpp`, `test_window.cpp`, `test_subquery.cpp`, `test_optimizer.cpp`
156+
157+
**Distributed / Integration:**
158+
`test_dml.cpp`, `test_distributed_dml.cpp`, `test_distributed_planner.cpp`, `test_distributed_real.cpp`, `test_distributed_txn.cpp`, `test_mysql_executor.cpp`, `test_pgsql_executor.cpp`, `test_session.cpp`, `test_local_txn.cpp`, `test_single_backend_txn.cpp`
155159
156160
## Benchmarks
157161

tests/corpus_test.cpp

Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,214 @@
1+
#include "sql_parser/parser.h"
2+
#include <iostream>
3+
#include <fstream>
4+
#include <string>
5+
#include <vector>
6+
#include <algorithm>
7+
#include <map>
8+
9+
using namespace sql_parser;
10+
11+
// Returns a copy of `s` without leading or trailing blanks, where a blank is
// a space, tab, carriage return or newline. A string made up entirely of
// blanks (or an empty string) yields an empty result.
static std::string trim(const std::string& s) {
    static const char* const kBlanks = " \t\r\n";

    const std::string::size_type first = s.find_first_not_of(kBlanks);
    if (first != std::string::npos) {
        // A non-blank character exists, so the reverse search cannot fail.
        const std::string::size_type last = s.find_last_not_of(kBlanks);
        return std::string(s, first, last - first + 1);
    }
    return std::string();
}
17+
18+
// Buckets a SQL statement by its leading keyword so that parse failures can
// be grouped in the report.
//
// Only the first 80 characters of `sql` are examined. They are upper-cased
// (ASCII letters only), leading '(' / space / tab characters are skipped, and
// the first run of [A-Z0-9_] characters is taken as the statement keyword.
// The keyword must match a table entry exactly, so:
//   * a keyword is recognised whether it is followed by a space, a tab,
//     punctuation, or the end of the string (e.g. a bare "VACUUM");
//   * a longer identifier that merely starts with a keyword (e.g.
//     "ENDPOINT") is not mistaken for that keyword.
//
// Returns the category name, or "OTHER(<first 20 upper-cased chars>)" when
// the keyword is not in the table.
static std::string classify_error(const std::string& sql) {
    struct KeywordCategory {
        const char* keyword;
        const char* category;
    };
    static const KeywordCategory kCategories[] = {
        {"WITH", "WITH/CTE"},
        {"CREATE", "CREATE"},
        {"ALTER", "ALTER"},
        {"DROP", "DROP"},
        {"GRANT", "GRANT/REVOKE"},
        {"REVOKE", "GRANT/REVOKE"},
        {"SELECT", "SELECT"},
        {"INSERT", "INSERT"},
        {"UPDATE", "UPDATE"},
        {"DELETE", "DELETE"},
        {"SET", "SET"},
        {"SHOW", "SHOW"},
        {"BEGIN", "TRANSACTION"},
        {"START", "TRANSACTION"},
        {"COMMIT", "TRANSACTION"},
        {"ROLLBACK", "TRANSACTION"},
        {"EXPLAIN", "EXPLAIN"},
        {"DESCRIBE", "EXPLAIN"},
        {"TRUNCATE", "TRUNCATE"},
        {"LOCK", "LOCK"},
        {"UNLOCK", "LOCK"},
        {"PREPARE", "PREPARED_STMT"},
        {"EXECUTE", "PREPARED_STMT"},
        {"DEALLOCATE", "PREPARED_STMT"},
        {"USE", "USE"},
        {"LOAD", "LOAD"},
        {"CALL", "CALL"},
        {"DO", "DO"},
        {"REPLACE", "REPLACE"},
        {"MERGE", "MERGE"},
        {"COPY", "COPY"},
        {"VACUUM", "MAINTENANCE"},
        {"ANALYZE", "MAINTENANCE"},
        {"ANALYSE", "MAINTENANCE"},
        {"COMMENT", "COMMENT"},
        {"IMPORT", "IMPORT/EXPORT"},
        {"EXPORT", "IMPORT/EXPORT"},
        {"REINDEX", "REINDEX"},
        {"CLUSTER", "CLUSTER"},
        {"REFRESH", "REFRESH"},
        {"DISCARD", "DISCARD"},
        {"REASSIGN", "REASSIGN"},
        {"SECURITY", "SECURITY"},
        {"VALUES", "VALUES_STMT"},
        {"TABLE", "TABLE_STMT"},
        {"FETCH", "CURSOR"},
        {"MOVE", "CURSOR"},
        {"DECLARE", "CURSOR"},
        {"CLOSE", "CURSOR"},
        {"LISTEN", "LISTEN/NOTIFY"},
        {"NOTIFY", "LISTEN/NOTIFY"},
        {"UNLISTEN", "LISTEN/NOTIFY"},
        {"RELEASE", "SAVEPOINT"},
        {"SAVEPOINT", "SAVEPOINT"},
        {"RESET", "RESET"},
        {"ABORT", "ABORT"},
        {"END", "END"},
    };

    // Upper-case a bounded prefix; ASCII-only on purpose so the result does
    // not depend on the process locale.
    const size_t limit = std::min(sql.size(), static_cast<size_t>(80));
    std::string upper;
    upper.reserve(limit);
    for (size_t i = 0; i < limit; ++i) {
        char c = sql[i];
        if (c >= 'a' && c <= 'z') c = static_cast<char>(c - 'a' + 'A');
        upper += c;
    }

    const auto is_word_char = [](char c) {
        return (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_';
    };

    // Skip opening parentheses and blanks so "(SELECT ..." and "( SELECT ..."
    // are both attributed to SELECT.
    size_t begin = 0;
    while (begin < upper.size() &&
           (upper[begin] == '(' || upper[begin] == ' ' || upper[begin] == '\t')) {
        ++begin;
    }
    size_t end = begin;
    while (end < upper.size() && is_word_char(upper[end])) ++end;
    const std::string keyword = upper.substr(begin, end - begin);

    for (const KeywordCategory& entry : kCategories) {
        if (keyword == entry.keyword) return entry.category;
    }

    return "OTHER(" + upper.substr(0, std::min(upper.size(), static_cast<size_t>(20))) + ")";
}
74+
75+
int main(int argc, char** argv) {
76+
if (argc < 2) {
77+
std::cerr << "Usage: corpus_test <mysql|pgsql> [file...]\n";
78+
std::cerr << " Reads SQL queries (one per line) from files or stdin.\n";
79+
std::cerr << " Lines starting with -- or # are skipped.\n";
80+
return 1;
81+
}
82+
83+
bool is_mysql = std::string(argv[1]) == "mysql";
84+
85+
std::vector<std::istream*> inputs;
86+
std::vector<std::ifstream*> owned;
87+
if (argc > 2) {
88+
for (int i = 2; i < argc; i++) {
89+
auto* f = new std::ifstream(argv[i]);
90+
if (!f->is_open()) {
91+
std::cerr << "Cannot open: " << argv[i] << "\n";
92+
delete f;
93+
continue;
94+
}
95+
inputs.push_back(f);
96+
owned.push_back(f);
97+
}
98+
} else {
99+
inputs.push_back(&std::cin);
100+
}
101+
102+
int total = 0, ok = 0, partial = 0, errors = 0;
103+
std::map<std::string, int> error_categories;
104+
std::map<std::string, int> partial_categories;
105+
std::vector<std::pair<std::string, std::string>> error_examples; // (sql, error_info)
106+
std::vector<std::pair<std::string, std::string>> partial_examples; // (sql, category)
107+
108+
auto run_mysql = [&](const std::string& sql) {
109+
Parser<Dialect::MySQL> parser;
110+
return parser.parse(sql.c_str(), sql.size());
111+
};
112+
auto run_pgsql = [&](const std::string& sql) {
113+
Parser<Dialect::PostgreSQL> parser;
114+
return parser.parse(sql.c_str(), sql.size());
115+
};
116+
117+
for (auto* in : inputs) {
118+
std::string line;
119+
while (std::getline(*in, line)) {
120+
std::string sql = trim(line);
121+
if (sql.empty()) continue;
122+
if (sql[0] == '#') continue;
123+
if (sql.size() >= 2 && sql[0] == '-' && sql[1] == '-') continue;
124+
125+
// Strip trailing semicolons (our parser handles them, but some test files have extra)
126+
while (!sql.empty() && sql.back() == ';') sql.pop_back();
127+
sql = trim(sql);
128+
if (sql.empty()) continue;
129+
130+
total++;
131+
132+
ParseResult result = is_mysql ? run_mysql(sql) : run_pgsql(sql);
133+
134+
switch (result.status) {
135+
case ParseResult::OK:
136+
ok++;
137+
break;
138+
case ParseResult::PARTIAL: {
139+
partial++;
140+
std::string cat = classify_error(sql);
141+
partial_categories[cat]++;
142+
if (partial_examples.size() < 50) {
143+
partial_examples.push_back({sql, cat});
144+
}
145+
break;
146+
}
147+
case ParseResult::ERROR: {
148+
errors++;
149+
std::string cat = classify_error(sql);
150+
error_categories[cat]++;
151+
if (error_examples.size() < 50) {
152+
std::string info = "offset=" + std::to_string(result.error.offset);
153+
if (result.error.message.ptr && result.error.message.len > 0) {
154+
info += " msg=" + std::string(result.error.message.ptr, result.error.message.len);
155+
}
156+
error_examples.push_back({sql, info});
157+
}
158+
break;
159+
}
160+
}
161+
}
162+
}
163+
164+
// Report
165+
std::cout << "\n=== Corpus Test Results ===\n";
166+
std::cout << "Dialect: " << (is_mysql ? "MySQL" : "PostgreSQL") << "\n";
167+
std::cout << "Total queries: " << total << "\n";
168+
std::cout << "OK: " << ok << " (" << (total > 0 ? 100.0 * ok / total : 0) << "%)\n";
169+
std::cout << "PARTIAL: " << partial << " (" << (total > 0 ? 100.0 * partial / total : 0) << "%)\n";
170+
std::cout << "ERROR: " << errors << " (" << (total > 0 ? 100.0 * errors / total : 0) << "%)\n";
171+
std::cout << "Success rate (OK+PARTIAL): " << (total > 0 ? 100.0 * (ok + partial) / total : 0) << "%\n";
172+
173+
if (!error_categories.empty()) {
174+
std::cout << "\n--- ERROR categories ---\n";
175+
// Sort by count descending
176+
std::vector<std::pair<std::string, int>> sorted(error_categories.begin(), error_categories.end());
177+
std::sort(sorted.begin(), sorted.end(), [](auto& a, auto& b) { return a.second > b.second; });
178+
for (auto& [cat, cnt] : sorted) {
179+
std::cout << " " << cat << ": " << cnt << "\n";
180+
}
181+
}
182+
183+
if (!partial_categories.empty()) {
184+
std::cout << "\n--- PARTIAL categories ---\n";
185+
std::vector<std::pair<std::string, int>> sorted(partial_categories.begin(), partial_categories.end());
186+
std::sort(sorted.begin(), sorted.end(), [](auto& a, auto& b) { return a.second > b.second; });
187+
for (auto& [cat, cnt] : sorted) {
188+
std::cout << " " << cat << ": " << cnt << "\n";
189+
}
190+
}
191+
192+
if (!error_examples.empty()) {
193+
std::cout << "\n--- Top ERROR examples (up to 10) ---\n";
194+
for (size_t i = 0; i < std::min(error_examples.size(), (size_t)10); i++) {
195+
std::string display_sql = error_examples[i].first;
196+
if (display_sql.size() > 120) display_sql = display_sql.substr(0, 120) + "...";
197+
std::cout << " [" << (i+1) << "] " << error_examples[i].second << "\n";
198+
std::cout << " SQL: " << display_sql << "\n";
199+
}
200+
}
201+
202+
if (!partial_examples.empty()) {
203+
std::cout << "\n--- Top PARTIAL examples (up to 10) ---\n";
204+
for (size_t i = 0; i < std::min(partial_examples.size(), (size_t)10); i++) {
205+
std::string display_sql = partial_examples[i].first;
206+
if (display_sql.size() > 120) display_sql = display_sql.substr(0, 120) + "...";
207+
std::cout << " [" << (i+1) << "] category=" << partial_examples[i].second << "\n";
208+
std::cout << " SQL: " << display_sql << "\n";
209+
}
210+
}
211+
212+
for (auto* f : owned) delete f;
213+
return 0;
214+
}

0 commit comments

Comments
 (0)