Skip to content

Commit 288c4ed

Browse files
authored
Add hand-written SQL parser with full MySQL + PostgreSQL support
Complete SQL parser implementation across 11 plans:

## Features
- **Deep parsing (Tier 1):** SELECT, INSERT, UPDATE, DELETE, SET, REPLACE
- **Compound queries:** UNION/INTERSECT/EXCEPT with precedence and nesting
- **Tier 2 classification:** All other statement types (DDL, DCL, transactions, etc.)
- **Query emitter:** AST → SQL reconstruction with round-trip support
- **Prepared statement cache:** LRU cache, parse_and_cache/execute, bindings-aware emitter
- **Query digest:** AST-based normalization + token-level fallback, FNV-1a hash, IN/VALUES collapsing
- **Both dialects:** MySQL and PostgreSQL via compile-time templating
- **Performance:** SET parse 127ns, SELECT simple 235ns, arena reset 4ns

## Stats
- 372 tests, 18 benchmarks, 0 warnings
- CI: Ubuntu 22.04/24.04 (g++ + clang++) + macOS — all green

Closes #3, #5, #6, #7, #8, #9
1 parent 93ae22c commit 288c4ed

482 files changed

Lines changed: 159927 additions & 0 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/ci.yml

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# Continuous integration: build the parser library and run its test suite on
# every push / pull request targeting main.
name: CI

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  # Linux: every OS image is built with every compiler in the matrix.
  build-and-test:
    strategy:
      matrix:
        os: [ubuntu-22.04, ubuntu-24.04]
        compiler: [g++, clang++]
    runs-on: ${{ matrix.os }}
    steps:
      - uses: actions/checkout@v4

      - name: Build and test
        env:
          CXX: ${{ matrix.compiler }}
        # Makefile.new assigns CXX itself, and an assignment inside a makefile
        # takes precedence over the environment. Passing CXX on the command
        # line is what actually selects the matrix compiler.
        run: make -f Makefile.new clean && make -f Makefile.new all CXX="$CXX"

  # macOS: single build with the Makefile's default compiler.
  macos:
    runs-on: macos-latest
    steps:
      - uses: actions/checkout@v4

      - name: Build and test
        run: make -f Makefile.new clean && make -f Makefile.new all

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,3 +42,7 @@ src/*_parser/*_lexer.yy.c
4242
src/*_parser/*_parser.output
4343
src/*_parser/*_parser.report
4444

45+
# New parser build artifacts
46+
libsqlparser.a
47+
run_tests
48+
run_bench

Makefile.new

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
# Build file for the hand-written SQL parser: static library, unit tests
# (Google Test) and micro-benchmarks (Google Benchmark).
#
# Targets:
#   all    - build the library, then build and run the tests
#   lib    - build libsqlparser.a
#   test   - build and run the unit tests
#   bench  - build and run the benchmarks
#   clean  - remove every object and binary produced above

# Use ?= so a compiler chosen by the caller's environment (e.g. CXX=clang++ in
# CI) is honoured; a plain '=' here would silently override it.
CXX ?= g++
CXXFLAGS = -std=c++17 -Wall -Wextra -g -O2
CPPFLAGS = -I./include -I./third_party/googletest/googletest/include

PROJECT_ROOT = .
SRC_DIR = $(PROJECT_ROOT)/src/sql_parser
INCLUDE_DIR = $(PROJECT_ROOT)/include/sql_parser
TEST_DIR = $(PROJECT_ROOT)/tests

# Library sources
LIB_SRCS = $(SRC_DIR)/arena.cpp $(SRC_DIR)/parser.cpp
LIB_OBJS = $(LIB_SRCS:.cpp=.o)
LIB_TARGET = $(PROJECT_ROOT)/libsqlparser.a

# Google Test library
GTEST_DIR = $(PROJECT_ROOT)/third_party/googletest/googletest
GTEST_SRC = $(GTEST_DIR)/src/gtest-all.cc
GTEST_OBJ = $(GTEST_DIR)/src/gtest-all.o
GTEST_CPPFLAGS = -I$(GTEST_DIR)/include -I$(GTEST_DIR)

# Test sources
TEST_SRCS = $(TEST_DIR)/test_main.cpp \
            $(TEST_DIR)/test_arena.cpp \
            $(TEST_DIR)/test_tokenizer.cpp \
            $(TEST_DIR)/test_classifier.cpp \
            $(TEST_DIR)/test_expression.cpp \
            $(TEST_DIR)/test_set.cpp \
            $(TEST_DIR)/test_select.cpp \
            $(TEST_DIR)/test_emitter.cpp \
            $(TEST_DIR)/test_stmt_cache.cpp \
            $(TEST_DIR)/test_insert.cpp \
            $(TEST_DIR)/test_update.cpp \
            $(TEST_DIR)/test_delete.cpp \
            $(TEST_DIR)/test_compound.cpp \
            $(TEST_DIR)/test_digest.cpp
TEST_OBJS = $(TEST_SRCS:.cpp=.o)
TEST_TARGET = $(PROJECT_ROOT)/run_tests

# Google Benchmark (built from source; its own main is excluded because
# bench/bench_main.cpp supplies one)
GBENCH_DIR = $(PROJECT_ROOT)/third_party/benchmark
GBENCH_SRCS = $(filter-out $(GBENCH_DIR)/src/benchmark_main.cc, $(wildcard $(GBENCH_DIR)/src/*.cc))
GBENCH_OBJS = $(GBENCH_SRCS:.cc=.o)
GBENCH_CPPFLAGS = -I$(GBENCH_DIR)/include -I$(GBENCH_DIR)/src -DHAVE_STD_REGEX -DHAVE_STEADY_CLOCK

BENCH_DIR = $(PROJECT_ROOT)/bench
BENCH_SRCS = $(BENCH_DIR)/bench_main.cpp $(BENCH_DIR)/bench_parser.cpp
BENCH_OBJS = $(BENCH_SRCS:.cpp=.o)
BENCH_TARGET = $(PROJECT_ROOT)/run_bench

.PHONY: all lib test bench clean

all: lib test

lib: $(LIB_TARGET)

$(LIB_TARGET): $(LIB_OBJS)
	ar rcs $@ $^
	@echo "Built $@"

$(SRC_DIR)/%.o: $(SRC_DIR)/%.cpp
	$(CXX) $(CXXFLAGS) $(CPPFLAGS) -c $< -o $@

# Google Test object
$(GTEST_OBJ): $(GTEST_SRC)
	$(CXX) $(CXXFLAGS) $(GTEST_CPPFLAGS) -c $< -o $@

# Test objects
$(TEST_DIR)/%.o: $(TEST_DIR)/%.cpp
	$(CXX) $(CXXFLAGS) $(CPPFLAGS) $(GTEST_CPPFLAGS) -c $< -o $@

# TEST_TARGET is already rooted at PROJECT_ROOT (default "./"), so it is run
# as-is; prefixing another "./" would break an absolute PROJECT_ROOT.
test: $(TEST_TARGET)
	$(TEST_TARGET)

$(TEST_TARGET): $(TEST_OBJS) $(GTEST_OBJ) $(LIB_TARGET)
	$(CXX) $(CXXFLAGS) -o $@ $(TEST_OBJS) $(GTEST_OBJ) -L$(PROJECT_ROOT) -lsqlparser -lpthread

# Benchmark objects
$(GBENCH_DIR)/src/%.o: $(GBENCH_DIR)/src/%.cc
	$(CXX) $(CXXFLAGS) $(GBENCH_CPPFLAGS) -c $< -o $@

$(BENCH_DIR)/%.o: $(BENCH_DIR)/%.cpp
	$(CXX) $(CXXFLAGS) $(CPPFLAGS) $(GBENCH_CPPFLAGS) -c $< -o $@

bench: $(BENCH_TARGET)
	$(BENCH_TARGET) --benchmark_format=console

$(BENCH_TARGET): $(BENCH_OBJS) $(GBENCH_OBJS) $(LIB_TARGET)
	$(CXX) $(CXXFLAGS) -o $@ $(BENCH_OBJS) $(GBENCH_OBJS) -L$(PROJECT_ROOT) -lsqlparser -lpthread

clean:
	rm -f $(LIB_OBJS) $(LIB_TARGET) $(TEST_OBJS) $(GTEST_OBJ) $(TEST_TARGET)
	rm -f $(BENCH_OBJS) $(GBENCH_OBJS) $(BENCH_TARGET)
	@echo "Cleaned."

bench/bench_main.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#include <benchmark/benchmark.h>
2+
3+
// Google Benchmark's entry-point macro: supplies the main() for this binary,
// which runs the benchmarks registered with BENCHMARK() in the other sources.
BENCHMARK_MAIN();

bench/bench_parser.cpp

Lines changed: 239 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,239 @@
1+
#include <cstring>

#include <benchmark/benchmark.h>

#include "sql_parser/parser.h"
#include "sql_parser/emitter.h"
4+
5+
using namespace sql_parser;
6+
7+
// ========== Tier 2: Classification ==========
8+
// Target: <100ns
9+
10+
static void BM_Classify_Insert(benchmark::State& state) {
11+
Parser<Dialect::MySQL> parser;
12+
const char* sql = "INSERT INTO users VALUES (1, 'name', 'email')";
13+
size_t len = strlen(sql);
14+
for (auto _ : state) {
15+
auto r = parser.parse(sql, len);
16+
benchmark::DoNotOptimize(r.stmt_type);
17+
}
18+
}
19+
BENCHMARK(BM_Classify_Insert);
20+
21+
static void BM_Classify_Update(benchmark::State& state) {
22+
Parser<Dialect::MySQL> parser;
23+
const char* sql = "UPDATE users SET name = 'x' WHERE id = 1";
24+
size_t len = strlen(sql);
25+
for (auto _ : state) {
26+
auto r = parser.parse(sql, len);
27+
benchmark::DoNotOptimize(r.stmt_type);
28+
}
29+
}
30+
BENCHMARK(BM_Classify_Update);
31+
32+
static void BM_Classify_Delete(benchmark::State& state) {
33+
Parser<Dialect::MySQL> parser;
34+
const char* sql = "DELETE FROM users WHERE id = 1";
35+
size_t len = strlen(sql);
36+
for (auto _ : state) {
37+
auto r = parser.parse(sql, len);
38+
benchmark::DoNotOptimize(r.stmt_type);
39+
}
40+
}
41+
BENCHMARK(BM_Classify_Delete);
42+
43+
static void BM_Classify_Begin(benchmark::State& state) {
44+
Parser<Dialect::MySQL> parser;
45+
const char* sql = "BEGIN";
46+
size_t len = strlen(sql);
47+
for (auto _ : state) {
48+
auto r = parser.parse(sql, len);
49+
benchmark::DoNotOptimize(r.stmt_type);
50+
}
51+
}
52+
BENCHMARK(BM_Classify_Begin);
53+
54+
// ========== Tier 1: SET parse ==========
55+
// Target: <300ns
56+
57+
static void BM_Set_Simple(benchmark::State& state) {
58+
Parser<Dialect::MySQL> parser;
59+
const char* sql = "SET @@session.wait_timeout = 600";
60+
size_t len = strlen(sql);
61+
for (auto _ : state) {
62+
auto r = parser.parse(sql, len);
63+
benchmark::DoNotOptimize(r.ast);
64+
}
65+
}
66+
BENCHMARK(BM_Set_Simple);
67+
68+
static void BM_Set_Names(benchmark::State& state) {
69+
Parser<Dialect::MySQL> parser;
70+
const char* sql = "SET NAMES utf8mb4 COLLATE utf8mb4_unicode_ci";
71+
size_t len = strlen(sql);
72+
for (auto _ : state) {
73+
auto r = parser.parse(sql, len);
74+
benchmark::DoNotOptimize(r.ast);
75+
}
76+
}
77+
BENCHMARK(BM_Set_Names);
78+
79+
static void BM_Set_MultiVar(benchmark::State& state) {
80+
Parser<Dialect::MySQL> parser;
81+
const char* sql = "SET autocommit = 1, wait_timeout = 28800, sql_mode = 'STRICT_TRANS_TABLES'";
82+
size_t len = strlen(sql);
83+
for (auto _ : state) {
84+
auto r = parser.parse(sql, len);
85+
benchmark::DoNotOptimize(r.ast);
86+
}
87+
}
88+
BENCHMARK(BM_Set_MultiVar);
89+
90+
static void BM_Set_FunctionRHS(benchmark::State& state) {
91+
Parser<Dialect::MySQL> parser;
92+
const char* sql = "SET sql_mode = CONCAT(@@sql_mode, ',STRICT_TRANS_TABLES')";
93+
size_t len = strlen(sql);
94+
for (auto _ : state) {
95+
auto r = parser.parse(sql, len);
96+
benchmark::DoNotOptimize(r.ast);
97+
}
98+
}
99+
BENCHMARK(BM_Set_FunctionRHS);
100+
101+
// ========== Tier 1: SELECT parse ==========
102+
// Target: <500ns simple, <2us complex
103+
104+
static void BM_Select_Simple(benchmark::State& state) {
105+
Parser<Dialect::MySQL> parser;
106+
const char* sql = "SELECT col FROM t WHERE id = 1";
107+
size_t len = strlen(sql);
108+
for (auto _ : state) {
109+
auto r = parser.parse(sql, len);
110+
benchmark::DoNotOptimize(r.ast);
111+
}
112+
}
113+
BENCHMARK(BM_Select_Simple);
114+
115+
static void BM_Select_MultiColumn(benchmark::State& state) {
116+
Parser<Dialect::MySQL> parser;
117+
const char* sql = "SELECT id, name, email, status FROM users WHERE active = 1 ORDER BY name LIMIT 100";
118+
size_t len = strlen(sql);
119+
for (auto _ : state) {
120+
auto r = parser.parse(sql, len);
121+
benchmark::DoNotOptimize(r.ast);
122+
}
123+
}
124+
BENCHMARK(BM_Select_MultiColumn);
125+
126+
static void BM_Select_Join(benchmark::State& state) {
127+
Parser<Dialect::MySQL> parser;
128+
const char* sql = "SELECT u.id, o.total FROM users u JOIN orders o ON u.id = o.user_id WHERE o.status = 'active'";
129+
size_t len = strlen(sql);
130+
for (auto _ : state) {
131+
auto r = parser.parse(sql, len);
132+
benchmark::DoNotOptimize(r.ast);
133+
}
134+
}
135+
BENCHMARK(BM_Select_Join);
136+
137+
static void BM_Select_Complex(benchmark::State& state) {
138+
Parser<Dialect::MySQL> parser;
139+
const char* sql =
140+
"SELECT u.id, u.name, COUNT(o.id) AS order_count "
141+
"FROM users u "
142+
"LEFT JOIN orders o ON u.id = o.user_id "
143+
"WHERE u.status = 'active' AND u.created_at > '2024-01-01' "
144+
"GROUP BY u.id, u.name "
145+
"HAVING COUNT(o.id) > 5 "
146+
"ORDER BY order_count DESC "
147+
"LIMIT 50 OFFSET 10";
148+
size_t len = strlen(sql);
149+
for (auto _ : state) {
150+
auto r = parser.parse(sql, len);
151+
benchmark::DoNotOptimize(r.ast);
152+
}
153+
}
154+
BENCHMARK(BM_Select_Complex);
155+
156+
static void BM_Select_MultiJoin(benchmark::State& state) {
157+
Parser<Dialect::MySQL> parser;
158+
const char* sql =
159+
"SELECT a.id, b.name, c.value, d.total "
160+
"FROM t1 a "
161+
"JOIN t2 b ON a.id = b.a_id "
162+
"LEFT JOIN t3 c ON b.id = c.b_id "
163+
"JOIN t4 d ON c.id = d.c_id "
164+
"WHERE a.status = 1 AND d.total > 100 "
165+
"ORDER BY d.total DESC "
166+
"LIMIT 20";
167+
size_t len = strlen(sql);
168+
for (auto _ : state) {
169+
auto r = parser.parse(sql, len);
170+
benchmark::DoNotOptimize(r.ast);
171+
}
172+
}
173+
BENCHMARK(BM_Select_MultiJoin);
174+
175+
// ========== Query Reconstruction (round-trip) ==========
176+
// Target: <500ns
177+
178+
static void BM_Emit_SetSimple(benchmark::State& state) {
179+
Parser<Dialect::MySQL> parser;
180+
const char* sql = "SET autocommit = 1";
181+
size_t len = strlen(sql);
182+
for (auto _ : state) {
183+
auto r = parser.parse(sql, len);
184+
Emitter<Dialect::MySQL> emitter(parser.arena());
185+
emitter.emit(r.ast);
186+
benchmark::DoNotOptimize(emitter.result());
187+
}
188+
}
189+
BENCHMARK(BM_Emit_SetSimple);
190+
191+
static void BM_Emit_SelectSimple(benchmark::State& state) {
192+
Parser<Dialect::MySQL> parser;
193+
const char* sql = "SELECT * FROM users WHERE id = 1";
194+
size_t len = strlen(sql);
195+
for (auto _ : state) {
196+
auto r = parser.parse(sql, len);
197+
Emitter<Dialect::MySQL> emitter(parser.arena());
198+
emitter.emit(r.ast);
199+
benchmark::DoNotOptimize(emitter.result());
200+
}
201+
}
202+
BENCHMARK(BM_Emit_SelectSimple);
203+
204+
// ========== Arena reset ==========
205+
// Target: <10ns
206+
207+
static void BM_ArenaReset(benchmark::State& state) {
208+
Arena arena(65536);
209+
for (auto _ : state) {
210+
arena.allocate(256); // allocate something
211+
arena.reset();
212+
benchmark::DoNotOptimize(arena.bytes_used());
213+
}
214+
}
215+
BENCHMARK(BM_ArenaReset);
216+
217+
// ========== PostgreSQL ==========
218+
219+
static void BM_PgSQL_Select_Simple(benchmark::State& state) {
220+
Parser<Dialect::PostgreSQL> parser;
221+
const char* sql = "SELECT col FROM t WHERE id = 1";
222+
size_t len = strlen(sql);
223+
for (auto _ : state) {
224+
auto r = parser.parse(sql, len);
225+
benchmark::DoNotOptimize(r.ast);
226+
}
227+
}
228+
BENCHMARK(BM_PgSQL_Select_Simple);
229+
230+
static void BM_PgSQL_Set_Simple(benchmark::State& state) {
231+
Parser<Dialect::PostgreSQL> parser;
232+
const char* sql = "SET work_mem = '256MB'";
233+
size_t len = strlen(sql);
234+
for (auto _ : state) {
235+
auto r = parser.parse(sql, len);
236+
benchmark::DoNotOptimize(r.ast);
237+
}
238+
}
239+
BENCHMARK(BM_PgSQL_Set_Simple);

0 commit comments

Comments
 (0)