Skip to content

Commit 5b3ee4f

Browse files
committed
feat: add benchmark automation, corpus testing CI, and report generation
- scripts/run_benchmarks.sh: builds release (-O3), runs Google Benchmark, downloads and tests against 6 SQL corpora, and generates a markdown report
- scripts/publish_report.sh: wrapper for the physical server to run and save reports
- CI: add benchmark job (uploads JSON artifact) and corpus-test job
- Makefile.new: add build-corpus-test target and add the corpus_test binary to clean
- docs/benchmarks/latest.md: initial benchmark report
1 parent b63b194 commit 5b3ee4f

6 files changed

Lines changed: 340 additions & 2 deletions

File tree

.github/workflows/ci.yml

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,3 +28,45 @@ jobs:
2828

2929
- name: Build and test
3030
run: make -f Makefile.new clean && make -f Makefile.new all
31+
32+
benchmark:
33+
runs-on: ubuntu-24.04
34+
steps:
35+
- uses: actions/checkout@v4
36+
37+
- name: Build release and run benchmarks
38+
run: |
39+
sed 's/-g -O2/-O3/' Makefile.new > /tmp/Makefile.release
40+
make -f /tmp/Makefile.release clean
41+
make -f /tmp/Makefile.release lib
42+
make -f /tmp/Makefile.release test
43+
make -f /tmp/Makefile.release bench  # not piped through tail: the default Actions shell is bash -e without pipefail, so a pipe would hide a failing benchmark run
44+
./run_bench --benchmark_format=json > benchmark_results.json  # stderr stays visible so a crash can be diagnosed from the job log
45+
46+
- name: Upload benchmark results
47+
uses: actions/upload-artifact@v4
48+
with:
49+
name: benchmark-results
50+
path: benchmark_results.json
51+
52+
corpus-test:
53+
runs-on: ubuntu-24.04
54+
steps:
55+
- uses: actions/checkout@v4
56+
57+
- name: Build
58+
run: make -f Makefile.new all && make -f Makefile.new build-corpus-test
59+
60+
- name: Download corpora and test
61+
run: |
62+
mkdir -p /tmp/sql_corpora
63+
64+
# SQLGlot (easiest, fastest)
65+
git clone --depth 1 -q https://github.com/tobymao/sqlglot.git /tmp/sql_corpora/sqlglot
66+
cat /tmp/sql_corpora/sqlglot/tests/fixtures/identity.sql | ./corpus_test mysql
67+
68+
# TPC-H
69+
git clone --depth 1 -q https://github.com/tvondra/pg_tpch.git /tmp/sql_corpora/pg_tpch
70+
for f in /tmp/sql_corpora/pg_tpch/queries/*.sql; do
71+
cat "$f" | sed 's/--.*$//' | tr '\n' ' ' | sed 's/;/;\n/g' | grep -v '^\s*$'
72+
done | ./corpus_test pgsql

Makefile.new

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,13 @@ BENCH_SRCS = $(BENCH_DIR)/bench_main.cpp $(BENCH_DIR)/bench_parser.cpp
4848
BENCH_OBJS = $(BENCH_SRCS:.cpp=.o)
4949
BENCH_TARGET = $(PROJECT_ROOT)/run_bench
5050

51-
.PHONY: all lib test bench clean
51+
# Corpus test
52+
CORPUS_TEST_SRC = $(TEST_DIR)/corpus_test.cpp
53+
CORPUS_TEST_TARGET = $(PROJECT_ROOT)/corpus_test
54+
# build-corpus-test is declared before `all`; pin the default goal so a bare `make` still builds `all`.
55+
.DEFAULT_GOAL := all
56+
.PHONY: all lib test bench build-corpus-test clean
57+
build-corpus-test: $(CORPUS_TEST_TARGET)
5258

5359
all: lib test
5460

@@ -88,7 +94,10 @@ bench: $(BENCH_TARGET)
8894
$(BENCH_TARGET): $(BENCH_OBJS) $(GBENCH_OBJS) $(LIB_TARGET)
8995
$(CXX) $(CXXFLAGS) -o $@ $(BENCH_OBJS) $(GBENCH_OBJS) -L$(PROJECT_ROOT) -lsqlparser -lpthread
9096

97+
$(CORPUS_TEST_TARGET): $(CORPUS_TEST_SRC) $(LIB_TARGET)
98+
$(CXX) $(CXXFLAGS) $(CPPFLAGS) -o $@ $< -L$(PROJECT_ROOT) -lsqlparser
99+
91100
clean:
92101
rm -f $(LIB_OBJS) $(LIB_TARGET) $(TEST_OBJS) $(GTEST_OBJ) $(TEST_TARGET)
93-
rm -f $(BENCH_OBJS) $(GBENCH_OBJS) $(BENCH_TARGET)
102+
rm -f $(BENCH_OBJS) $(GBENCH_OBJS) $(BENCH_TARGET) $(CORPUS_TEST_TARGET)
94103
@echo "Cleaned."

docs/benchmarks/.gitkeep

Whitespace-only changes.

docs/benchmarks/latest.md

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# SQL Parser Performance Report
2+
3+
**Date:** 2026-03-24 17:38 UTC
4+
**Host:** ci-big6-202511.vm
5+
**CPU:** AMD Ryzen 9 5950X 16-Core Processor
6+
**OS:** Linux 6.17.0-14-generic
7+
**Compiler:** g++ (Ubuntu 13.3.0-6ubuntu2~24.04.1) 13.3.0
8+
**Git:** main @ b63b194
9+
**Unit tests:** 430 passing
10+
11+
---
12+
13+
## Benchmark Results (Release -O3)
14+
15+
| Operation | Latency | Target | Status |
16+
|---|---|---|---|
17+
| BM_Classify_Insert | 207 ns | <500ns | MET |
18+
| BM_Classify_Update | 240 ns | <500ns | MET |
19+
| BM_Classify_Delete | 185 ns | <500ns | MET |
20+
| BM_Classify_Begin | 36.0 ns | <100ns | MET |
21+
| BM_Set_Simple | 128 ns | <300ns | MET |
22+
| BM_Set_Names | 127 ns | <300ns | MET |
23+
| BM_Set_MultiVar | 254 ns | <300ns | MET |
24+
| BM_Set_FunctionRHS | 219 ns | <300ns | MET |
25+
| BM_Select_Simple | 229 ns | <500ns | MET |
26+
| BM_Select_MultiColumn | 467 ns | <500ns | MET |
27+
| BM_Select_Join | 599 ns | <2us | MET |
28+
| BM_Select_Complex | 1405 ns | <2us | MET |
29+
| BM_Select_MultiJoin | 1446 ns | <2us | MET |
30+
| BM_Emit_SetSimple | 139 ns | <500ns | MET |
31+
| BM_Emit_SelectSimple | 258 ns | <500ns | MET |
32+
| BM_ArenaReset | 3.76 ns | <10ns | MET |
33+
| BM_PgSQL_Select_Simple | 221 ns |||
34+
| BM_PgSQL_Set_Simple | 95.8 ns |||
35+
36+
---
37+
38+
## Corpus Test Results
39+
40+
| Corpus | Dialect | Queries | OK | PARTIAL | ERROR |
41+
|---|---|---|---|---|---|
42+
| PostgreSQL regression | pgsql | 55562 | 55342 (99.6202%) | 204 | 7 |
43+
| SQLGlot | mysql | 954 | 941 (98.6373%) | 13 | 0 |
44+
| CockroachDB | pgsql | 5000 | 4998 (99.96%) | 2 | 0 |
45+
| TPC-H | pgsql | n/a | n/a | n/a | n/a |
46+
| Vitess | mysql | 2000 | 1996 (99.8%) | 4 | 0 |
47+
| TiDB | mysql | 3000 | 2991 (99.7%) | 9 | 0 |
48+
49+
50+
---
51+
52+
*Generated by `scripts/run_benchmarks.sh`*

scripts/publish_report.sh

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
#!/bin/bash
2+
# publish_report.sh — Run full benchmarks on physical server, commit report
3+
set -e
4+
5+
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
6+
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
7+
REPORT_DIR="$PROJECT_DIR/docs/benchmarks"
8+
9+
cd "$PROJECT_DIR"
10+
git pull --ff-only
11+
12+
mkdir -p "$REPORT_DIR"
13+
14+
# Generate report
15+
"$SCRIPT_DIR/run_benchmarks.sh" "$REPORT_DIR/latest.md"
16+
17+
# Also save a dated copy
18+
DATE=$(date +%Y-%m-%d)
19+
cp "$REPORT_DIR/latest.md" "$REPORT_DIR/report-$DATE.md"
20+
21+
echo ""
22+
echo "Reports saved to:"
23+
echo " $REPORT_DIR/latest.md"
24+
echo " $REPORT_DIR/report-$DATE.md"
25+
echo ""
26+
echo "To publish: git add docs/benchmarks/ && git commit -m 'bench: update performance report' && git push"

scripts/run_benchmarks.sh

Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
#!/bin/bash
2+
# run_benchmarks.sh — Build release, run benchmarks, generate report
3+
set -e
4+
5+
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
6+
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
7+
REPORT_FILE="${1:-benchmark_report.md}"
8+
CORPUS_DIR="/tmp/sql_corpora"
9+
10+
cd "$PROJECT_DIR"
11+
12+
echo "=== Building release (-O3) ==="
13+
# Create a release makefile (copy Makefile.new, swap -g -O2 for -O3)
14+
sed 's/-g -O2/-O3/' Makefile.new > /tmp/Makefile.release
15+
make -f /tmp/Makefile.release clean >/dev/null 2>&1
16+
make -f /tmp/Makefile.release lib >/dev/null 2>&1
17+
make -f /tmp/Makefile.release test 2>&1 | tail -1; [ "${PIPESTATUS[0]}" -eq 0 ]  # the pipeline status is tail's, so re-check make's own exit code; set -e then aborts on failing tests
18+
make -f /tmp/Makefile.release build-corpus-test >/dev/null 2>&1
19+
20+
echo "=== Running benchmarks ==="
21+
make -f /tmp/Makefile.release bench 2>&1 | grep "^BM_" > /tmp/bench_results.txt
22+
23+
# Also run with JSON output for machine parsing
24+
./run_bench --benchmark_format=json > /tmp/bench_results.json 2>/dev/null || true
25+
26+
echo "=== Downloading corpus data (if needed) ==="
27+
# Download corpora if not already present
28+
mkdir -p "$CORPUS_DIR"
29+
30+
if [ ! -d "$CORPUS_DIR/sqlglot" ]; then
31+
echo "Downloading SQLGlot..."
32+
git clone --depth 1 -q https://github.com/tobymao/sqlglot.git "$CORPUS_DIR/sqlglot"
33+
fi
34+
35+
if [ ! -d "$CORPUS_DIR/cockroach" ]; then
36+
echo "Downloading CockroachDB testdata..."
37+
git clone --depth 1 --sparse -q https://github.com/cockroachdb/cockroach.git "$CORPUS_DIR/cockroach"
38+
git -C "$CORPUS_DIR/cockroach" sparse-checkout set pkg/sql/parser/testdata 2>/dev/null || echo "warning: sparse-checkout failed for cockroach" >&2  # git -C keeps the script in PROJECT_DIR even when the command fails
39+
fi
40+
41+
if [ ! -d "$CORPUS_DIR/pg_tpch" ]; then
42+
echo "Downloading TPC-H..."
43+
git clone --depth 1 -q https://github.com/tvondra/pg_tpch.git "$CORPUS_DIR/pg_tpch"
44+
fi
45+
46+
if [ ! -d "$CORPUS_DIR/postgres" ]; then
47+
echo "Downloading PostgreSQL regression suite..."
48+
git clone --depth 1 --sparse -q https://github.com/postgres/postgres.git "$CORPUS_DIR/postgres"
49+
git -C "$CORPUS_DIR/postgres" sparse-checkout set src/test/regress/sql 2>/dev/null || echo "warning: sparse-checkout failed for postgres" >&2  # git -C keeps the script in PROJECT_DIR even when the command fails
50+
fi
51+
52+
if [ ! -d "$CORPUS_DIR/vitess" ]; then
53+
echo "Downloading Vitess..."
54+
git clone --depth 1 --sparse -q https://github.com/vitessio/vitess.git "$CORPUS_DIR/vitess"
55+
git -C "$CORPUS_DIR/vitess" sparse-checkout set go/vt/sqlparser 2>/dev/null || echo "warning: sparse-checkout failed for vitess" >&2  # git -C keeps the script in PROJECT_DIR even when the command fails
56+
fi
57+
58+
if [ ! -d "$CORPUS_DIR/tidb" ]; then
59+
echo "Downloading TiDB parser..."
60+
git clone --depth 1 --sparse -q https://github.com/pingcap/tidb.git "$CORPUS_DIR/tidb"
61+
git -C "$CORPUS_DIR/tidb" sparse-checkout set pkg/parser 2>/dev/null || echo "warning: sparse-checkout failed for tidb" >&2  # git -C keeps the script in PROJECT_DIR even when the command fails
62+
fi
63+
64+
echo "=== Preparing corpus files ==="
65+
66+
# SQLGlot - one query per line
67+
cat "$CORPUS_DIR/sqlglot/tests/fixtures/identity.sql" \
68+
"$CORPUS_DIR/sqlglot/tests/fixtures/tpch.sql" \
69+
"$CORPUS_DIR/sqlglot/tests/fixtures/tpcds.sql" \
70+
"$CORPUS_DIR/sqlglot/tests/fixtures/optimizer/tpch.sql" 2>/dev/null | \
71+
grep -v '^$' | grep -v '^--' > /tmp/corpus_sqlglot.sql 2>/dev/null || true
72+
73+
# CockroachDB - extract SQL lines
74+
grep -h "^[A-Z]" "$CORPUS_DIR/cockroach/pkg/sql/parser/testdata/"* 2>/dev/null | \
75+
grep -v "^--" | grep -v "^#" | grep -v "^$" | head -5000 > /tmp/corpus_cockroach.sql 2>/dev/null || true
76+
77+
# TPC-H
78+
cat "$CORPUS_DIR/pg_tpch/queries/"*.sql 2>/dev/null | \
79+
sed 's/--.*$//' | tr '\n' ' ' | sed 's/;/;\n/g' | \
80+
grep -v '^\s*$' | sed 's/^\s*//' > /tmp/corpus_tpch.sql 2>/dev/null || true
81+
82+
# PostgreSQL regression
83+
cat "$CORPUS_DIR/postgres/src/test/regress/sql/"*.sql 2>/dev/null | \
84+
sed 's/--.*$//' | tr '\n' ' ' | sed 's/;/;\n/g' | \
85+
grep -v '^\s*$' | sed 's/^\s*//' > /tmp/corpus_pg_regress.sql 2>/dev/null || true
86+
87+
# Vitess
88+
grep -ohP '"((?:select|SELECT|insert|INSERT|update|UPDATE|delete|DELETE|set|SET|create|CREATE|alter|ALTER|drop|DROP|explain|EXPLAIN)[^"]*)"' \
89+
"$CORPUS_DIR/vitess/go/vt/sqlparser/parse_test.go" 2>/dev/null | \
90+
sed 's/^"//' | sed 's/"$//' | head -2000 > /tmp/corpus_vitess.sql 2>/dev/null || true
91+
92+
# TiDB
93+
grep -ohP '"((?:select|SELECT|insert|INSERT|update|UPDATE|delete|DELETE|set|SET|create|CREATE)[^"]*)"' \
94+
"$CORPUS_DIR/tidb/pkg/parser/parser_test.go" 2>/dev/null | \
95+
sed 's/^"//' | sed 's/"$//' | head -3000 > /tmp/corpus_tidb.sql 2>/dev/null || true
96+
97+
echo "=== Running corpus tests ==="
98+
99+
run_corpus() {
100+
local name="$1"
101+
local dialect="$2"
102+
local file="$3"
103+
if [ -f "$file" ] && [ -s "$file" ]; then
104+
local count=$(wc -l < "$file")
105+
local result=$(./corpus_test "$dialect" < "$file" 2>/dev/null)
106+
local ok=$(echo "$result" | grep "^OK:" | sed 's/^OK:\s*//' | grep -oP '^\d+')
107+
local partial=$(echo "$result" | grep "^PARTIAL:" | sed 's/^PARTIAL:\s*//' | grep -oP '^\d+')
108+
local error=$(echo "$result" | grep "^ERROR:" | sed 's/^ERROR:\s*//' | grep -oP '^\d+')
109+
local ok_pct=$(echo "$result" | grep "^OK:" | grep -oP '\([\d.]+%' | tr -d '(')
110+
echo "| $name | $dialect | $count | ${ok:-0} (${ok_pct:-0%}) | ${partial:-0} | ${error:-0} |"
111+
fi
112+
}
113+
114+
# Collect corpus results (run_corpus prints nothing when a corpus file is missing or empty)
115+
CORPUS_RESULTS=""
116+
CORPUS_RESULTS+=$(run_corpus "PostgreSQL regression" "pgsql" "/tmp/corpus_pg_regress.sql")$'\n'
117+
CORPUS_RESULTS+=$(run_corpus "SQLGlot" "mysql" "/tmp/corpus_sqlglot.sql")$'\n'
118+
CORPUS_RESULTS+=$(run_corpus "CockroachDB" "pgsql" "/tmp/corpus_cockroach.sql")$'\n'
119+
CORPUS_RESULTS+=$(run_corpus "TPC-H" "pgsql" "/tmp/corpus_tpch.sql")$'\n'
120+
CORPUS_RESULTS+=$(run_corpus "Vitess" "mysql" "/tmp/corpus_vitess.sql")$'\n'
121+
CORPUS_RESULTS+=$(run_corpus "TiDB" "mysql" "/tmp/corpus_tidb.sql")$'\n'
122+
CORPUS_RESULTS=$(sed '/^$/d' <<< "$CORPUS_RESULTS")  # drop the empty lines left by skipped corpora; a blank line would split the markdown table
123+
echo "=== Generating report ==="
124+
125+
# Get system info
126+
HOSTNAME=$(hostname)
127+
CPU=$(lscpu 2>/dev/null | sed -n 's/^[[:space:]]*Model name:[[:space:]]*//p' | head -n 1); [ -n "$CPU" ] || CPU=$(sysctl -n machdep.cpu.brand_string 2>/dev/null || echo "Unknown")  # test for empty output: a pipeline ending in sed always exits 0, so an || fallback on it never fires
128+
OS=$(uname -sr)
129+
COMPILER=$(g++ --version | head -1)
130+
GIT_SHA=$(git rev-parse --short HEAD)
131+
GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD)
132+
DATE=$(date -u +"%Y-%m-%d %H:%M UTC")
133+
TEST_COUNT=$(make -f /tmp/Makefile.release test 2>&1 | grep "PASSED" | grep -oP '\d+' | head -1)
134+
135+
cat > "$REPORT_FILE" << REPORT
136+
# SQL Parser Performance Report
137+
138+
**Date:** $DATE
139+
**Host:** $HOSTNAME
140+
**CPU:** $CPU
141+
**OS:** $OS
142+
**Compiler:** $COMPILER
143+
**Git:** $GIT_BRANCH @ $GIT_SHA
144+
**Unit tests:** $TEST_COUNT passing
145+
146+
---
147+
148+
## Benchmark Results (Release -O3)
149+
150+
| Operation | Latency | Target | Status |
151+
|---|---|---|---|
152+
$(while IFS= read -r line; do
153+
name=$(echo "$line" | awk '{print $1}')
154+
time=$(echo "$line" | awk '{print $2}')
155+
unit=$(echo "$line" | awk '{print $3}')
156+
157+
# Map benchmark names to targets
158+
target=""
159+
case "$name" in
160+
BM_Classify_Begin) target="<100ns" ;;
161+
BM_Classify_*) target="<500ns" ;;
162+
BM_Set_Simple) target="<300ns" ;;
163+
BM_Set_Names) target="<300ns" ;;
164+
BM_Set_MultiVar) target="<300ns" ;;
165+
BM_Set_FunctionRHS) target="<300ns" ;;
166+
BM_Select_Simple) target="<500ns" ;;
167+
BM_Select_MultiColumn) target="<500ns" ;;
168+
BM_Select_Join) target="<2us" ;;
169+
BM_Select_Complex) target="<2us" ;;
170+
BM_Select_MultiJoin) target="<2us" ;;
171+
BM_Emit_*) target="<500ns" ;;
172+
BM_ArenaReset) target="<10ns" ;;
173+
BM_PgSQL_*) target="" ;;
174+
esac
175+
176+
# Determine met/not met
177+
status="MET"
178+
if [ -n "$target" ] && [ "$target" != "" ]; then
179+
target_ns=$(echo "$target" | tr -dc '0-9')
180+
if echo "$target" | grep -q "us"; then
181+
target_ns=$((target_ns * 1000))
182+
fi
183+
time_ns=$(awk -v t="$time" -v u="$unit" 'BEGIN { k = (u == "us") ? 1e3 : (u == "ms") ? 1e6 : (u == "s") ? 1e9 : 1; printf "%.0f", t * k }')  # normalise the reported time to ns using its unit column before comparing with the ns target
184+
if [ "$time_ns" -gt "$target_ns" ] 2>/dev/null; then
185+
status="MISSED"
186+
fi
187+
else
188+
status=""
189+
fi
190+
191+
echo "| $name | ${time} ${unit} | ${target:-—} | $status |"
192+
done < /tmp/bench_results.txt)
193+
194+
---
195+
196+
## Corpus Test Results
197+
198+
| Corpus | Dialect | Queries | OK | PARTIAL | ERROR |
199+
|---|---|---|---|---|---|
200+
$CORPUS_RESULTS
201+
202+
---
203+
204+
*Generated by \`scripts/run_benchmarks.sh\`*
205+
REPORT
206+
207+
echo "Report written to: $REPORT_FILE"
208+
echo ""
209+
cat "$REPORT_FILE"

0 commit comments

Comments
 (0)