Skip to content

Commit e415b16

Browse files
committed
Add query digest/normalization module (Plan 11)
Implement an AST-based and a token-level query digest that normalizes SQL queries for rules matching. Literals become `?`, IN lists collapse to `IN (?)`, VALUES rows collapse to a single row, aliases are skipped, and keywords are uppercased. Both paths produce a normalized string and a 64-bit FNV-1a hash.

- Add EmitMode::DIGEST to Emitter with modified literal/IN/VALUES/alias emission
- Create digest.h with the Digest<D> class, DigestResult, and FnvHash
- Token-level fallback works for all statement types (Tier 2 included)
- 34 new tests covering literal normalization, IN collapsing, VALUES collapsing, keyword uppercasing, hash consistency, cross-tier consistency, placeholder passthrough, and NULL preservation
1 parent 5c8f2bd commit e415b16

5 files changed

Lines changed: 643 additions & 16 deletions

File tree

Makefile.new

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@ TEST_SRCS = $(TEST_DIR)/test_main.cpp \
3131
$(TEST_DIR)/test_insert.cpp \
3232
$(TEST_DIR)/test_update.cpp \
3333
$(TEST_DIR)/test_delete.cpp \
34-
$(TEST_DIR)/test_compound.cpp
34+
$(TEST_DIR)/test_compound.cpp \
35+
$(TEST_DIR)/test_digest.cpp
3536
TEST_OBJS = $(TEST_SRCS:.cpp=.o)
3637
TEST_TARGET = $(PROJECT_ROOT)/run_tests
3738

include/sql_parser/digest.h

Lines changed: 287 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,287 @@
1+
#ifndef SQL_PARSER_DIGEST_H
2+
#define SQL_PARSER_DIGEST_H
3+
4+
#include "sql_parser/common.h"
5+
#include "sql_parser/arena.h"
6+
#include "sql_parser/tokenizer.h"
7+
#include "sql_parser/ast.h"
8+
#include "sql_parser/emitter.h"
9+
#include "sql_parser/string_builder.h"
10+
#include <cstdint>
11+
12+
namespace sql_parser {
13+
14+
// Result of normalizing a query for digest/rules matching.
struct DigestResult {
    // Normalized query text, e.g. "SELECT * FROM t WHERE id = ?".
    // Backed by the Arena given to Digest -- valid only while that arena lives.
    StringRef normalized;
    uint64_t hash; // 64-bit FNV-1a hash of `normalized`
};
18+
19+
// FNV-1a 64-bit hash -- simple, fast, no external dependency.
// Incremental: call update()/update_char() any number of times, then finish().
// All operations are constexpr, so digests of fixed strings can be computed
// at compile time.
struct FnvHash {
    static constexpr uint64_t FNV_OFFSET_BASIS = 14695981039346656037ULL;
    static constexpr uint64_t FNV_PRIME = 1099511628211ULL;

    uint64_t state = FNV_OFFSET_BASIS;

    // Fold a single byte into the running hash (FNV-1a order: xor, then multiply).
    constexpr void update_char(char c) {
        state ^= static_cast<uint64_t>(static_cast<uint8_t>(c));
        state *= FNV_PRIME;
    }

    // Fold `len` bytes into the running hash.
    // Delegates to update_char so the fold logic exists in exactly one place.
    constexpr void update(const char* data, size_t len) {
        for (size_t i = 0; i < len; ++i) {
            update_char(data[i]);
        }
    }

    // Current hash value. Does not reset state; further updates may follow.
    constexpr uint64_t finish() const { return state; }
};
40+
41+
template <Dialect D>
42+
class Digest {
43+
public:
44+
explicit Digest(Arena& arena) : arena_(arena) {}
45+
46+
// From a parsed AST (Tier 1) -- uses Emitter in DIGEST mode
47+
DigestResult compute(const AstNode* ast) {
48+
Emitter<D> emitter(arena_, EmitMode::DIGEST);
49+
emitter.emit(ast);
50+
StringRef normalized = emitter.result();
51+
FnvHash hasher;
52+
hasher.update(normalized.ptr, normalized.len);
53+
return DigestResult{normalized, hasher.finish()};
54+
}
55+
56+
// From raw SQL (works for any statement) -- uses token-level fallback
57+
DigestResult compute(const char* sql, size_t len) {
58+
return compute_token_level(sql, len);
59+
}
60+
61+
private:
62+
Arena& arena_;
63+
64+
// Helper: check if a token type is a keyword (not an identifier, literal, or operator)
65+
static bool is_keyword_token(TokenType type) {
66+
// Keywords start at TK_SELECT and go through TK_EXCEPT
67+
return static_cast<uint16_t>(type) >= static_cast<uint16_t>(TokenType::TK_SELECT);
68+
}
69+
70+
// Helper: check if a token type is a literal value that should become ?
71+
static bool is_literal_token(TokenType type) {
72+
return type == TokenType::TK_INTEGER ||
73+
type == TokenType::TK_FLOAT ||
74+
type == TokenType::TK_STRING;
75+
}
76+
77+
// Helper: uppercase a character
78+
static char to_upper(char c) {
79+
return (c >= 'a' && c <= 'z') ? (c - 32) : c;
80+
}
81+
82+
// Append token text uppercased to StringBuilder
83+
static void append_upper(StringBuilder& sb, const char* ptr, uint32_t len) {
84+
for (uint32_t i = 0; i < len; ++i) {
85+
sb.append_char(to_upper(ptr[i]));
86+
}
87+
}
88+
89+
// Determine if we need a space before this token given the previous token type
90+
static bool needs_space_before(TokenType prev, TokenType cur) {
91+
// Never space after ( or before )
92+
if (prev == TokenType::TK_LPAREN) return false;
93+
if (cur == TokenType::TK_RPAREN) return false;
94+
// No space before or after dot
95+
if (prev == TokenType::TK_DOT || cur == TokenType::TK_DOT) return false;
96+
// No space before comma
97+
if (cur == TokenType::TK_COMMA) return false;
98+
// No space after @ or @@
99+
if (prev == TokenType::TK_AT || prev == TokenType::TK_DOUBLE_AT) return false;
100+
// No space before @
101+
if (cur == TokenType::TK_AT) return false;
102+
return true;
103+
}
104+
105+
// Emit a single token to the string builder, uppercasing keywords, replacing literals with ?
106+
void emit_token(StringBuilder& sb, const Token& t, TokenType prev) {
107+
bool space = (prev != TokenType::TK_EOF) && needs_space_before(prev, t.type);
108+
if (space) sb.append_char(' ');
109+
110+
if (is_literal_token(t.type)) {
111+
sb.append_char('?');
112+
} else if (is_keyword_token(t.type)) {
113+
append_upper(sb, t.text.ptr, t.text.len);
114+
} else if (t.type == TokenType::TK_IDENTIFIER) {
115+
sb.append(t.text.ptr, t.text.len);
116+
} else if (t.type == TokenType::TK_QUESTION) {
117+
sb.append_char('?');
118+
} else if (t.type == TokenType::TK_COMMA) {
119+
sb.append(",", 1);
120+
} else {
121+
// All other tokens: emit as-is
122+
sb.append(t.text.ptr, t.text.len);
123+
}
124+
}
125+
126+
// Skip tokens inside parentheses until matching close paren. Returns last token type consumed.
127+
void skip_paren_contents(Tokenizer<D>& tok) {
128+
int depth = 1;
129+
while (depth > 0) {
130+
Token inner = tok.next_token();
131+
if (inner.type == TokenType::TK_EOF) break;
132+
if (inner.type == TokenType::TK_LPAREN) depth++;
133+
if (inner.type == TokenType::TK_RPAREN) depth--;
134+
}
135+
}
136+
137+
// Token-level digest: walk tokens, normalize, hash
138+
DigestResult compute_token_level(const char* sql, size_t len) {
139+
Tokenizer<D> tok;
140+
tok.reset(sql, len);
141+
StringBuilder sb(arena_);
142+
TokenType prev = TokenType::TK_EOF;
143+
144+
// We collect tokens into a small buffer for lookahead patterns
145+
// Main loop: read token, check for special patterns, emit
146+
147+
Token t = tok.next_token();
148+
149+
while (t.type != TokenType::TK_EOF && t.type != TokenType::TK_SEMICOLON) {
150+
151+
// Pattern: IN (...) -> collapse to IN (?)
152+
if (t.type == TokenType::TK_IN) {
153+
emit_token(sb, t, prev);
154+
prev = t.type;
155+
156+
Token next = tok.next_token();
157+
if (next.type == TokenType::TK_LPAREN) {
158+
// Emit " ("
159+
emit_token(sb, next, prev);
160+
prev = next.type;
161+
// Collapse contents to single ?
162+
bool emitted_q = false;
163+
int depth = 1;
164+
while (depth > 0) {
165+
Token inner = tok.next_token();
166+
if (inner.type == TokenType::TK_EOF) break;
167+
if (inner.type == TokenType::TK_LPAREN) { depth++; continue; }
168+
if (inner.type == TokenType::TK_RPAREN) {
169+
depth--;
170+
if (depth == 0) {
171+
sb.append_char(')');
172+
prev = TokenType::TK_RPAREN;
173+
break;
174+
}
175+
continue;
176+
}
177+
if (!emitted_q) {
178+
sb.append_char('?');
179+
prev = TokenType::TK_QUESTION;
180+
emitted_q = true;
181+
}
182+
}
183+
t = tok.next_token();
184+
continue;
185+
} else {
186+
// IN not followed by ( -- process next token normally
187+
t = next;
188+
continue;
189+
}
190+
}
191+
192+
// Pattern: VALUES (...), (...), ... -> collapse to VALUES (?, ?, ...)
193+
if (t.type == TokenType::TK_VALUES) {
194+
emit_token(sb, t, prev);
195+
prev = t.type;
196+
197+
Token next = tok.next_token();
198+
if (next.type == TokenType::TK_LPAREN) {
199+
// Emit the opening paren
200+
emit_token(sb, next, prev);
201+
prev = next.type;
202+
203+
// Emit first row contents with ? for each value slot
204+
int depth = 1;
205+
while (depth > 0) {
206+
Token inner = tok.next_token();
207+
if (inner.type == TokenType::TK_EOF) break;
208+
if (inner.type == TokenType::TK_LPAREN) {
209+
depth++;
210+
continue;
211+
}
212+
if (inner.type == TokenType::TK_RPAREN) {
213+
depth--;
214+
if (depth == 0) {
215+
sb.append_char(')');
216+
prev = TokenType::TK_RPAREN;
217+
break;
218+
}
219+
continue;
220+
}
221+
if (inner.type == TokenType::TK_COMMA && depth == 1) {
222+
sb.append(", ", 2);
223+
prev = TokenType::TK_COMMA;
224+
continue;
225+
}
226+
// Emit ? for literals and existing placeholders
227+
if (is_literal_token(inner.type) || inner.type == TokenType::TK_QUESTION) {
228+
// Only emit ? once per value slot (skip if prev already emitted one)
229+
if (prev == TokenType::TK_LPAREN || prev == TokenType::TK_COMMA) {
230+
sb.append_char('?');
231+
prev = TokenType::TK_QUESTION;
232+
}
233+
}
234+
}
235+
236+
// Skip additional rows: , (...)
237+
while (true) {
238+
Token peek = tok.next_token();
239+
if (peek.type == TokenType::TK_COMMA) {
240+
Token peek2 = tok.next_token();
241+
if (peek2.type == TokenType::TK_LPAREN) {
242+
// Skip this entire row
243+
skip_paren_contents(tok);
244+
continue;
245+
} else {
246+
// Comma but not followed by ( -- it's not another row
247+
// Emit the comma and continue with peek2
248+
sb.append(",", 1);
249+
prev = TokenType::TK_COMMA;
250+
t = peek2;
251+
goto emit_normal;
252+
}
253+
} else {
254+
// Not a comma - done with VALUES rows
255+
t = peek;
256+
goto emit_normal;
257+
}
258+
}
259+
} else {
260+
// VALUES not followed by ( -- process next normally
261+
t = next;
262+
continue;
263+
}
264+
}
265+
266+
emit_normal:
267+
// Check if we've reached the end (can happen after VALUES/IN lookahead)
268+
if (t.type == TokenType::TK_EOF || t.type == TokenType::TK_SEMICOLON) break;
269+
270+
emit_token(sb, t, prev);
271+
prev = t.type;
272+
// For literal tokens, record as TK_QUESTION since we emitted ?
273+
if (is_literal_token(t.type)) prev = TokenType::TK_QUESTION;
274+
275+
t = tok.next_token();
276+
}
277+
278+
StringRef normalized = sb.finish();
279+
FnvHash hasher;
280+
hasher.update(normalized.ptr, normalized.len);
281+
return DigestResult{normalized, hasher.finish()};
282+
}
283+
};
284+
285+
} // namespace sql_parser
286+
287+
#endif // SQL_PARSER_DIGEST_H

include/sql_parser/emitter.h

Lines changed: 29 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,14 @@
1010

1111
namespace sql_parser {
1212

13+
// NORMAL reproduces the query as written; DIGEST normalizes it for
// fingerprinting (literals -> '?', IN lists and VALUES rows collapsed,
// aliases dropped).
enum class EmitMode : uint8_t { NORMAL, DIGEST };
14+
1315
template <Dialect D>
1416
class Emitter {
1517
public:
16-
explicit Emitter(Arena& arena, const ParamBindings* bindings = nullptr)
17-
: sb_(arena), bindings_(bindings), placeholder_index_(0) {}
18+
explicit Emitter(Arena& arena, EmitMode mode = EmitMode::NORMAL,
19+
const ParamBindings* bindings = nullptr)
20+
: sb_(arena), bindings_(bindings), placeholder_index_(0), mode_(mode) {}
1821

1922
void emit(const AstNode* node) {
2023
if (!node) return;
@@ -27,6 +30,7 @@ class Emitter {
2730
StringBuilder sb_;
2831
const ParamBindings* bindings_;
2932
uint16_t placeholder_index_;
33+
EmitMode mode_;
3034

3135
void emit_node(const AstNode* node) {
3236
switch (node->type) {
@@ -103,13 +107,16 @@ class Emitter {
103107
// ---- Leaf nodes (emit value directly) ----
104108
case NodeType::NODE_LITERAL_INT:
105109
case NodeType::NODE_LITERAL_FLOAT:
110+
if (mode_ == EmitMode::DIGEST) { sb_.append_char('?'); break; }
111+
emit_value(node); break;
106112
case NodeType::NODE_LITERAL_NULL:
107113
case NodeType::NODE_COLUMN_REF:
108114
case NodeType::NODE_ASTERISK:
109115
case NodeType::NODE_IDENTIFIER:
110116
emit_value(node); break;
111117

112118
case NodeType::NODE_LITERAL_STRING:
119+
if (mode_ == EmitMode::DIGEST) { sb_.append_char('?'); break; }
113120
emit_string_literal(node); break;
114121

115122
default:
@@ -293,6 +300,7 @@ class Emitter {
293300
}
294301

295302
void emit_alias(const AstNode* node) {
303+
if (mode_ == EmitMode::DIGEST) return; // skip aliases in digest mode
296304
sb_.append(" AS ");
297305
emit_value(node);
298306
}
@@ -489,11 +497,16 @@ class Emitter {
489497
return;
490498
}
491499
sb_.append("VALUES ");
492-
bool first = true;
493-
for (const AstNode* child = node->first_child; child; child = child->next_sibling) {
494-
if (!first) sb_.append(", ");
495-
first = false;
496-
emit_node(child);
500+
if (mode_ == EmitMode::DIGEST) {
501+
// Collapse to single row in digest mode
502+
if (node->first_child) emit_node(node->first_child);
503+
} else {
504+
bool first = true;
505+
for (const AstNode* child = node->first_child; child; child = child->next_sibling) {
506+
if (!first) sb_.append(", ");
507+
first = false;
508+
emit_node(child);
509+
}
497510
}
498511
}
499512

@@ -893,11 +906,15 @@ class Emitter {
893906
const AstNode* expr = node->first_child;
894907
if (expr) emit_node(expr);
895908
sb_.append(" IN (");
896-
bool first = true;
897-
for (const AstNode* val = expr ? expr->next_sibling : nullptr; val; val = val->next_sibling) {
898-
if (!first) sb_.append(", ");
899-
first = false;
900-
emit_node(val);
909+
if (mode_ == EmitMode::DIGEST) {
910+
sb_.append_char('?');
911+
} else {
912+
bool first = true;
913+
for (const AstNode* val = expr ? expr->next_sibling : nullptr; val; val = val->next_sibling) {
914+
if (!first) sb_.append(", ");
915+
first = false;
916+
emit_node(val);
917+
}
901918
}
902919
sb_.append_char(')');
903920
}

0 commit comments

Comments
 (0)