|
| 1 | +#ifndef SQL_PARSER_DIGEST_H |
| 2 | +#define SQL_PARSER_DIGEST_H |
| 3 | + |
| 4 | +#include "sql_parser/common.h" |
| 5 | +#include "sql_parser/arena.h" |
| 6 | +#include "sql_parser/tokenizer.h" |
| 7 | +#include "sql_parser/ast.h" |
| 8 | +#include "sql_parser/emitter.h" |
| 9 | +#include "sql_parser/string_builder.h" |
| 10 | +#include <cstdint> |
| 11 | + |
| 12 | +namespace sql_parser { |
| 13 | + |
// Result of digest computation: the normalized statement text plus its hash.
// `normalized` points into arena-owned memory (lifetime tied to the Arena
// passed to Digest); `hash` is the FNV-1a hash of exactly those bytes.
struct DigestResult {
    StringRef normalized; // "SELECT * FROM t WHERE id = ?"
    uint64_t hash;        // 64-bit FNV-1a hash
};
| 18 | + |
// FNV-1a 64-bit hash -- simple, fast, no external dependency.
// Incremental: feed bytes via update()/update_char(), read via finish().
struct FnvHash {
    static constexpr uint64_t FNV_OFFSET_BASIS = 14695981039346656037ULL;
    static constexpr uint64_t FNV_PRIME = 1099511628211ULL;

    // Running hash state; starts at the FNV offset basis (hash of "").
    uint64_t state = FNV_OFFSET_BASIS;

    // Mix a single byte into the hash (FNV-1a order: xor, then multiply).
    constexpr void update_char(char c) {
        state ^= static_cast<uint64_t>(static_cast<uint8_t>(c));
        state *= FNV_PRIME;
    }

    // Mix `len` bytes into the hash. Delegates to update_char so the
    // byte-mixing step is defined in exactly one place.
    constexpr void update(const char* data, size_t len) {
        for (size_t i = 0; i < len; ++i) {
            update_char(data[i]);
        }
    }

    // Final 64-bit digest. Does not reset state; safe to call repeatedly
    // and to keep feeding more bytes afterwards.
    [[nodiscard]] constexpr uint64_t finish() const { return state; }
};
| 40 | + |
| 41 | +template <Dialect D> |
| 42 | +class Digest { |
| 43 | +public: |
| 44 | + explicit Digest(Arena& arena) : arena_(arena) {} |
| 45 | + |
| 46 | + // From a parsed AST (Tier 1) -- uses Emitter in DIGEST mode |
| 47 | + DigestResult compute(const AstNode* ast) { |
| 48 | + Emitter<D> emitter(arena_, EmitMode::DIGEST); |
| 49 | + emitter.emit(ast); |
| 50 | + StringRef normalized = emitter.result(); |
| 51 | + FnvHash hasher; |
| 52 | + hasher.update(normalized.ptr, normalized.len); |
| 53 | + return DigestResult{normalized, hasher.finish()}; |
| 54 | + } |
| 55 | + |
| 56 | + // From raw SQL (works for any statement) -- uses token-level fallback |
| 57 | + DigestResult compute(const char* sql, size_t len) { |
| 58 | + return compute_token_level(sql, len); |
| 59 | + } |
| 60 | + |
| 61 | +private: |
| 62 | + Arena& arena_; |
| 63 | + |
| 64 | + // Helper: check if a token type is a keyword (not an identifier, literal, or operator) |
| 65 | + static bool is_keyword_token(TokenType type) { |
| 66 | + // Keywords start at TK_SELECT and go through TK_EXCEPT |
| 67 | + return static_cast<uint16_t>(type) >= static_cast<uint16_t>(TokenType::TK_SELECT); |
| 68 | + } |
| 69 | + |
| 70 | + // Helper: check if a token type is a literal value that should become ? |
| 71 | + static bool is_literal_token(TokenType type) { |
| 72 | + return type == TokenType::TK_INTEGER || |
| 73 | + type == TokenType::TK_FLOAT || |
| 74 | + type == TokenType::TK_STRING; |
| 75 | + } |
| 76 | + |
| 77 | + // Helper: uppercase a character |
| 78 | + static char to_upper(char c) { |
| 79 | + return (c >= 'a' && c <= 'z') ? (c - 32) : c; |
| 80 | + } |
| 81 | + |
| 82 | + // Append token text uppercased to StringBuilder |
| 83 | + static void append_upper(StringBuilder& sb, const char* ptr, uint32_t len) { |
| 84 | + for (uint32_t i = 0; i < len; ++i) { |
| 85 | + sb.append_char(to_upper(ptr[i])); |
| 86 | + } |
| 87 | + } |
| 88 | + |
| 89 | + // Determine if we need a space before this token given the previous token type |
| 90 | + static bool needs_space_before(TokenType prev, TokenType cur) { |
| 91 | + // Never space after ( or before ) |
| 92 | + if (prev == TokenType::TK_LPAREN) return false; |
| 93 | + if (cur == TokenType::TK_RPAREN) return false; |
| 94 | + // No space before or after dot |
| 95 | + if (prev == TokenType::TK_DOT || cur == TokenType::TK_DOT) return false; |
| 96 | + // No space before comma |
| 97 | + if (cur == TokenType::TK_COMMA) return false; |
| 98 | + // No space after @ or @@ |
| 99 | + if (prev == TokenType::TK_AT || prev == TokenType::TK_DOUBLE_AT) return false; |
| 100 | + // No space before @ |
| 101 | + if (cur == TokenType::TK_AT) return false; |
| 102 | + return true; |
| 103 | + } |
| 104 | + |
| 105 | + // Emit a single token to the string builder, uppercasing keywords, replacing literals with ? |
| 106 | + void emit_token(StringBuilder& sb, const Token& t, TokenType prev) { |
| 107 | + bool space = (prev != TokenType::TK_EOF) && needs_space_before(prev, t.type); |
| 108 | + if (space) sb.append_char(' '); |
| 109 | + |
| 110 | + if (is_literal_token(t.type)) { |
| 111 | + sb.append_char('?'); |
| 112 | + } else if (is_keyword_token(t.type)) { |
| 113 | + append_upper(sb, t.text.ptr, t.text.len); |
| 114 | + } else if (t.type == TokenType::TK_IDENTIFIER) { |
| 115 | + sb.append(t.text.ptr, t.text.len); |
| 116 | + } else if (t.type == TokenType::TK_QUESTION) { |
| 117 | + sb.append_char('?'); |
| 118 | + } else if (t.type == TokenType::TK_COMMA) { |
| 119 | + sb.append(",", 1); |
| 120 | + } else { |
| 121 | + // All other tokens: emit as-is |
| 122 | + sb.append(t.text.ptr, t.text.len); |
| 123 | + } |
| 124 | + } |
| 125 | + |
| 126 | + // Skip tokens inside parentheses until matching close paren. Returns last token type consumed. |
| 127 | + void skip_paren_contents(Tokenizer<D>& tok) { |
| 128 | + int depth = 1; |
| 129 | + while (depth > 0) { |
| 130 | + Token inner = tok.next_token(); |
| 131 | + if (inner.type == TokenType::TK_EOF) break; |
| 132 | + if (inner.type == TokenType::TK_LPAREN) depth++; |
| 133 | + if (inner.type == TokenType::TK_RPAREN) depth--; |
| 134 | + } |
| 135 | + } |
| 136 | + |
| 137 | + // Token-level digest: walk tokens, normalize, hash |
| 138 | + DigestResult compute_token_level(const char* sql, size_t len) { |
| 139 | + Tokenizer<D> tok; |
| 140 | + tok.reset(sql, len); |
| 141 | + StringBuilder sb(arena_); |
| 142 | + TokenType prev = TokenType::TK_EOF; |
| 143 | + |
| 144 | + // We collect tokens into a small buffer for lookahead patterns |
| 145 | + // Main loop: read token, check for special patterns, emit |
| 146 | + |
| 147 | + Token t = tok.next_token(); |
| 148 | + |
| 149 | + while (t.type != TokenType::TK_EOF && t.type != TokenType::TK_SEMICOLON) { |
| 150 | + |
| 151 | + // Pattern: IN (...) -> collapse to IN (?) |
| 152 | + if (t.type == TokenType::TK_IN) { |
| 153 | + emit_token(sb, t, prev); |
| 154 | + prev = t.type; |
| 155 | + |
| 156 | + Token next = tok.next_token(); |
| 157 | + if (next.type == TokenType::TK_LPAREN) { |
| 158 | + // Emit " (" |
| 159 | + emit_token(sb, next, prev); |
| 160 | + prev = next.type; |
| 161 | + // Collapse contents to single ? |
| 162 | + bool emitted_q = false; |
| 163 | + int depth = 1; |
| 164 | + while (depth > 0) { |
| 165 | + Token inner = tok.next_token(); |
| 166 | + if (inner.type == TokenType::TK_EOF) break; |
| 167 | + if (inner.type == TokenType::TK_LPAREN) { depth++; continue; } |
| 168 | + if (inner.type == TokenType::TK_RPAREN) { |
| 169 | + depth--; |
| 170 | + if (depth == 0) { |
| 171 | + sb.append_char(')'); |
| 172 | + prev = TokenType::TK_RPAREN; |
| 173 | + break; |
| 174 | + } |
| 175 | + continue; |
| 176 | + } |
| 177 | + if (!emitted_q) { |
| 178 | + sb.append_char('?'); |
| 179 | + prev = TokenType::TK_QUESTION; |
| 180 | + emitted_q = true; |
| 181 | + } |
| 182 | + } |
| 183 | + t = tok.next_token(); |
| 184 | + continue; |
| 185 | + } else { |
| 186 | + // IN not followed by ( -- process next token normally |
| 187 | + t = next; |
| 188 | + continue; |
| 189 | + } |
| 190 | + } |
| 191 | + |
| 192 | + // Pattern: VALUES (...), (...), ... -> collapse to VALUES (?, ?, ...) |
| 193 | + if (t.type == TokenType::TK_VALUES) { |
| 194 | + emit_token(sb, t, prev); |
| 195 | + prev = t.type; |
| 196 | + |
| 197 | + Token next = tok.next_token(); |
| 198 | + if (next.type == TokenType::TK_LPAREN) { |
| 199 | + // Emit the opening paren |
| 200 | + emit_token(sb, next, prev); |
| 201 | + prev = next.type; |
| 202 | + |
| 203 | + // Emit first row contents with ? for each value slot |
| 204 | + int depth = 1; |
| 205 | + while (depth > 0) { |
| 206 | + Token inner = tok.next_token(); |
| 207 | + if (inner.type == TokenType::TK_EOF) break; |
| 208 | + if (inner.type == TokenType::TK_LPAREN) { |
| 209 | + depth++; |
| 210 | + continue; |
| 211 | + } |
| 212 | + if (inner.type == TokenType::TK_RPAREN) { |
| 213 | + depth--; |
| 214 | + if (depth == 0) { |
| 215 | + sb.append_char(')'); |
| 216 | + prev = TokenType::TK_RPAREN; |
| 217 | + break; |
| 218 | + } |
| 219 | + continue; |
| 220 | + } |
| 221 | + if (inner.type == TokenType::TK_COMMA && depth == 1) { |
| 222 | + sb.append(", ", 2); |
| 223 | + prev = TokenType::TK_COMMA; |
| 224 | + continue; |
| 225 | + } |
| 226 | + // Emit ? for literals and existing placeholders |
| 227 | + if (is_literal_token(inner.type) || inner.type == TokenType::TK_QUESTION) { |
| 228 | + // Only emit ? once per value slot (skip if prev already emitted one) |
| 229 | + if (prev == TokenType::TK_LPAREN || prev == TokenType::TK_COMMA) { |
| 230 | + sb.append_char('?'); |
| 231 | + prev = TokenType::TK_QUESTION; |
| 232 | + } |
| 233 | + } |
| 234 | + } |
| 235 | + |
| 236 | + // Skip additional rows: , (...) |
| 237 | + while (true) { |
| 238 | + Token peek = tok.next_token(); |
| 239 | + if (peek.type == TokenType::TK_COMMA) { |
| 240 | + Token peek2 = tok.next_token(); |
| 241 | + if (peek2.type == TokenType::TK_LPAREN) { |
| 242 | + // Skip this entire row |
| 243 | + skip_paren_contents(tok); |
| 244 | + continue; |
| 245 | + } else { |
| 246 | + // Comma but not followed by ( -- it's not another row |
| 247 | + // Emit the comma and continue with peek2 |
| 248 | + sb.append(",", 1); |
| 249 | + prev = TokenType::TK_COMMA; |
| 250 | + t = peek2; |
| 251 | + goto emit_normal; |
| 252 | + } |
| 253 | + } else { |
| 254 | + // Not a comma - done with VALUES rows |
| 255 | + t = peek; |
| 256 | + goto emit_normal; |
| 257 | + } |
| 258 | + } |
| 259 | + } else { |
| 260 | + // VALUES not followed by ( -- process next normally |
| 261 | + t = next; |
| 262 | + continue; |
| 263 | + } |
| 264 | + } |
| 265 | + |
| 266 | + emit_normal: |
| 267 | + // Check if we've reached the end (can happen after VALUES/IN lookahead) |
| 268 | + if (t.type == TokenType::TK_EOF || t.type == TokenType::TK_SEMICOLON) break; |
| 269 | + |
| 270 | + emit_token(sb, t, prev); |
| 271 | + prev = t.type; |
| 272 | + // For literal tokens, record as TK_QUESTION since we emitted ? |
| 273 | + if (is_literal_token(t.type)) prev = TokenType::TK_QUESTION; |
| 274 | + |
| 275 | + t = tok.next_token(); |
| 276 | + } |
| 277 | + |
| 278 | + StringRef normalized = sb.finish(); |
| 279 | + FnvHash hasher; |
| 280 | + hasher.update(normalized.ptr, normalized.len); |
| 281 | + return DigestResult{normalized, hasher.finish()}; |
| 282 | + } |
| 283 | +}; |
| 284 | + |
| 285 | +} // namespace sql_parser |
| 286 | + |
| 287 | +#endif // SQL_PARSER_DIGEST_H |
0 commit comments