Skip to content

Commit 7f53668

Browse files
committed
perf: replace binary search keyword lookup with FNV-1a hash table (O(1))
Use a 512-slot open-addressing hash table with linear probing instead of binary search (~7 comparisons) for keyword classification. The table is built once at startup from the existing keyword arrays.

Benchmarks show an 18-32% improvement across parser operations:
- Select_Simple: 239ns -> 185ns (-23%)
- Select_Complex: 1448ns -> 1104ns (-24%)
- Classify_Begin: 38ns -> 31ns (-18%)
- Select_MultiJoin: 1577ns -> 1073ns (-32%)

Closes #12
1 parent 43fe79a commit 7f53668

3 files changed

Lines changed: 102 additions & 18 deletions

File tree

include/sql_parser/keyword_hash.h

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
#ifndef SQL_PARSER_KEYWORD_HASH_H
2+
#define SQL_PARSER_KEYWORD_HASH_H
3+
4+
#include "sql_parser/token.h"
5+
#include <cstdint>
6+
#include <cstring>
7+
8+
namespace sql_parser {
9+
namespace keyword_hash {
10+
11+
// Case-insensitive 32-bit FNV-1a hash of the first `len` bytes of `text`.
//
// ASCII 'A'..'Z' are folded to lowercase before being mixed in, so "SELECT",
// "select" and "SeLeCt" all hash to the same value. Every other byte is hashed
// as-is. Uses the standard FNV-1a 32-bit offset basis and prime.
//
// constexpr so the hash can be evaluated at compile time (e.g. in a
// static_assert or a compile-time-built table) as well as at run time.
//
// text: pointer to at least `len` readable bytes (need not be NUL-terminated).
// len:  number of bytes to hash; 0 yields the offset basis.
inline constexpr uint32_t hash_keyword(const char* text, uint32_t len) {
    constexpr uint32_t kOffsetBasis = 0x811c9dc5u;
    constexpr uint32_t kPrime = 0x01000193u;

    uint32_t h = kOffsetBasis;
    for (uint32_t i = 0; i < len; ++i) {
        const uint8_t raw = static_cast<uint8_t>(text[i]);
        // ASCII uppercase to lowercase ('a' - 'A' == 32); other bytes untouched.
        const bool is_upper = raw >= 'A' && raw <= 'Z';
        const uint8_t folded = is_upper ? static_cast<uint8_t>(raw + 32) : raw;
        h ^= folded;
        h *= kPrime;
    }
    return h;
}
24+
25+
// Hash table entry
26+
struct HashEntry {
27+
const char* text; // uppercase keyword text (nullptr = empty slot)
28+
uint8_t len;
29+
TokenType token;
30+
};
31+
32+
// Number of slots in a keyword hash table. Must be a power of 2 so that
// `hash & TABLE_MASK` is a valid slot index. 512 gives ~29% load for 150 MySQL
// keywords (max probe = 5) and ~24% load for 123 PgSQL keywords (max probe = 3).
inline constexpr uint32_t TABLE_SIZE = 512;
static_assert(TABLE_SIZE != 0 && (TABLE_SIZE & (TABLE_SIZE - 1)) == 0,
              "TABLE_SIZE must be a power of 2");

// Bit mask that reduces a hash value to a slot index in [0, TABLE_SIZE).
inline constexpr uint32_t TABLE_MASK = TABLE_SIZE - 1;

// Budget for linear-probe displacement from a keyword's home slot.
// Empirically: MySQL = 5, PgSQL = 3. 8 leaves a safety margin.
inline constexpr uint32_t MAX_PROBE = 8;
40+
41+
// Case-insensitive equality of the first `len` bytes of `input` and `upper`.
//
// Only `input` is case-folded: ASCII 'a'..'z' are mapped to uppercase and then
// compared byte-for-byte with `upper`, which is expected to already be
// uppercase (a lowercase letter in `upper` never matches). Non-letter bytes
// must match exactly. Returns true when len == 0.
//
// constexpr so it can be used in constant expressions as well as at run time.
inline constexpr bool ci_equal(const char* input, const char* upper, uint32_t len) {
    for (uint32_t i = 0; i < len; ++i) {
        const uint8_t in = static_cast<uint8_t>(input[i]);
        const uint8_t expected = static_cast<uint8_t>(upper[i]);
        // ASCII lowercase to uppercase ('a' - 'A' == 32).
        const bool is_lower = in >= 'a' && in <= 'z';
        const uint8_t folded = is_lower ? static_cast<uint8_t>(in - 32) : in;
        if (folded != expected) return false;
    }
    return true;
}
51+
52+
// Build a hash table from a sorted keyword array.
53+
// KeywordArray must have .text, .len, .token fields.
54+
// Returns the number of keywords inserted (for debug).
55+
template <typename KeywordEntry, size_t N>
56+
inline void build_table(const KeywordEntry (&keywords)[N], HashEntry (&table)[TABLE_SIZE]) {
57+
// Zero-init
58+
for (uint32_t i = 0; i < TABLE_SIZE; ++i) {
59+
table[i] = {nullptr, 0, TokenType::TK_IDENTIFIER};
60+
}
61+
for (size_t k = 0; k < N; ++k) {
62+
uint32_t idx = hash_keyword(keywords[k].text, keywords[k].len) & TABLE_MASK;
63+
while (table[idx].text != nullptr) {
64+
idx = (idx + 1) & TABLE_MASK;
65+
}
66+
table[idx] = {keywords[k].text, keywords[k].len, keywords[k].token};
67+
}
68+
}
69+
70+
// Lookup a keyword in a pre-built hash table.
71+
inline TokenType lookup_in_table(const HashEntry (&table)[TABLE_SIZE],
72+
const char* text, uint32_t len) {
73+
uint32_t idx = hash_keyword(text, len) & TABLE_MASK;
74+
for (uint32_t probe = 0; probe <= MAX_PROBE; ++probe) {
75+
const HashEntry& e = table[idx];
76+
if (e.text == nullptr) return TokenType::TK_IDENTIFIER;
77+
if (e.len == len && ci_equal(text, e.text, len)) return e.token;
78+
idx = (idx + 1) & TABLE_MASK;
79+
}
80+
return TokenType::TK_IDENTIFIER;
81+
}
82+
83+
} // namespace keyword_hash
84+
} // namespace sql_parser
85+
86+
#endif // SQL_PARSER_KEYWORD_HASH_H

include/sql_parser/keywords_mysql.h

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#define SQL_PARSER_KEYWORDS_MYSQL_H
33

44
#include "sql_parser/token.h"
5+
#include "sql_parser/keyword_hash.h"
56
#include <algorithm>
67
#include <cstring>
78

@@ -169,16 +170,14 @@ inline constexpr KeywordEntry KEYWORDS[] = {
169170

170171
inline constexpr size_t KEYWORD_COUNT = sizeof(KEYWORDS) / sizeof(KEYWORDS[0]);
171172

173+
// Keyword hash table for this dialect. It is filled by the HASH_TABLE_INIT
// initializer below; until that has run, every slot is still zeroed
// (text == nullptr), which lookups treat as "empty".
inline keyword_hash::HashEntry HASH_TABLE[keyword_hash::TABLE_SIZE];
174+
// Dummy flag whose initializer populates HASH_TABLE from KEYWORDS as a side
// effect; the stored value (always true) is not otherwise meaningful here.
// NOTE(review): this is dynamic initialization of an inline variable. A
// lookup() made from another translation unit's static initializer could run
// before this and see an empty table, classifying every keyword as
// TK_IDENTIFIER. Confirm no caller runs that early.
inline bool HASH_TABLE_INIT = [] {
175+
keyword_hash::build_table(KEYWORDS, HASH_TABLE);
176+
return true;
177+
}();
178+
172179
// Classify text[0..len) against this dialect's keyword set.
// In this diff rendering, the lines that follow an `N-` marker are the removed
// binary search over KEYWORDS; the line that follows the `N+` marker is the new
// body, which delegates to keyword_hash::lookup_in_table over HASH_TABLE.
inline TokenType lookup(const char* text, uint32_t len) {
173-
size_t lo = 0, hi = KEYWORD_COUNT;
174-
while (lo < hi) {
175-
size_t mid = lo + (hi - lo) / 2;
176-
int cmp = sql_parser::ci_cmp(text, len, KEYWORDS[mid].text, KEYWORDS[mid].len);
177-
if (cmp == 0) return KEYWORDS[mid].token;
178-
if (cmp < 0) hi = mid;
179-
else lo = mid + 1;
180-
}
181-
return TokenType::TK_IDENTIFIER;
180+
return keyword_hash::lookup_in_table(HASH_TABLE, text, len);
182181
}
183182

184183
} // namespace mysql_keywords

include/sql_parser/keywords_pgsql.h

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#define SQL_PARSER_KEYWORDS_PGSQL_H
33

44
#include "sql_parser/token.h"
5+
#include "sql_parser/keyword_hash.h"
56

67
namespace sql_parser {
78
namespace pgsql_keywords {
@@ -140,16 +141,14 @@ inline constexpr KeywordEntry KEYWORDS[] = {
140141

141142
inline constexpr size_t KEYWORD_COUNT = sizeof(KEYWORDS) / sizeof(KEYWORDS[0]);
142143

144+
// Keyword hash table for this dialect. It is filled by the HASH_TABLE_INIT
// initializer below; until that has run, every slot is still zeroed
// (text == nullptr), which lookups treat as "empty".
inline keyword_hash::HashEntry HASH_TABLE[keyword_hash::TABLE_SIZE];
145+
// Dummy flag whose initializer populates HASH_TABLE from KEYWORDS as a side
// effect; the stored value (always true) is not otherwise meaningful here.
// NOTE(review): this is dynamic initialization of an inline variable. A
// lookup() made from another translation unit's static initializer could run
// before this and see an empty table, classifying every keyword as
// TK_IDENTIFIER. Confirm no caller runs that early.
inline bool HASH_TABLE_INIT = [] {
146+
keyword_hash::build_table(KEYWORDS, HASH_TABLE);
147+
return true;
148+
}();
149+
143150
// Classify text[0..len) against this dialect's keyword set.
// In this diff rendering, the lines that follow an `N-` marker are the removed
// binary search over KEYWORDS; the line that follows the `N+` marker is the new
// body, which delegates to keyword_hash::lookup_in_table over HASH_TABLE.
inline TokenType lookup(const char* text, uint32_t len) {
144-
size_t lo = 0, hi = KEYWORD_COUNT;
145-
while (lo < hi) {
146-
size_t mid = lo + (hi - lo) / 2;
147-
int cmp = sql_parser::ci_cmp(text, len, KEYWORDS[mid].text, KEYWORDS[mid].len);
148-
if (cmp == 0) return KEYWORDS[mid].token;
149-
if (cmp < 0) hi = mid;
150-
else lo = mid + 1;
151-
}
152-
return TokenType::TK_IDENTIFIER;
151+
return keyword_hash::lookup_in_table(HASH_TABLE, text, len);
153152
}
154153

155154
} // namespace pgsql_keywords

0 commit comments

Comments
 (0)