Skip to content

Commit 4d97bb7

Browse files
committed
feat: add dialect-templated tokenizer with MySQL and PostgreSQL support
1 parent b51164e commit 4d97bb7

2 files changed

Lines changed: 604 additions & 0 deletions

File tree

include/sql_parser/tokenizer.h

Lines changed: 361 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,361 @@
1+
#ifndef SQL_PARSER_TOKENIZER_H
2+
#define SQL_PARSER_TOKENIZER_H
3+
4+
#include "sql_parser/token.h"
5+
#include "sql_parser/keywords_mysql.h"
6+
#include "sql_parser/keywords_pgsql.h"
7+
8+
namespace sql_parser {
9+
10+
template <Dialect D>
11+
class Tokenizer {
12+
public:
13+
// Point the tokenizer at the buffer [input, input + len) and start scanning
// from its first byte. Any token buffered by an earlier peek() is dropped.
void reset(const char* input, size_t len) {
    has_peeked_ = false;
    start_ = cursor_ = input;
    end_ = start_ + len;
}
19+
20+
// Return the next token and consume it. A token previously buffered by
// peek() is handed out first; otherwise a fresh token is scanned.
Token next_token() {
    if (!has_peeked_) {
        return scan_token();
    }
    has_peeked_ = false;
    return peeked_;
}
27+
28+
// Return the upcoming token without consuming it. The token is scanned once
// and cached, so repeated calls return the same token until it is consumed
// by next_token() or skip().
Token peek() {
    if (has_peeked_) {
        return peeked_;
    }
    peeked_ = scan_token();
    has_peeked_ = true;
    return peeked_;
}
35+
36+
void skip() {
37+
if (has_peeked_) {
38+
has_peeked_ = false;
39+
} else {
40+
scan_token();
41+
}
42+
}
43+
44+
// Expose end of input for remaining-input calculation
45+
const char* input_end() const { return end_; }
46+
47+
private:
48+
const char* start_ = nullptr;
49+
const char* cursor_ = nullptr;
50+
const char* end_ = nullptr;
51+
Token peeked_;
52+
bool has_peeked_ = false;
53+
54+
uint32_t offset() const {
55+
return static_cast<uint32_t>(cursor_ - start_);
56+
}
57+
58+
// Byte under the cursor, or '\0' once the cursor has reached the end of input.
char current() const {
    if (cursor_ >= end_) {
        return '\0';
    }
    return *cursor_;
}
59+
char advance() {
60+
char c = current();
61+
if (cursor_ < end_) ++cursor_;
62+
return c;
63+
}
64+
char peek_char(size_t ahead = 0) const {
65+
const char* p = cursor_ + ahead;
66+
return (p < end_) ? *p : '\0';
67+
}
68+
69+
void skip_whitespace_and_comments() {
70+
while (cursor_ < end_) {
71+
char c = *cursor_;
72+
73+
// Whitespace
74+
if (c == ' ' || c == '\t' || c == '\n' || c == '\r') {
75+
++cursor_;
76+
continue;
77+
}
78+
79+
// -- line comment (MySQL requires space after --, PgSQL doesn't but we handle both)
80+
if (c == '-' && peek_char(1) == '-') {
81+
cursor_ += 2;
82+
while (cursor_ < end_ && *cursor_ != '\n') ++cursor_;
83+
continue;
84+
}
85+
86+
// # line comment (MySQL only)
87+
if constexpr (D == Dialect::MySQL) {
88+
if (c == '#') {
89+
++cursor_;
90+
while (cursor_ < end_ && *cursor_ != '\n') ++cursor_;
91+
continue;
92+
}
93+
}
94+
95+
// /* block comment */
96+
if (c == '/' && peek_char(1) == '*') {
97+
cursor_ += 2;
98+
if constexpr (D == Dialect::PostgreSQL) {
99+
// PostgreSQL supports nested block comments
100+
int depth = 1;
101+
while (cursor_ < end_ && depth > 0) {
102+
if (*cursor_ == '/' && peek_char(1) == '*') {
103+
++depth;
104+
cursor_ += 2;
105+
} else if (*cursor_ == '*' && peek_char(1) == '/') {
106+
--depth;
107+
cursor_ += 2;
108+
} else {
109+
++cursor_;
110+
}
111+
}
112+
} else {
113+
// MySQL: no nesting
114+
while (cursor_ < end_) {
115+
if (*cursor_ == '*' && peek_char(1) == '/') {
116+
cursor_ += 2;
117+
break;
118+
}
119+
++cursor_;
120+
}
121+
}
122+
continue;
123+
}
124+
125+
break; // not whitespace or comment
126+
}
127+
}
128+
129+
// Build a Token of the given type whose text is the `len` bytes at `start`.
// The third initializer is the distance of `start` from the beginning of the
// input, narrowed to 32 bits.
Token make_token(TokenType type, const char* start, uint32_t len) {
    const uint32_t token_offset = static_cast<uint32_t>(start - start_);
    return Token{type, StringRef{start, len}, token_offset};
}
133+
134+
// Scan an unquoted word starting at the cursor and classify it through the
// dialect's keyword table. The token type is whatever the table's lookup()
// returns for the scanned text.
//
// Continuation characters are ASCII letters, digits, '_' and '$'. Both MySQL
// and PostgreSQL allow '$' inside an unquoted identifier, so "foo$bar" is a
// single word rather than "foo" followed by a stray '$'. The caller decides
// which characters may start a word; '$' is not one of them.
Token scan_identifier_or_keyword() {
    const char* const start = cursor_;

    const auto is_word_char = [](char ch) {
        return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
               (ch >= '0' && ch <= '9') || ch == '_' || ch == '$';
    };
    while (cursor_ < end_ && is_word_char(*cursor_)) {
        ++cursor_;
    }
    const uint32_t len = static_cast<uint32_t>(cursor_ - start);

    // Keyword lookup against the table for the active dialect.
    TokenType kw;
    if constexpr (D == Dialect::MySQL) {
        kw = mysql_keywords::lookup(start, len);
    } else {
        kw = pgsql_keywords::lookup(start, len);
    }
    return make_token(kw, start, len);
}
156+
157+
// Scan a numeric literal starting at the cursor: digits with at most one '.',
// optionally followed by an exponent ("e" or "E", optional sign, one or more
// digits). The cursor may start on a digit or on the '.' of a ".5"-style
// literal.
//
// Returns TK_FLOAT when the literal contains a '.' or an exponent, otherwise
// TK_INTEGER. An 'e' that is not followed by a valid exponent is left
// unconsumed so that the next token starts there.
Token scan_number() {
    const char* const start = cursor_;
    const auto is_digit = [](char ch) { return ch >= '0' && ch <= '9'; };

    // Mantissa: digits with a single optional decimal point.
    bool has_dot = false;
    while (cursor_ < end_) {
        const char c = *cursor_;
        if (is_digit(c)) {
            ++cursor_;
        } else if (c == '.' && !has_dot) {
            has_dot = true;
            ++cursor_;
        } else {
            break;
        }
    }

    // Exponent: only committed once at least one exponent digit is seen, so
    // input such as "1else" still yields the integer 1.
    bool has_exponent = false;
    if (cursor_ < end_ && (*cursor_ == 'e' || *cursor_ == 'E')) {
        const char* p = cursor_ + 1;
        if (p < end_ && (*p == '+' || *p == '-')) ++p;
        if (p < end_ && is_digit(*p)) {
            while (p < end_ && is_digit(*p)) ++p;
            cursor_ = p;
            has_exponent = true;
        }
    }

    const uint32_t len = static_cast<uint32_t>(cursor_ - start);
    const TokenType type = (has_dot || has_exponent) ? TokenType::TK_FLOAT
                                                     : TokenType::TK_INTEGER;
    return make_token(type, start, len);
}
175+
176+
// Scan a single-quoted string literal; the cursor must be on the opening
// quote. Returns a TK_STRING token whose text is the raw bytes between the
// quotes: escape sequences are skipped over but not decoded.
//
// Two escape forms are recognised:
//   ''   a doubled quote stands for one quote character and does not end
//        the literal
//   \x   a backslash protects the following byte
// An unterminated literal runs to the end of input.
//
// NOTE(review): backslash escapes are applied in both dialects, as before.
// PostgreSQL only honours them in E'...' strings by default; confirm whether
// the PostgreSQL path should treat backslash literally.
Token scan_single_quoted_string() {
    ++cursor_;  // skip opening quote
    const char* const content_start = cursor_;

    while (cursor_ < end_) {
        const char c = *cursor_;
        if (c == '\\') {
            ++cursor_;                      // the backslash itself
            if (cursor_ < end_) ++cursor_;  // the escaped byte
        } else if (c == '\'') {
            if (peek_char(1) != '\'') break;  // genuine closing quote
            cursor_ += 2;                     // '' -> embedded quote
        } else {
            ++cursor_;
        }
    }

    const uint32_t len = static_cast<uint32_t>(cursor_ - content_start);
    if (cursor_ < end_) ++cursor_;  // skip closing quote
    return make_token(TokenType::TK_STRING, content_start, len);
}
191+
192+
// MySQL: backtick-quoted identifier
193+
Token scan_backtick_identifier() {
194+
++cursor_; // skip opening backtick
195+
const char* content_start = cursor_;
196+
while (cursor_ < end_ && *cursor_ != '`') ++cursor_;
197+
uint32_t len = static_cast<uint32_t>(cursor_ - content_start);
198+
if (cursor_ < end_) ++cursor_; // skip closing backtick
199+
return make_token(TokenType::TK_IDENTIFIER, content_start, len);
200+
}
201+
202+
// PostgreSQL: double-quoted identifier
203+
Token scan_double_quoted_identifier() {
204+
++cursor_; // skip opening quote
205+
const char* content_start = cursor_;
206+
while (cursor_ < end_ && *cursor_ != '"') ++cursor_;
207+
uint32_t len = static_cast<uint32_t>(cursor_ - content_start);
208+
if (cursor_ < end_) ++cursor_; // skip closing quote
209+
return make_token(TokenType::TK_IDENTIFIER, content_start, len);
210+
}
211+
212+
// PostgreSQL: $$...$$ dollar-quoted string
213+
Token scan_dollar_string() {
214+
// We're at the first $. Simple form: $$content$$
215+
cursor_ += 2; // skip opening $$
216+
const char* content_start = cursor_;
217+
while (cursor_ < end_) {
218+
if (*cursor_ == '$' && peek_char(1) == '$') {
219+
uint32_t len = static_cast<uint32_t>(cursor_ - content_start);
220+
cursor_ += 2; // skip closing $$
221+
return make_token(TokenType::TK_STRING, content_start, len);
222+
}
223+
++cursor_;
224+
}
225+
// Unterminated — return what we have
226+
uint32_t len = static_cast<uint32_t>(cursor_ - content_start);
227+
return make_token(TokenType::TK_STRING, content_start, len);
228+
}
229+
230+
// Scan and return the next token from the input, after skipping leading
// whitespace and comments. Returns a zero-length TK_EOF token at end of
// input and a one-byte TK_ERROR token for any byte that starts no known
// token.
//
// Checks run in a fixed order: words, numbers, strings and quoted
// identifiers, variable/placeholder sigils, two-character operators, and
// finally single-character punctuation. The order matters, because the
// single-character switch is the fallback for everything above it.
Token scan_token() {
    skip_whitespace_and_comments();

    if (cursor_ >= end_) {
        return make_token(TokenType::TK_EOF, cursor_, 0);
    }

    const char c = *cursor_;
    const char c2 = peek_char(1);  // '\0' when c is the last byte of input

    const auto is_digit = [](char ch) { return ch >= '0' && ch <= '9'; };

    // Emit a fixed-length token starting at the cursor and step past it.
    const auto emit = [this](TokenType type, uint32_t len) {
        const char* const at = cursor_;
        cursor_ += len;
        return make_token(type, at, len);
    };

    // Identifiers and keywords
    if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_') {
        return scan_identifier_or_keyword();
    }

    // Numbers, including the ".123" form; a lone '.' falls through to TK_DOT.
    if (is_digit(c) || (c == '.' && is_digit(c2))) {
        return scan_number();
    }

    // String literals
    if (c == '\'') return scan_single_quoted_string();

    // MySQL: double-quoted strings; PostgreSQL: double-quoted identifiers
    if (c == '"') {
        if constexpr (D == Dialect::MySQL) {
            // In MySQL, double quotes are strings (unless ANSI_QUOTES mode).
            // Token text is the raw bytes between the quotes. A backslash
            // protects the next byte, and a doubled "" stands for one quote
            // character instead of ending the literal.
            ++cursor_;  // skip opening quote
            const char* const content_start = cursor_;
            while (cursor_ < end_) {
                const char ch = *cursor_;
                if (ch == '\\') {
                    ++cursor_;
                    if (cursor_ < end_) ++cursor_;
                } else if (ch == '"') {
                    if (peek_char(1) != '"') break;  // genuine closing quote
                    cursor_ += 2;                    // "" -> embedded quote
                } else {
                    ++cursor_;
                }
            }
            const uint32_t len = static_cast<uint32_t>(cursor_ - content_start);
            if (cursor_ < end_) ++cursor_;  // skip closing quote
            return make_token(TokenType::TK_STRING, content_start, len);
        } else {
            return scan_double_quoted_identifier();
        }
    }

    // Backtick identifier (MySQL only)
    if constexpr (D == Dialect::MySQL) {
        if (c == '`') return scan_backtick_identifier();
    }

    // @ and @@
    if (c == '@') {
        return (c2 == '@') ? emit(TokenType::TK_DOUBLE_AT, 2)
                           : emit(TokenType::TK_AT, 1);
    }

    // $ — PostgreSQL: $N placeholder or $$string$$. Any other '$' falls
    // through to the single-character switch.
    if constexpr (D == Dialect::PostgreSQL) {
        if (c == '$') {
            if (c2 == '$') {
                return scan_dollar_string();
            }
            if (is_digit(c2)) {
                const char* const start = cursor_;
                ++cursor_;  // skip $
                while (cursor_ < end_ && is_digit(*cursor_)) ++cursor_;
                const uint32_t len = static_cast<uint32_t>(cursor_ - start);
                return make_token(TokenType::TK_DOLLAR_NUM, start, len);
            }
        }
    }

    // Two-character operators. c2 is '\0' at end of input, which matches
    // none of these, so no separate bounds check is needed.
    if (c == '<' && c2 == '=') return emit(TokenType::TK_LESS_EQUAL, 2);
    if (c == '>' && c2 == '=') return emit(TokenType::TK_GREATER_EQUAL, 2);
    if (c == '!' && c2 == '=') return emit(TokenType::TK_NOT_EQUAL, 2);
    if (c == '<' && c2 == '>') return emit(TokenType::TK_NOT_EQUAL, 2);
    if (c == '|' && c2 == '|') return emit(TokenType::TK_DOUBLE_PIPE, 2);

    if constexpr (D == Dialect::MySQL) {
        if (c == ':' && c2 == '=') return emit(TokenType::TK_COLON_EQUAL, 2);
    }

    if constexpr (D == Dialect::PostgreSQL) {
        if (c == ':' && c2 == ':') return emit(TokenType::TK_DOUBLE_COLON, 2);
    }

    // Single-character operators/punctuation
    switch (c) {
        case '(': return emit(TokenType::TK_LPAREN, 1);
        case ')': return emit(TokenType::TK_RPAREN, 1);
        case ',': return emit(TokenType::TK_COMMA, 1);
        case ';': return emit(TokenType::TK_SEMICOLON, 1);
        case '.': return emit(TokenType::TK_DOT, 1);
        case '*': return emit(TokenType::TK_ASTERISK, 1);
        case '+': return emit(TokenType::TK_PLUS, 1);
        case '-': return emit(TokenType::TK_MINUS, 1);
        case '/': return emit(TokenType::TK_SLASH, 1);
        case '%': return emit(TokenType::TK_PERCENT, 1);
        case '=': return emit(TokenType::TK_EQUAL, 1);
        case '<': return emit(TokenType::TK_LESS, 1);
        case '>': return emit(TokenType::TK_GREATER, 1);
        case '&': return emit(TokenType::TK_AMPERSAND, 1);
        case '|': return emit(TokenType::TK_PIPE, 1);
        case '^': return emit(TokenType::TK_CARET, 1);
        case '~': return emit(TokenType::TK_TILDE, 1);
        case '!': return emit(TokenType::TK_EXCLAIM, 1);
        case ':': return emit(TokenType::TK_COLON, 1);
        case '?': return emit(TokenType::TK_QUESTION, 1);
        case '#': return emit(TokenType::TK_HASH, 1);
        default:  return emit(TokenType::TK_ERROR, 1);
    }
}
357+
};
358+
359+
} // namespace sql_parser
360+
361+
#endif // SQL_PARSER_TOKENIZER_H

0 commit comments

Comments
 (0)