diff --git a/examples/models/parakeet/tokenizer_utils.cpp b/examples/models/parakeet/tokenizer_utils.cpp index 8cebebd8b19..5513fb0ecb9 100644 --- a/examples/models/parakeet/tokenizer_utils.cpp +++ b/examples/models/parakeet/tokenizer_utils.cpp @@ -8,6 +8,10 @@ namespace { +// SentencePiece's word-boundary marker U+2581, spelled as UTF-8 bytes so this +// stays a const char[] literal in C++20, where u8"..." becomes const char8_t[]. +constexpr char kSentencePieceWordBoundary[] = "\xE2\x96\x81"; + bool is_whitespace_only(const std::string& token) { if (token.empty()) { return true; @@ -36,7 +40,7 @@ bool is_special_token(const std::string& token) { if (token.rfind("##", 0) == 0) { return true; } - if (token.rfind(u8"▁", 0) == 0) { + if (token.rfind(kSentencePieceWordBoundary, 0) == 0) { return true; } if (is_whitespace_only(token)) { diff --git a/extension/llm/tokenizers b/extension/llm/tokenizers index b642403834a..3f98e9903e4 160000 --- a/extension/llm/tokenizers +++ b/extension/llm/tokenizers @@ -1 +1 @@ -Subproject commit b642403834a67c8ef14a7109dcd1bb5e5f3cb68a +Subproject commit 3f98e9903e4e9972e5371522d1b64bc7793c250b