From 82463aeb7e0d5ba1ba113a06aa27228b07cadf6d Mon Sep 17 00:00:00 2001 From: RJ Ascani Date: Mon, 8 Jun 2026 14:23:59 -0700 Subject: [PATCH 1/2] Bump tokenizers submodule to fix sentencepiece GCC 15 build Updates extension/llm/tokenizers to include meta-pytorch/tokenizers#193, which bumps the sentencepiece submodule to pick up a missing `#include <cstdint>` (google/sentencepiece#1109). Without this, `pytorch_tokenizers` fails to compile inside the `executorch-ubuntu-26.04-gcc15` docker image, blocking the RISC-V baremetal CI (#19917). Co-Authored-By: Claude Opus 4.6 (1M context) --- extension/llm/tokenizers | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extension/llm/tokenizers b/extension/llm/tokenizers index b642403834a..3f98e9903e4 160000 --- a/extension/llm/tokenizers +++ b/extension/llm/tokenizers @@ -1 +1 @@ -Subproject commit b642403834a67c8ef14a7109dcd1bb5e5f3cb68a +Subproject commit 3f98e9903e4e9972e5371522d1b64bc7793c250b From bc3e7cddcf34e8b77e868973170029fd7a4b039a Mon Sep 17 00:00:00 2001 From: RJ Ascani Date: Tue, 9 Jun 2026 12:52:35 -0700 Subject: [PATCH 2/2] Fix Parakeet tokenizer C++20 build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The tokenizers submodule bump (meta-pytorch/tokenizers#193) changed CMAKE_CXX_STANDARD from 17 to 20. Under C++20 the u8"▁" literal is const char8_t[], which has no implicit conversion to const char* and breaks std::string::rfind. Spell the SentencePiece word-boundary marker as raw UTF-8 bytes, matching the fix already on the 1.3 release branch (#19824). 
Co-Authored-By: Claude Opus 4.6 (1M context) --- examples/models/parakeet/tokenizer_utils.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/examples/models/parakeet/tokenizer_utils.cpp b/examples/models/parakeet/tokenizer_utils.cpp index 8cebebd8b19..5513fb0ecb9 100644 --- a/examples/models/parakeet/tokenizer_utils.cpp +++ b/examples/models/parakeet/tokenizer_utils.cpp @@ -8,6 +8,10 @@ namespace { +// SentencePiece's word-boundary marker, spelled as UTF-8 bytes so this remains +// a const char[] literal when compiled as C++20. +constexpr char kSentencePieceWordBoundary[] = "\xE2\x96\x81"; + bool is_whitespace_only(const std::string& token) { if (token.empty()) { return true; @@ -36,7 +40,7 @@ bool is_special_token(const std::string& token) { if (token.rfind("##", 0) == 0) { return true; } - if (token.rfind(u8"▁", 0) == 0) { + if (token.rfind(kSentencePieceWordBoundary, 0) == 0) { return true; } if (is_whitespace_only(token)) {