From 2c2a1efd47cecaeccf8e3283e63a28296bcaecd9 Mon Sep 17 00:00:00 2001 From: phoneee Date: Sun, 29 Mar 2026 18:09:29 +0700 Subject: [PATCH] fix: catch TypeError instead of ValueError in sent_tokenize --- pythainlp/tokenize/core.py | 2 +- tests/core/test_tokenize.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index ff133c68b..64d716c0a 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -488,7 +488,7 @@ def sent_tokenize( if isinstance(text, list): try: original_text = "".join(text) - except ValueError: + except TypeError: return [] else: original_text = str(text) diff --git a/tests/core/test_tokenize.py b/tests/core/test_tokenize.py index 0b5befd3a..af49a1e3a 100644 --- a/tests/core/test_tokenize.py +++ b/tests/core/test_tokenize.py @@ -333,6 +333,16 @@ def test_sent_tokenize(self): ) with self.assertRaises(ValueError): sent_tokenize("ฉันไป กิน", engine="XX") # engine does not exist + # Reproduce: list with non-string items should return [] + # instead of raising TypeError (str.join raises TypeError, not ValueError) + self.assertEqual( + sent_tokenize(["สวัสดี", 123], engine="whitespace+newline"), + [], + ) + self.assertEqual( + sent_tokenize(["สวัสดี", None], engine="whitespace+newline"), + [], + ) def test_subword_tokenize(self): self.assertEqual(subword_tokenize(None), []) # type: ignore[arg-type]