diff --git a/CHANGELOG.md b/CHANGELOG.md index ee5cd27..8da390f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +## 2.1.4 + +### Fixed +- **Unicode rune correctness**: Replaced all `codeUnits` (UTF-16) usage with `runes` (Unicode code points) throughout `SafeTextFilter` and `AhoCorasick`. This fixes incorrect indexing and potential missed/false matches for characters outside the Basic Multilingual Plane (e.g. emoji, supplementary CJK characters). +- Word-boundary checks, match-range offsets, and string reconstruction now operate on rune indices, ensuring accurate filtering for all Unicode input. + ## 2.1.3 ### Documentation diff --git a/README.md b/README.md index 6947f62..cf60d95 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,7 @@ Or manually add it to your `pubspec.yaml`: ```yaml dependencies: - safe_text: ^2.1.3 + safe_text: ^2.1.4 ``` Then run: diff --git a/lib/src/aho_corasick.dart b/lib/src/aho_corasick.dart index ef7ca76..f5855c6 100644 --- a/lib/src/aho_corasick.dart +++ b/lib/src/aho_corasick.dart @@ -129,7 +129,7 @@ class AhoCorasick { final matches = >{}; TrieNode? current = _root; final textLower = text.toLowerCase(); - final units = textLower.codeUnits; + final units = textLower.runes.toList(); for (int i = 0; i < units.length; i++) { final rune = units[i]; diff --git a/lib/src/safe_text_filter.dart b/lib/src/safe_text_filter.dart index f97f6c2..62680aa 100644 --- a/lib/src/safe_text_filter.dart +++ b/lib/src/safe_text_filter.dart @@ -101,14 +101,19 @@ class SafeTextFilter { /// Normalizes text by replacing leet-speak with standard alphabets. static String normalizeText(String text) { if (text.isEmpty) return text; - final units = List.from(text.toLowerCase().codeUnits); - for (int i = 0; i < units.length; i++) { - final replacement = _normalizationMap[units[i]]; + final runes = _normalizeToRunes(text); + return String.fromCharCodes(runes); + } + + static List _normalizeToRunes(String text) { + final runes = text.toLowerCase().runes.toList(); + for (int i = 0; i < runes.length; i++) { + final replacement = _normalizationMap[runes[i]]; if (replacement != null) { - units[i] = replacement; + runes[i] = replacement; } } - return String.fromCharCodes(units); + return runes; } /// Static method to check if a string contains any bad words. @@ -120,23 +125,26 @@ class SafeTextFilter { }) async { if (text.isEmpty) return false; - final normalized = normalizeText(text); + final normalizedRunes = _normalizeToRunes(text); // Optimized sync check if initialized if (_isInitialized && useDefaultWords) { + final normalized = String.fromCharCodes(normalizedRunes); final matches = _trie!.search(normalized); for (final entry in matches.entries) { final endIndex = entry.key; for (final word in entry.value) { if (excludedWords != null && excludedWords.contains(word)) continue; + final wordRuneLength = word.runes.length; if (_isWordBoundary( - normalized, endIndex - word.length + 1, endIndex + 1)) { + normalizedRunes, endIndex - wordRuneLength + 1, endIndex + 1)) { return true; } } } } else if (useDefaultWords) { // Fallback or legacy path + final normalized = String.fromCharCodes(normalizedRunes); for (final word in badWords) { if (excludedWords != null && excludedWords.contains(word)) continue; if (_hasMatch(normalized, word)) return true; @@ -144,6 +152,7 @@ class SafeTextFilter { } if (extraWords != null) { + final normalized = String.fromCharCodes(normalizedRunes); for (final word in extraWords) { if (excludedWords != null && excludedWords.contains(word)) continue; if (_hasMatch(normalized, word)) return true; @@ -155,12 +164,22 @@ class SafeTextFilter { static bool _hasMatch(String normalizedText, String word) { final wordLower = word.toLowerCase(); - int index = normalizedText.indexOf(wordLower); - while (index != -1) { - if (_isWordBoundary(normalizedText, index, index + wordLower.length)) { + final normalizedRunes = normalizedText.runes.toList(); + final wordRunes = wordLower.runes.toList(); + + // Simple pattern matching on runes + for (int i = 0; i <= normalizedRunes.length - wordRunes.length; i++) { + bool match = true; + for (int j = 0; j < wordRunes.length; j++) { + if (normalizedRunes[i + j] != wordRunes[j]) { + match = false; + break; + } + } + + if (match && _isWordBoundary(normalizedRunes, i, i + wordRunes.length)) { return true; } - index = normalizedText.indexOf(wordLower, index + 1); } return false; } @@ -202,17 +221,20 @@ class SafeTextFilter { if (text.isEmpty) return text; - final normalizedText = normalizeText(text); + final textRunes = text.runes.toList(); + final normalizedRunes = _normalizeToRunes(text); final List<_Range> matchRanges = []; // Step 1: Collect match ranges if (_isInitialized && useDefaultWords) { - final trieMatches = _trie!.search(normalizedText); + final normalized = String.fromCharCodes(normalizedRunes); + final trieMatches = _trie!.search(normalized); trieMatches.forEach((endIndex, words) { for (final word in words) { if (excludedWords != null && excludedWords.contains(word)) continue; - final startIndex = endIndex - word.length + 1; - if (_isWordBoundary(normalizedText, startIndex, endIndex + 1)) { + final wordRuneLength = word.runes.length; + final startIndex = endIndex - wordRuneLength + 1; + if (_isWordBoundary(normalizedRunes, startIndex, endIndex + 1)) { matchRanges.add(_Range(startIndex, endIndex + 1)); } } @@ -220,14 +242,14 @@ class SafeTextFilter { } else if (useDefaultWords) { for (final word in badWords) { if (excludedWords != null && excludedWords.contains(word)) continue; - _addMatchesForWord(normalizedText, word, matchRanges); + _addMatchesForWord(normalizedRunes, word, matchRanges); } } if (extraWords != null) { for (final word in extraWords) { if (excludedWords != null && excludedWords.contains(word)) continue; - _addMatchesForWord(normalizedText, word, matchRanges); + _addMatchesForWord(normalizedRunes, word, matchRanges); } } @@ -256,7 +278,8 @@ class SafeTextFilter { int lastAppended = 0; for (final range in merged) { - buffer.write(text.substring(lastAppended, range.start)); + buffer.write( + String.fromCharCodes(textRunes.sublist(lastAppended, range.start))); final matchLength = range.end - range.start; switch (maskStrategy) { @@ -273,9 +296,11 @@ class SafeTextFilter { final showLast = matchLength >= 4; final maskCount = matchLength - 1 - (showLast ? 1 : 0); buffer - ..write(text[range.start]) + ..write(String.fromCharCode(textRunes[range.start])) ..write(obscureSymbol * maskCount); - if (showLast) buffer.write(text[range.end - 1]); + if (showLast) { + buffer.write(String.fromCharCode(textRunes[range.end - 1])); + } } // Custom: replace entire word with the replacement string @@ -285,40 +310,50 @@ class SafeTextFilter { lastAppended = range.end; } - if (lastAppended < text.length) { - buffer.write(text.substring(lastAppended)); + if (lastAppended < textRunes.length) { + buffer.write(String.fromCharCodes(textRunes.sublist(lastAppended))); } return buffer.toString(); } static void _addMatchesForWord( - String normalizedText, String word, List<_Range> matches) { - final wordLower = word.toLowerCase(); - int index = normalizedText.indexOf(wordLower); - while (index != -1) { - final endIndex = index + wordLower.length; - if (_isWordBoundary(normalizedText, index, endIndex)) { - matches.add(_Range(index, endIndex)); + List normalizedRunes, String word, List<_Range> matches) { + final wordRunes = word.toLowerCase().runes.toList(); + if (wordRunes.isEmpty) return; + + for (int i = 0; i <= normalizedRunes.length - wordRunes.length; i++) { + bool match = true; + for (int j = 0; j < wordRunes.length; j++) { + if (normalizedRunes[i + j] != wordRunes[j]) { + match = false; + break; + } + } + + if (match) { + final endIndex = i + wordRunes.length; + if (_isWordBoundary(normalizedRunes, i, endIndex)) { + matches.add(_Range(i, endIndex)); + } } - index = normalizedText.indexOf(wordLower, index + 1); } } - static bool _isWordBoundary(String text, int start, int end) { + static bool _isWordBoundary(List runes, int start, int end) { if (start > 0) { - final charCode = text.codeUnitAt(start - 1); - if (_isAlphanumeric(charCode)) return false; + final rune = runes[start - 1]; + if (_isAlphanumeric(rune)) return false; } - if (end < text.length) { - final charCode = text.codeUnitAt(end); - if (_isAlphanumeric(charCode)) return false; + if (end < runes.length) { + final rune = runes[end]; + if (_isAlphanumeric(rune)) return false; } return true; } - static bool _isAlphanumeric(int charCode) { - return _unicodeLetterOrDigit.hasMatch(String.fromCharCode(charCode)); + static bool _isAlphanumeric(int rune) { + return _unicodeLetterOrDigit.hasMatch(String.fromCharCode(rune)); } } diff --git a/pubspec.yaml b/pubspec.yaml index 2e20310..404495f 100644 --- a/pubspec.yaml +++ b/pubspec.yaml @@ -1,6 +1,6 @@ name: safe_text description: A Flutter package for filtering out bad words from text inputs and detecting phone numbers in various formats including multiplier words. -version: 2.1.3 +version: 2.1.4 homepage: https://github.com/master-wayne7/safe_text repository: https://github.com/master-wayne7/safe_text issue_tracker: https://github.com/master-wayne7/safe_text/issues diff --git a/test/aho_corasick_test.dart b/test/aho_corasick_test.dart new file mode 100644 index 0000000..5d2dac2 --- /dev/null +++ b/test/aho_corasick_test.dart @@ -0,0 +1,78 @@ +import 'package:flutter_test/flutter_test.dart'; +import 'package:safe_text/src/aho_corasick.dart'; + +void main() { + group('AhoCorasick Unicode Search', () { + late AhoCorasick ac; + + setUp(() { + ac = AhoCorasick(); + }); + + test('finds basic ASCII words', () { + ac.addWord('apple'); + ac.addWord('banana'); + ac.buildFailureLinks(); + + final matches = ac.search('I like apple and banana'); + + // 'apple' ends at rune index 11 + // 'banana' ends at rune index 22 + expect(matches[11], contains('apple')); + expect(matches[22], contains('banana')); + }); + + test('finds words with non-BMP characters (emojis)', () { + // 💩 is U+1F4A9 + // 🖕 is U+1F595 + ac.addWord('💩'); + ac.addWord('🖕'); + ac.buildFailureLinks(); + + final text = 'You are a 💩 and a 🖕'; + final matches = ac.search(text); + + // Rune indices: + // Y(0) o(1) u(2) (3) a(4) r(5) e(6) (7) a(8) (9) 💩(10) (11) a(12) n(13) d(14) (15) a(16) (17) 🖕(18) + + expect(matches.containsKey(10), true); + expect(matches[10], contains('💩')); + + expect(matches.containsKey(18), true); + expect(matches[18], contains('🖕')); + }); + + test('handles mixed emojis and text', () { + ac.addWord('bad💩'); + ac.addWord('word'); + ac.buildFailureLinks(); + + final text = 'This is a bad💩 word'; + final matches = ac.search(text); + + // bad💩 ends at rune index 13 + // word ends at rune index 18 + expect(matches[13], contains('bad💩')); + expect(matches[18], contains('word')); + }); + + test('case-insensitive search with Unicode', () { + ac.addWord('💩'); + ac.buildFailureLinks(); + + // Note: most emojis don't have case, but this tests the pipeline + final matches = ac.search('POOP 💩'); + expect(matches[5], contains('💩')); + }); + + test('handles overlapping matches with runes', () { + ac.addWord('ab'); + ac.addWord('abc'); + ac.buildFailureLinks(); + + final matches = ac.search('abcd'); + expect(matches[1], contains('ab')); + expect(matches[2], contains('abc')); + }); + }); +} diff --git a/test/profanity_filter_test.dart b/test/profanity_filter_test.dart index f710286..7913d3b 100644 --- a/test/profanity_filter_test.dart +++ b/test/profanity_filter_test.dart @@ -230,6 +230,16 @@ void main() { expect(filtered, contains("******")); expect(filtered, contains("*****")); }); + + test("filters foul emojis and characters outside BMP correctly", () async { + await SafeTextFilter.init(language: Language.english); + + // 🖕 and 💩 are in en.txt + final text = "You are a 💩 and a 🖕"; + final filtered = SafeTextFilter.filterText(text: text); + + expect(filtered, "You are a * and a *"); + }); }); group("Performance Benchmark (Aho-Corasick vs Legacy Loop)", () {