From 11fa70559a1bc340ca30b9b961677d46da60d81e Mon Sep 17 00:00:00 2001 From: Kirthisai Date: Sat, 11 Apr 2026 21:22:32 +0530 Subject: [PATCH 1/4] fix: use runes instead of codeUnits in AhoCorasick.search --- lib/src/aho_corasick.dart | 81 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 77 insertions(+), 4 deletions(-) diff --git a/lib/src/aho_corasick.dart b/lib/src/aho_corasick.dart index 9294d6a..f5855c6 100644 --- a/lib/src/aho_corasick.dart +++ b/lib/src/aho_corasick.dart @@ -1,12 +1,56 @@ +/// A single node in the Aho-Corasick trie. +/// +/// Each node represents a prefix of one or more patterns that have been +/// inserted via [AhoCorasick.addWord]. class TrieNode { + /// Maps a Unicode code point to the child [TrieNode] for that character. final Map children = {}; + + /// Failure (fallback) link — points to the node representing the longest + /// proper suffix of the current path that is also a valid prefix in the trie. + /// + /// Set on all non-root nodes by [AhoCorasick.buildFailureLinks]. TrieNode? fail; + + /// Patterns that terminate at this node. + /// + /// After [AhoCorasick.buildFailureLinks] is called, this list also includes + /// patterns inherited from nodes reachable via [fail] links (the "dictionary + /// suffix links" of the classic algorithm). final List outputs = []; } +/// An implementation of the Aho-Corasick multi-pattern string search algorithm. +/// +/// Aho-Corasick finds all occurrences of a set of patterns in a text in a +/// single linear pass — O(n + m + z), where n is the text length, m is the +/// total length of all patterns, and z is the number of matches. This makes +/// it well-suited for profanity filtering with large word lists. 
+/// +/// ## Usage +/// +/// ```dart +/// final ac = AhoCorasick(); +/// ac.addWord('bad'); +/// ac.addWord('worse'); +/// ac.buildFailureLinks(); // must be called before search +/// +/// final matches = ac.search('this is bad and worse'); +/// // {10: ['bad'], 20: ['worse']} +/// ``` +/// +/// **Important:** always call [buildFailureLinks] after adding all words and +/// before calling [search]. Omitting this step produces incorrect results. class AhoCorasick { final TrieNode _root = TrieNode(); + /// Inserts [word] into the trie. + /// + /// The word is lowercased before insertion so that [search] can operate on + /// pre-lowercased input. Empty strings are silently ignored. + /// + /// Call this for every pattern you want to detect, then call + /// [buildFailureLinks] once before any calls to [search]. void addWord(String word) { if (word.isEmpty) return; TrieNode current = _root; @@ -17,6 +61,15 @@ class AhoCorasick { current.outputs.add(word.toLowerCase()); } + /// Constructs failure links for all nodes in the trie using a BFS traversal. + /// + /// This is the preprocessing phase of the Aho-Corasick algorithm. It must + /// be called **once**, after all words have been added via [addWord] and + /// before any calls to [search]. + /// + /// Failure links allow the search to fall back to the longest matching + /// suffix instead of restarting from the root on a mismatch, which keeps + /// the search complexity linear in the length of the input. void buildFailureLinks() { final queue = []; @@ -49,14 +102,34 @@ class AhoCorasick { } } - /// Finds all matches in the text. - /// Returns a map where the key is the string index where the match ENDS - /// and the value is a list of matching words. + /// Searches [text] for all patterns previously added via [addWord]. 
+ /// + /// Returns a [Map] where each key is the **zero-based end index** (inclusive), + /// counted in Unicode code points (runes) of the lowercased [text] rather than UTF-16 code units, and each + /// value is the list of pattern strings that end at that position. + /// + /// The search is case-insensitive — [text] is lowercased internally before + /// matching. + /// + /// [buildFailureLinks] must have been called before invoking this method. + /// + /// ```dart + /// final ac = AhoCorasick() + /// ..addWord('he') + /// ..addWord('she') + /// ..addWord('hers') + /// ..buildFailureLinks(); + /// + /// final result = ac.search('ushers'); + /// // Keys represent end indices; values are matched words at that position. + /// ``` + /// + /// Returns an empty map if no patterns match. Map> search(String text) { final matches = >{}; TrieNode? current = _root; final textLower = text.toLowerCase(); - final units = textLower.codeUnits; + final units = textLower.runes.toList(); for (int i = 0; i < units.length; i++) { final rune = units[i]; From 60b09eee96b334fb50ea7c03d2c8d36523233aa9 Mon Sep 17 00:00:00 2001 From: Kirthisai Date: Mon, 13 Apr 2026 22:54:23 +0530 Subject: [PATCH 2/4] fix: Unicode support and emoji filtering in Aho-Corasick - Updated SafeTextFilter to use runes consistently for searching and filtering. - Optimized rune conversions to maintain Aho-Corasick performance. - Added unit test for emoji filtering. --- lib/src/safe_text_filter.dart | 112 +++++++++++++++++++++----------- test/profanity_filter_test.dart | 10 +++ 2 files changed, 84 insertions(+), 38 deletions(-) diff --git a/lib/src/safe_text_filter.dart b/lib/src/safe_text_filter.dart index f97f6c2..d5e8347 100644 --- a/lib/src/safe_text_filter.dart +++ b/lib/src/safe_text_filter.dart @@ -101,14 +101,19 @@ class SafeTextFilter { /// Normalizes text by replacing leet-speak with standard alphabets. 
static String normalizeText(String text) { if (text.isEmpty) return text; - final units = List.from(text.toLowerCase().codeUnits); - for (int i = 0; i < units.length; i++) { - final replacement = _normalizationMap[units[i]]; + final runes = _normalizeToRunes(text); + return String.fromCharCodes(runes); + } + + static List _normalizeToRunes(String text) { + final runes = text.toLowerCase().runes.toList(); + for (int i = 0; i < runes.length; i++) { + final replacement = _normalizationMap[runes[i]]; if (replacement != null) { - units[i] = replacement; + runes[i] = replacement; } } - return String.fromCharCodes(units); + return runes; } /// Static method to check if a string contains any bad words. @@ -120,23 +125,26 @@ class SafeTextFilter { }) async { if (text.isEmpty) return false; - final normalized = normalizeText(text); + final normalizedRunes = _normalizeToRunes(text); // Optimized sync check if initialized if (_isInitialized && useDefaultWords) { + final normalized = String.fromCharCodes(normalizedRunes); final matches = _trie!.search(normalized); for (final entry in matches.entries) { final endIndex = entry.key; for (final word in entry.value) { if (excludedWords != null && excludedWords.contains(word)) continue; + final wordRuneLength = word.runes.length; if (_isWordBoundary( - normalized, endIndex - word.length + 1, endIndex + 1)) { + normalizedRunes, endIndex - wordRuneLength + 1, endIndex + 1)) { return true; } } } } else if (useDefaultWords) { // Fallback or legacy path + final normalized = String.fromCharCodes(normalizedRunes); for (final word in badWords) { if (excludedWords != null && excludedWords.contains(word)) continue; if (_hasMatch(normalized, word)) return true; @@ -144,6 +152,7 @@ class SafeTextFilter { } if (extraWords != null) { + final normalized = String.fromCharCodes(normalizedRunes); for (final word in extraWords) { if (excludedWords != null && excludedWords.contains(word)) continue; if (_hasMatch(normalized, word)) return true; @@ 
-155,12 +164,23 @@ class SafeTextFilter { static bool _hasMatch(String normalizedText, String word) { final wordLower = word.toLowerCase(); - int index = normalizedText.indexOf(wordLower); - while (index != -1) { - if (_isWordBoundary(normalizedText, index, index + wordLower.length)) { + final normalizedRunes = normalizedText.runes.toList(); + final wordRunes = wordLower.runes.toList(); + + // Simple pattern matching on runes + for (int i = 0; i <= normalizedRunes.length - wordRunes.length; i++) { + bool match = true; + for (int j = 0; j < wordRunes.length; j++) { + if (normalizedRunes[i + j] != wordRunes[j]) { + match = false; + break; + } + } + + if (match && + _isWordBoundary(normalizedRunes, i, i + wordRunes.length)) { return true; } - index = normalizedText.indexOf(wordLower, index + 1); } return false; } @@ -202,17 +222,20 @@ class SafeTextFilter { if (text.isEmpty) return text; - final normalizedText = normalizeText(text); + final textRunes = text.runes.toList(); + final normalizedRunes = _normalizeToRunes(text); final List<_Range> matchRanges = []; // Step 1: Collect match ranges if (_isInitialized && useDefaultWords) { - final trieMatches = _trie!.search(normalizedText); + final normalized = String.fromCharCodes(normalizedRunes); + final trieMatches = _trie!.search(normalized); trieMatches.forEach((endIndex, words) { for (final word in words) { if (excludedWords != null && excludedWords.contains(word)) continue; - final startIndex = endIndex - word.length + 1; - if (_isWordBoundary(normalizedText, startIndex, endIndex + 1)) { + final wordRuneLength = word.runes.length; + final startIndex = endIndex - wordRuneLength + 1; + if (_isWordBoundary(normalizedRunes, startIndex, endIndex + 1)) { matchRanges.add(_Range(startIndex, endIndex + 1)); } } @@ -220,14 +243,14 @@ class SafeTextFilter { } else if (useDefaultWords) { for (final word in badWords) { if (excludedWords != null && excludedWords.contains(word)) continue; - _addMatchesForWord(normalizedText, word, 
matchRanges); + _addMatchesForWord(normalizedRunes, word, matchRanges); } } if (extraWords != null) { for (final word in extraWords) { if (excludedWords != null && excludedWords.contains(word)) continue; - _addMatchesForWord(normalizedText, word, matchRanges); + _addMatchesForWord(normalizedRunes, word, matchRanges); } } @@ -256,7 +279,8 @@ class SafeTextFilter { int lastAppended = 0; for (final range in merged) { - buffer.write(text.substring(lastAppended, range.start)); + buffer.write( + String.fromCharCodes(textRunes.sublist(lastAppended, range.start))); final matchLength = range.end - range.start; switch (maskStrategy) { @@ -273,9 +297,11 @@ class SafeTextFilter { final showLast = matchLength >= 4; final maskCount = matchLength - 1 - (showLast ? 1 : 0); buffer - ..write(text[range.start]) + ..write(String.fromCharCode(textRunes[range.start])) ..write(obscureSymbol * maskCount); - if (showLast) buffer.write(text[range.end - 1]); + if (showLast) { + buffer.write(String.fromCharCode(textRunes[range.end - 1])); + } } // Custom: replace entire word with the replacement string @@ -285,40 +311,50 @@ class SafeTextFilter { lastAppended = range.end; } - if (lastAppended < text.length) { - buffer.write(text.substring(lastAppended)); + if (lastAppended < textRunes.length) { + buffer.write(String.fromCharCodes(textRunes.sublist(lastAppended))); } return buffer.toString(); } static void _addMatchesForWord( - String normalizedText, String word, List<_Range> matches) { - final wordLower = word.toLowerCase(); - int index = normalizedText.indexOf(wordLower); - while (index != -1) { - final endIndex = index + wordLower.length; - if (_isWordBoundary(normalizedText, index, endIndex)) { - matches.add(_Range(index, endIndex)); + List normalizedRunes, String word, List<_Range> matches) { + final wordRunes = word.toLowerCase().runes.toList(); + if (wordRunes.isEmpty) return; + + for (int i = 0; i <= normalizedRunes.length - wordRunes.length; i++) { + bool match = true; + for (int j = 
0; j < wordRunes.length; j++) { + if (normalizedRunes[i + j] != wordRunes[j]) { + match = false; + break; + } + } + + if (match) { + final endIndex = i + wordRunes.length; + if (_isWordBoundary(normalizedRunes, i, endIndex)) { + matches.add(_Range(i, endIndex)); + } } - index = normalizedText.indexOf(wordLower, index + 1); } } - static bool _isWordBoundary(String text, int start, int end) { + static bool _isWordBoundary(List runes, int start, int end) { if (start > 0) { - final charCode = text.codeUnitAt(start - 1); - if (_isAlphanumeric(charCode)) return false; + final rune = runes[start - 1]; + if (_isAlphanumeric(rune)) return false; } - if (end < text.length) { - final charCode = text.codeUnitAt(end); - if (_isAlphanumeric(charCode)) return false; + if (end < runes.length) { + final rune = runes[end]; + if (_isAlphanumeric(rune)) return false; } return true; } - static bool _isAlphanumeric(int charCode) { - return _unicodeLetterOrDigit.hasMatch(String.fromCharCode(charCode)); + static bool _isAlphanumeric(int rune) { + return _unicodeLetterOrDigit.hasMatch(String.fromCharCode(rune)); } } diff --git a/test/profanity_filter_test.dart b/test/profanity_filter_test.dart index f710286..7913d3b 100644 --- a/test/profanity_filter_test.dart +++ b/test/profanity_filter_test.dart @@ -230,6 +230,16 @@ void main() { expect(filtered, contains("******")); expect(filtered, contains("*****")); }); + + test("filters foul emojis and characters outside BMP correctly", () async { + await SafeTextFilter.init(language: Language.english); + + // 🖕 and 💩 are in en.txt + final text = "You are a 💩 and a 🖕"; + final filtered = SafeTextFilter.filterText(text: text); + + expect(filtered, "You are a * and a *"); + }); }); group("Performance Benchmark (Aho-Corasick vs Legacy Loop)", () { From 02bed1e69bc8aca75a437870effbb0b55351ede0 Mon Sep 17 00:00:00 2001 From: Kirthisai Date: Tue, 14 Apr 2026 23:15:18 +0530 Subject: [PATCH 3/4] fix: formatting and added Aho-Corasick Unicode tests --- 
lib/src/safe_text_filter.dart | 3 +- test/aho_corasick_test.dart | 78 +++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 2 deletions(-) create mode 100644 test/aho_corasick_test.dart diff --git a/lib/src/safe_text_filter.dart b/lib/src/safe_text_filter.dart index d5e8347..62680aa 100644 --- a/lib/src/safe_text_filter.dart +++ b/lib/src/safe_text_filter.dart @@ -177,8 +177,7 @@ class SafeTextFilter { } } - if (match && - _isWordBoundary(normalizedRunes, i, i + wordRunes.length)) { + if (match && _isWordBoundary(normalizedRunes, i, i + wordRunes.length)) { return true; } } diff --git a/test/aho_corasick_test.dart b/test/aho_corasick_test.dart new file mode 100644 index 0000000..5d2dac2 --- /dev/null +++ b/test/aho_corasick_test.dart @@ -0,0 +1,78 @@ +import 'package:flutter_test/flutter_test.dart'; +import 'package:safe_text/src/aho_corasick.dart'; + +void main() { + group('AhoCorasick Unicode Search', () { + late AhoCorasick ac; + + setUp(() { + ac = AhoCorasick(); + }); + + test('finds basic ASCII words', () { + ac.addWord('apple'); + ac.addWord('banana'); + ac.buildFailureLinks(); + + final matches = ac.search('I like apple and banana'); + + // 'apple' ends at rune index 11 + // 'banana' ends at rune index 22 + expect(matches[11], contains('apple')); + expect(matches[22], contains('banana')); + }); + + test('finds words with non-BMP characters (emojis)', () { + // 💩 is U+1F4A9 + // 🖕 is U+1F595 + ac.addWord('💩'); + ac.addWord('🖕'); + ac.buildFailureLinks(); + + final text = 'You are a 💩 and a 🖕'; + final matches = ac.search(text); + + // Rune indices: + // Y(0) o(1) u(2) (3) a(4) r(5) e(6) (7) a(8) (9) 💩(10) (11) a(12) n(13) d(14) (15) a(16) (17) 🖕(18) + + expect(matches.containsKey(10), true); + expect(matches[10], contains('💩')); + + expect(matches.containsKey(18), true); + expect(matches[18], contains('🖕')); + }); + + test('handles mixed emojis and text', () { + ac.addWord('bad💩'); + ac.addWord('word'); + ac.buildFailureLinks(); + + final 
text = 'This is a bad💩 word'; + final matches = ac.search(text); + + // bad💩 ends at rune index 13 + // word ends at rune index 18 + expect(matches[13], contains('bad💩')); + expect(matches[18], contains('word')); + }); + + test('case-insensitive search with Unicode', () { + ac.addWord('💩'); + ac.buildFailureLinks(); + + // Note: most emojis don't have case, but this tests the pipeline + final matches = ac.search('POOP 💩'); + expect(matches[5], contains('💩')); + }); + + test('handles overlapping matches with runes', () { + ac.addWord('ab'); + ac.addWord('abc'); + ac.buildFailureLinks(); + + final matches = ac.search('abcd'); + expect(matches[1], contains('ab')); + expect(matches[2], contains('abc')); + }); + }); +} From 51196d76df5a847bf292e14d8de0d138fbcef1af Mon Sep 17 00:00:00 2001 From: Ronit Rameja Date: Mon, 20 Apr 2026 19:31:21 +0530 Subject: [PATCH 4/4] updated the version to 2.1.4 --- CHANGELOG.md | 6 ++++++ README.md | 2 +- pubspec.yaml | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ee5cd27..8da390f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +## 2.1.4 + +### Fixed +- **Unicode rune correctness**: Replaced all `codeUnits` (UTF-16) usage with `runes` (Unicode code points) throughout `SafeTextFilter` and `AhoCorasick`. This fixes incorrect indexing and potential missed/false matches for characters outside the Basic Multilingual Plane (e.g. emoji, supplementary CJK characters). +- Word-boundary checks, match-range offsets, and string reconstruction now operate on rune indices, ensuring accurate filtering for all Unicode input. 
+ ## 2.1.3 ### Documentation diff --git a/README.md b/README.md index 6947f62..cf60d95 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,7 @@ Or manually add it to your `pubspec.yaml`: ```yaml dependencies: - safe_text: ^2.1.3 + safe_text: ^2.1.4 ``` Then run: diff --git a/pubspec.yaml b/pubspec.yaml index 2e20310..404495f 100644 --- a/pubspec.yaml +++ b/pubspec.yaml @@ -1,6 +1,6 @@ name: safe_text description: A Flutter package for filtering out bad words from text inputs and detecting phone numbers in various formats including multiplier words. -version: 2.1.3 +version: 2.1.4 homepage: https://github.com/master-wayne7/safe_text repository: https://github.com/master-wayne7/safe_text issue_tracker: https://github.com/master-wayne7/safe_text/issues