From 11fa70559a1bc340ca30b9b961677d46da60d81e Mon Sep 17 00:00:00 2001
From: Kirthisai <kirthisai251@gmail.com>
Date: Sat, 11 Apr 2026 21:22:32 +0530
Subject: [PATCH 1/4] fix: use runes instead of codeUnits in AhoCorasick.search

---
 lib/src/aho_corasick.dart | 81 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 77 insertions(+), 4 deletions(-)
diff --git a/lib/src/aho_corasick.dart b/lib/src/aho_corasick.dart
index 9294d6a..f5855c6 100644
--- a/lib/src/aho_corasick.dart
+++ b/lib/src/aho_corasick.dart
@@ -1,12 +1,56 @@
+/// A single node in the Aho-Corasick trie.
+///
+/// Each node represents a prefix of one or more patterns that have been
+/// inserted via [AhoCorasick.addWord].
 class TrieNode {
+  /// Maps a Unicode code point to the child [TrieNode] for that character.
   final Map<int, TrieNode> children = {};
+
+  /// Failure (fallback) link — points to the node representing the longest
+  /// proper suffix of the current path that is also a valid prefix in the trie.
+  ///
+  /// Set on all non-root nodes by [AhoCorasick.buildFailureLinks].
   TrieNode? fail;
+
+  /// Patterns that terminate at this node.
+  ///
+  /// After [AhoCorasick.buildFailureLinks] is called, this list also includes
+  /// patterns inherited from nodes reachable via [fail] links (the "dictionary
+  /// suffix links" of the classic algorithm).
   final List<String> outputs = [];
 }
 
+/// An implementation of the Aho-Corasick multi-pattern string search algorithm.
+///
+/// Aho-Corasick finds all occurrences of a set of patterns in a text in a
+/// single linear pass — O(n + m + z), where n is the text length, m is the
+/// total length of all patterns, and z is the number of matches. This makes
+/// it well-suited for profanity filtering with large word lists.
+///
+/// ## Usage
+///
+/// ```dart
+/// final ac = AhoCorasick();
+/// ac.addWord('bad');
+/// ac.addWord('worse');
+/// ac.buildFailureLinks(); // must be called before search
+///
+/// final matches = ac.search('this is bad and worse');
+/// // {10: ['bad'], 20: ['worse']}
+/// ```
+///
+/// **Important:** always call [buildFailureLinks] after adding all words and
+/// before calling [search]. Omitting this step produces incorrect results.
 class AhoCorasick {
   final TrieNode _root = TrieNode();
 
+  /// Inserts [word] into the trie.
+  ///
+  /// The word is lowercased before insertion, mirroring the lowercasing that
+  /// [search] applies to its input. Empty strings are silently ignored.
+  ///
+  /// Call this for every pattern you want to detect, then call
+  /// [buildFailureLinks] once before any calls to [search].
   void addWord(String word) {
     if (word.isEmpty) return;
     TrieNode current = _root;
@@ -17,6 +61,15 @@ class AhoCorasick {
     current.outputs.add(word.toLowerCase());
   }
 
+  /// Constructs failure links for all nodes in the trie using a BFS traversal.
+  ///
+  /// This is the preprocessing phase of the Aho-Corasick algorithm. It must
+  /// be called **once**, after all words have been added via [addWord] and
+  /// before any calls to [search].
+  ///
+  /// Failure links allow the search to fall back to the longest matching
+  /// suffix instead of restarting from the root on a mismatch, which keeps
+  /// the search complexity linear in the length of the input.
   void buildFailureLinks() {
     final queue = <TrieNode>[];
 
@@ -49,14 +102,34 @@ class AhoCorasick {
     }
   }
 
-  /// Finds all matches in the text.
-  /// Returns a map where the key is the string index where the match ENDS
-  /// and the value is a list of matching words.
+  /// Searches [text] for all patterns previously added via [addWord].
+  ///
+  /// Returns a [Map] where each key is the **zero-based end index** (inclusive)
+  /// of a match, counted in runes (Unicode code points) of the lowercased
+  /// [text], and the value is the list of patterns that end at that position.
+  ///
+  /// The search is case-insensitive — [text] is lowercased internally before
+  /// matching.
+  ///
+  /// [buildFailureLinks] must have been called before invoking this method.
+  ///
+  /// ```dart
+  /// final ac = AhoCorasick()
+  ///   ..addWord('he')
+  ///   ..addWord('she')
+  ///   ..addWord('hers')
+  ///   ..buildFailureLinks();
+  ///
+  /// final result = ac.search('ushers');
+  /// // 'she' and 'he' both end at index 3; 'hers' ends at index 5.
+  /// ```
+  ///
+  /// Returns an empty map if no patterns match.
   Map<int, List<String>> search(String text) {
     final matches = <int, List<String>>{};
     TrieNode? current = _root;
     final textLower = text.toLowerCase();
-    final units = textLower.codeUnits;
+    final units = textLower.runes.toList();
 
     for (int i = 0; i < units.length; i++) {
       final rune = units[i];

From 60b09eee96b334fb50ea7c03d2c8d36523233aa9 Mon Sep 17 00:00:00 2001
From: Kirthisai <kirthisai251@gmail.com>
Date: Mon, 13 Apr 2026 22:54:23 +0530
Subject: [PATCH 2/4] fix: Unicode support and emoji filtering in Aho-Corasick

- Updated SafeTextFilter to use runes consistently for searching and filtering.
- Optimized rune conversions to maintain Aho-Corasick performance.
- Added unit test for emoji filtering.
---
 lib/src/safe_text_filter.dart   | 112 +++++++++++++++++++++-----------
 test/profanity_filter_test.dart |  10 +++
 2 files changed, 84 insertions(+), 38 deletions(-)

diff --git a/lib/src/safe_text_filter.dart b/lib/src/safe_text_filter.dart
index f97f6c2..d5e8347 100644
--- a/lib/src/safe_text_filter.dart
+++ b/lib/src/safe_text_filter.dart
@@ -101,14 +101,19 @@ class SafeTextFilter {
   /// Normalizes text by replacing leet-speak with standard alphabets.
   static String normalizeText(String text) {
     if (text.isEmpty) return text;
-    final units = List<int>.from(text.toLowerCase().codeUnits);
-    for (int i = 0; i < units.length; i++) {
-      final replacement = _normalizationMap[units[i]];
+    final runes = _normalizeToRunes(text);
+    return String.fromCharCodes(runes);
+  }
+
+  static List<int> _normalizeToRunes(String text) {
+    final runes = text.toLowerCase().runes.toList();
+    for (int i = 0; i < runes.length; i++) {
+      final replacement = _normalizationMap[runes[i]];
       if (replacement != null) {
-        units[i] = replacement;
+        runes[i] = replacement;
       }
     }
-    return String.fromCharCodes(units);
+    return runes;
   }
 
   /// Static method to check if a string contains any bad words.
@@ -120,23 +125,26 @@ class SafeTextFilter {
   }) async {
     if (text.isEmpty) return false;
 
-    final normalized = normalizeText(text);
+    final normalizedRunes = _normalizeToRunes(text);
 
     // Optimized sync check if initialized
     if (_isInitialized && useDefaultWords) {
+      final normalized = String.fromCharCodes(normalizedRunes);
       final matches = _trie!.search(normalized);
       for (final entry in matches.entries) {
         final endIndex = entry.key;
         for (final word in entry.value) {
           if (excludedWords != null && excludedWords.contains(word)) continue;
+          final wordRuneLength = word.runes.length;
           if (_isWordBoundary(
-              normalized, endIndex - word.length + 1, endIndex + 1)) {
+              normalizedRunes, endIndex - wordRuneLength + 1, endIndex + 1)) {
             return true;
           }
         }
       }
     } else if (useDefaultWords) {
       // Fallback or legacy path
+      final normalized = String.fromCharCodes(normalizedRunes);
       for (final word in badWords) {
         if (excludedWords != null && excludedWords.contains(word)) continue;
         if (_hasMatch(normalized, word)) return true;
@@ -144,6 +152,7 @@ class SafeTextFilter {
     }
 
     if (extraWords != null) {
+      final normalized = String.fromCharCodes(normalizedRunes);
       for (final word in extraWords) {
         if (excludedWords != null && excludedWords.contains(word)) continue;
         if (_hasMatch(normalized, word)) return true;
@@ -155,12 +164,23 @@ class SafeTextFilter {
 
   static bool _hasMatch(String normalizedText, String word) {
     final wordLower = word.toLowerCase();
-    int index = normalizedText.indexOf(wordLower);
-    while (index != -1) {
-      if (_isWordBoundary(normalizedText, index, index + wordLower.length)) {
+    final normalizedRunes = normalizedText.runes.toList();
+    final wordRunes = wordLower.runes.toList();
+
+    // Naive scan: compare the word's runes at every possible start offset.
+    for (int i = 0; i <= normalizedRunes.length - wordRunes.length; i++) {
+      bool match = true;
+      for (int j = 0; j < wordRunes.length; j++) {
+        if (normalizedRunes[i + j] != wordRunes[j]) {
+          match = false;
+          break;
+        }
+      }
+
+      if (match &&
+          _isWordBoundary(normalizedRunes, i, i + wordRunes.length)) {
         return true;
       }
-      index = normalizedText.indexOf(wordLower, index + 1);
     }
     return false;
   }
@@ -202,17 +222,20 @@ class SafeTextFilter {
 
     if (text.isEmpty) return text;
 
-    final normalizedText = normalizeText(text);
+    final textRunes = text.runes.toList();
+    final normalizedRunes = _normalizeToRunes(text);
     final List<_Range> matchRanges = [];
 
     // Step 1: Collect match ranges
     if (_isInitialized && useDefaultWords) {
-      final trieMatches = _trie!.search(normalizedText);
+      final normalized = String.fromCharCodes(normalizedRunes);
+      final trieMatches = _trie!.search(normalized);
       trieMatches.forEach((endIndex, words) {
         for (final word in words) {
           if (excludedWords != null && excludedWords.contains(word)) continue;
-          final startIndex = endIndex - word.length + 1;
-          if (_isWordBoundary(normalizedText, startIndex, endIndex + 1)) {
+          final wordRuneLength = word.runes.length;
+          final startIndex = endIndex - wordRuneLength + 1;
+          if (_isWordBoundary(normalizedRunes, startIndex, endIndex + 1)) {
             matchRanges.add(_Range(startIndex, endIndex + 1));
           }
         }
@@ -220,14 +243,14 @@ class SafeTextFilter {
     } else if (useDefaultWords) {
       for (final word in badWords) {
         if (excludedWords != null && excludedWords.contains(word)) continue;
-        _addMatchesForWord(normalizedText, word, matchRanges);
+        _addMatchesForWord(normalizedRunes, word, matchRanges);
       }
     }
 
     if (extraWords != null) {
       for (final word in extraWords) {
         if (excludedWords != null && excludedWords.contains(word)) continue;
-        _addMatchesForWord(normalizedText, word, matchRanges);
+        _addMatchesForWord(normalizedRunes, word, matchRanges);
       }
     }
 
@@ -256,7 +279,8 @@ class SafeTextFilter {
     int lastAppended = 0;
 
     for (final range in merged) {
-      buffer.write(text.substring(lastAppended, range.start));
+      buffer.write(
+          String.fromCharCodes(textRunes.sublist(lastAppended, range.start)));
 
       final matchLength = range.end - range.start;
       switch (maskStrategy) {
@@ -273,9 +297,11 @@ class SafeTextFilter {
             final showLast = matchLength >= 4;
             final maskCount = matchLength - 1 - (showLast ? 1 : 0);
             buffer
-              ..write(text[range.start])
+              ..write(String.fromCharCode(textRunes[range.start]))
               ..write(obscureSymbol * maskCount);
-            if (showLast) buffer.write(text[range.end - 1]);
+            if (showLast) {
+              buffer.write(String.fromCharCode(textRunes[range.end - 1]));
+            }
           }
 
         // Custom: replace entire word with the replacement string
@@ -285,40 +311,50 @@ class SafeTextFilter {
       lastAppended = range.end;
     }
 
-    if (lastAppended < text.length) {
-      buffer.write(text.substring(lastAppended));
+    if (lastAppended < textRunes.length) {
+      buffer.write(String.fromCharCodes(textRunes.sublist(lastAppended)));
     }
 
     return buffer.toString();
   }
 
   static void _addMatchesForWord(
-      String normalizedText, String word, List<_Range> matches) {
-    final wordLower = word.toLowerCase();
-    int index = normalizedText.indexOf(wordLower);
-    while (index != -1) {
-      final endIndex = index + wordLower.length;
-      if (_isWordBoundary(normalizedText, index, endIndex)) {
-        matches.add(_Range(index, endIndex));
+      List<int> normalizedRunes, String word, List<_Range> matches) {
+    final wordRunes = word.toLowerCase().runes.toList();
+    if (wordRunes.isEmpty) return;
+
+    for (int i = 0; i <= normalizedRunes.length - wordRunes.length; i++) {
+      bool match = true;
+      for (int j = 0; j < wordRunes.length; j++) {
+        if (normalizedRunes[i + j] != wordRunes[j]) {
+          match = false;
+          break;
+        }
+      }
+
+      if (match) {
+        final endIndex = i + wordRunes.length;
+        if (_isWordBoundary(normalizedRunes, i, endIndex)) {
+          matches.add(_Range(i, endIndex));
+        }
       }
-      index = normalizedText.indexOf(wordLower, index + 1);
     }
   }
 
-  static bool _isWordBoundary(String text, int start, int end) {
+  static bool _isWordBoundary(List<int> runes, int start, int end) {
     if (start > 0) {
-      final charCode = text.codeUnitAt(start - 1);
-      if (_isAlphanumeric(charCode)) return false;
+      final rune = runes[start - 1];
+      if (_isAlphanumeric(rune)) return false;
     }
-    if (end < text.length) {
-      final charCode = text.codeUnitAt(end);
-      if (_isAlphanumeric(charCode)) return false;
+    if (end < runes.length) {
+      final rune = runes[end];
+      if (_isAlphanumeric(rune)) return false;
     }
     return true;
   }
 
-  static bool _isAlphanumeric(int charCode) {
-    return _unicodeLetterOrDigit.hasMatch(String.fromCharCode(charCode));
+  static bool _isAlphanumeric(int rune) {
+    return _unicodeLetterOrDigit.hasMatch(String.fromCharCode(rune));
   }
 }
 
diff --git a/test/profanity_filter_test.dart b/test/profanity_filter_test.dart
index f710286..7913d3b 100644
--- a/test/profanity_filter_test.dart
+++ b/test/profanity_filter_test.dart
@@ -230,6 +230,16 @@ void main() {
       expect(filtered, contains("******"));
       expect(filtered, contains("*****"));
     });
+
+    test("filters foul emojis and characters outside BMP correctly", () async {
+      await SafeTextFilter.init(language: Language.english);
+
+      // 🖕 and 💩 are in en.txt
+      final text = "You are a 💩 and a 🖕";
+      final filtered = SafeTextFilter.filterText(text: text);
+
+      expect(filtered, "You are a * and a *");
+    });
   });
 
   group("Performance Benchmark (Aho-Corasick vs Legacy Loop)", () {

From 02bed1e69bc8aca75a437870effbb0b55351ede0 Mon Sep 17 00:00:00 2001
From: Kirthisai <kirthisai251@gmail.com>
Date: Tue, 14 Apr 2026 23:15:18 +0530
Subject: [PATCH 3/4] fix: formatting and added Aho-Corasick Unicode tests

---
 lib/src/safe_text_filter.dart |  3 +-
 test/aho_corasick_test.dart   | 78 +++++++++++++++++++++++++++++++++++
 2 files changed, 79 insertions(+), 2 deletions(-)
 create mode 100644 test/aho_corasick_test.dart

diff --git a/lib/src/safe_text_filter.dart b/lib/src/safe_text_filter.dart
index d5e8347..62680aa 100644
--- a/lib/src/safe_text_filter.dart
+++ b/lib/src/safe_text_filter.dart
@@ -177,8 +177,7 @@ class SafeTextFilter {
         }
       }
 
-      if (match &&
-          _isWordBoundary(normalizedRunes, i, i + wordRunes.length)) {
+      if (match && _isWordBoundary(normalizedRunes, i, i + wordRunes.length)) {
         return true;
       }
     }
diff --git a/test/aho_corasick_test.dart b/test/aho_corasick_test.dart
new file mode 100644
index 0000000..5d2dac2
--- /dev/null
+++ b/test/aho_corasick_test.dart
@@ -0,0 +1,78 @@
+import 'package:flutter_test/flutter_test.dart';
+import 'package:safe_text/src/aho_corasick.dart';
+
+void main() {
+  group('AhoCorasick Unicode Search', () {
+    late AhoCorasick ac;
+
+    setUp(() {
+      ac = AhoCorasick();
+    });
+
+    test('finds basic ASCII words', () {
+      ac.addWord('apple');
+      ac.addWord('banana');
+      ac.buildFailureLinks();
+
+      final matches = ac.search('I like apple and banana');
+
+      // 'apple' ends at rune index 11
+      // 'banana' ends at rune index 22
+      expect(matches[11], contains('apple'));
+      expect(matches[22], contains('banana'));
+    });
+
+    test('finds words with non-BMP characters (emojis)', () {
+      // 💩 is U+1F4A9
+      // 🖕 is U+1F595
+      ac.addWord('💩');
+      ac.addWord('🖕');
+      ac.buildFailureLinks();
+
+      final text = 'You are a 💩 and a 🖕';
+      final matches = ac.search(text);
+
+      // Rune indices:
+      // Y(0) o(1) u(2)  (3) a(4) r(5) e(6)  (7) a(8)  (9) 💩(10)  (11) a(12) n(13) d(14)  (15) a(16)  (17) 🖕(18)
+
+      expect(matches.containsKey(10), true);
+      expect(matches[10], contains('💩'));
+
+      expect(matches.containsKey(18), true);
+      expect(matches[18], contains('🖕'));
+    });
+
+    test('handles mixed emojis and text', () {
+      ac.addWord('bad💩');
+      ac.addWord('word');
+      ac.buildFailureLinks();
+
+      final text = 'This is a bad💩 word';
+      final matches = ac.search(text);
+
+      // bad💩 ends at rune index 13
+      // word ends at rune index 18
+      expect(matches[13], contains('bad💩'));
+      expect(matches[18], contains('word'));
+    });
+
+    test('case-insensitive search with Unicode', () {
+      ac.addWord('💩');
+      ac.buildFailureLinks();
+
+      // Note: most emojis don't have case, but this tests the pipeline
+      final matches = ac.search('POOP 💩');
+      expect(matches[5], contains('💩'));
+    });
+
+    test('handles overlapping matches with runes', () {
+      ac.addWord('ab');
+      ac.addWord('abc');
+      ac.buildFailureLinks();
+
+      final matches = ac.search('abcd');
+      expect(matches[1], contains('ab'));
+      expect(matches[2], contains('abc'));
+    });
+  });
+}

From 51196d76df5a847bf292e14d8de0d138fbcef1af Mon Sep 17 00:00:00 2001
From: Ronit Rameja <ronitrameja28@gmail.com>
Date: Mon, 20 Apr 2026 19:31:21 +0530
Subject: [PATCH 4/4] updated the version to 2.1.4

---
 CHANGELOG.md | 6 ++++++
 README.md    | 2 +-
 pubspec.yaml | 2 +-
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ee5cd27..8da390f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,9 @@
+## 2.1.4
+
+### Fixed
+- **Unicode rune correctness**: Replaced all `codeUnits` (UTF-16) usage with `runes` (Unicode code points) throughout `SafeTextFilter` and `AhoCorasick`. This fixes incorrect indexing and potential missed/false matches for characters outside the Basic Multilingual Plane (e.g. emoji, supplementary CJK characters).
+- Word-boundary checks, match-range offsets, and string reconstruction now operate on rune indices, ensuring accurate filtering for all Unicode input.
+
 ## 2.1.3
 
 ### Documentation
diff --git a/README.md b/README.md
index 6947f62..cf60d95 100644
--- a/README.md
+++ b/README.md
@@ -77,7 +77,7 @@ Or manually add it to your `pubspec.yaml`:
 
 ```yaml
 dependencies:
-  safe_text: ^2.1.3
+  safe_text: ^2.1.4
 ```
 
 Then run:
diff --git a/pubspec.yaml b/pubspec.yaml
index 2e20310..404495f 100644
--- a/pubspec.yaml
+++ b/pubspec.yaml
@@ -1,6 +1,6 @@
 name: safe_text
 description: A Flutter package for filtering out bad words from text inputs and detecting phone numbers in various formats including multiplier words.
-version: 2.1.3
+version: 2.1.4
 homepage: https://github.com/master-wayne7/safe_text
 repository: https://github.com/master-wayne7/safe_text
 issue_tracker: https://github.com/master-wayne7/safe_text/issues