Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
## 2.1.4

### Fixed
- **Unicode rune correctness**: Replaced all `codeUnits` (UTF-16) usage with `runes` (Unicode code points) throughout `SafeTextFilter` and `AhoCorasick`. This fixes incorrect indexing and potential missed/false matches for characters outside the Basic Multilingual Plane (e.g. emoji, supplementary CJK characters).
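- Word-boundary checks, match-range offsets, and string reconstruction now operate on rune indices, so characters outside the Basic Multilingual Plane are indexed, matched, and masked as single units. (Multi-code-point grapheme clusters, such as ZWJ emoji sequences, are still handled one rune at a time.)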

## 2.1.3

### Documentation
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ Or manually add it to your `pubspec.yaml`:

```yaml
dependencies:
safe_text: ^2.1.3
safe_text: ^2.1.4
```

Then run:
Expand Down
2 changes: 1 addition & 1 deletion lib/src/aho_corasick.dart
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ class AhoCorasick {
final matches = <int, List<String>>{};
TrieNode? current = _root;
final textLower = text.toLowerCase();
final units = textLower.codeUnits;
final units = textLower.runes.toList();

for (int i = 0; i < units.length; i++) {
final rune = units[i];
Expand Down
111 changes: 73 additions & 38 deletions lib/src/safe_text_filter.dart
Original file line number Diff line number Diff line change
Expand Up @@ -101,14 +101,19 @@ class SafeTextFilter {
/// Normalizes text by replacing leet-speak with standard alphabets.
static String normalizeText(String text) {
if (text.isEmpty) return text;
final units = List<int>.from(text.toLowerCase().codeUnits);
for (int i = 0; i < units.length; i++) {
final replacement = _normalizationMap[units[i]];
final runes = _normalizeToRunes(text);
return String.fromCharCodes(runes);
}

static List<int> _normalizeToRunes(String text) {
final runes = text.toLowerCase().runes.toList();
for (int i = 0; i < runes.length; i++) {
final replacement = _normalizationMap[runes[i]];
if (replacement != null) {
units[i] = replacement;
runes[i] = replacement;
}
}
return String.fromCharCodes(units);
return runes;
}

/// Static method to check if a string contains any bad words.
Expand All @@ -120,30 +125,34 @@ class SafeTextFilter {
}) async {
if (text.isEmpty) return false;

final normalized = normalizeText(text);
final normalizedRunes = _normalizeToRunes(text);

// Optimized sync check if initialized
if (_isInitialized && useDefaultWords) {
final normalized = String.fromCharCodes(normalizedRunes);
final matches = _trie!.search(normalized);
for (final entry in matches.entries) {
final endIndex = entry.key;
for (final word in entry.value) {
if (excludedWords != null && excludedWords.contains(word)) continue;
final wordRuneLength = word.runes.length;
if (_isWordBoundary(
normalized, endIndex - word.length + 1, endIndex + 1)) {
normalizedRunes, endIndex - wordRuneLength + 1, endIndex + 1)) {
return true;
}
}
}
} else if (useDefaultWords) {
// Fallback or legacy path
final normalized = String.fromCharCodes(normalizedRunes);
for (final word in badWords) {
if (excludedWords != null && excludedWords.contains(word)) continue;
if (_hasMatch(normalized, word)) return true;
}
}

if (extraWords != null) {
final normalized = String.fromCharCodes(normalizedRunes);
for (final word in extraWords) {
if (excludedWords != null && excludedWords.contains(word)) continue;
if (_hasMatch(normalized, word)) return true;
Expand All @@ -155,12 +164,22 @@ class SafeTextFilter {

static bool _hasMatch(String normalizedText, String word) {
final wordLower = word.toLowerCase();
int index = normalizedText.indexOf(wordLower);
while (index != -1) {
if (_isWordBoundary(normalizedText, index, index + wordLower.length)) {
final normalizedRunes = normalizedText.runes.toList();
final wordRunes = wordLower.runes.toList();

// Simple pattern matching on runes
for (int i = 0; i <= normalizedRunes.length - wordRunes.length; i++) {
bool match = true;
for (int j = 0; j < wordRunes.length; j++) {
if (normalizedRunes[i + j] != wordRunes[j]) {
match = false;
break;
}
}

if (match && _isWordBoundary(normalizedRunes, i, i + wordRunes.length)) {
return true;
}
index = normalizedText.indexOf(wordLower, index + 1);
}
return false;
}
Expand Down Expand Up @@ -202,32 +221,35 @@ class SafeTextFilter {

if (text.isEmpty) return text;

final normalizedText = normalizeText(text);
final textRunes = text.runes.toList();
final normalizedRunes = _normalizeToRunes(text);
final List<_Range> matchRanges = [];

// Step 1: Collect match ranges
if (_isInitialized && useDefaultWords) {
final trieMatches = _trie!.search(normalizedText);
final normalized = String.fromCharCodes(normalizedRunes);
final trieMatches = _trie!.search(normalized);
trieMatches.forEach((endIndex, words) {
for (final word in words) {
if (excludedWords != null && excludedWords.contains(word)) continue;
final startIndex = endIndex - word.length + 1;
if (_isWordBoundary(normalizedText, startIndex, endIndex + 1)) {
final wordRuneLength = word.runes.length;
final startIndex = endIndex - wordRuneLength + 1;
if (_isWordBoundary(normalizedRunes, startIndex, endIndex + 1)) {
matchRanges.add(_Range(startIndex, endIndex + 1));
}
}
});
} else if (useDefaultWords) {
for (final word in badWords) {
if (excludedWords != null && excludedWords.contains(word)) continue;
_addMatchesForWord(normalizedText, word, matchRanges);
_addMatchesForWord(normalizedRunes, word, matchRanges);
}
}

if (extraWords != null) {
for (final word in extraWords) {
if (excludedWords != null && excludedWords.contains(word)) continue;
_addMatchesForWord(normalizedText, word, matchRanges);
_addMatchesForWord(normalizedRunes, word, matchRanges);
}
}

Expand Down Expand Up @@ -256,7 +278,8 @@ class SafeTextFilter {
int lastAppended = 0;

for (final range in merged) {
buffer.write(text.substring(lastAppended, range.start));
buffer.write(
String.fromCharCodes(textRunes.sublist(lastAppended, range.start)));

final matchLength = range.end - range.start;
switch (maskStrategy) {
Expand All @@ -273,9 +296,11 @@ class SafeTextFilter {
final showLast = matchLength >= 4;
final maskCount = matchLength - 1 - (showLast ? 1 : 0);
buffer
..write(text[range.start])
..write(String.fromCharCode(textRunes[range.start]))
..write(obscureSymbol * maskCount);
if (showLast) buffer.write(text[range.end - 1]);
if (showLast) {
buffer.write(String.fromCharCode(textRunes[range.end - 1]));
}
}

// Custom: replace entire word with the replacement string
Expand All @@ -285,40 +310,50 @@ class SafeTextFilter {
lastAppended = range.end;
}

if (lastAppended < text.length) {
buffer.write(text.substring(lastAppended));
if (lastAppended < textRunes.length) {
buffer.write(String.fromCharCodes(textRunes.sublist(lastAppended)));
}

return buffer.toString();
}

static void _addMatchesForWord(
String normalizedText, String word, List<_Range> matches) {
final wordLower = word.toLowerCase();
int index = normalizedText.indexOf(wordLower);
while (index != -1) {
final endIndex = index + wordLower.length;
if (_isWordBoundary(normalizedText, index, endIndex)) {
matches.add(_Range(index, endIndex));
List<int> normalizedRunes, String word, List<_Range> matches) {
final wordRunes = word.toLowerCase().runes.toList();
if (wordRunes.isEmpty) return;

for (int i = 0; i <= normalizedRunes.length - wordRunes.length; i++) {
bool match = true;
for (int j = 0; j < wordRunes.length; j++) {
if (normalizedRunes[i + j] != wordRunes[j]) {
match = false;
break;
}
}

if (match) {
final endIndex = i + wordRunes.length;
if (_isWordBoundary(normalizedRunes, i, endIndex)) {
matches.add(_Range(i, endIndex));
}
}
index = normalizedText.indexOf(wordLower, index + 1);
}
}

static bool _isWordBoundary(String text, int start, int end) {
static bool _isWordBoundary(List<int> runes, int start, int end) {
if (start > 0) {
final charCode = text.codeUnitAt(start - 1);
if (_isAlphanumeric(charCode)) return false;
final rune = runes[start - 1];
if (_isAlphanumeric(rune)) return false;
}
if (end < text.length) {
final charCode = text.codeUnitAt(end);
if (_isAlphanumeric(charCode)) return false;
if (end < runes.length) {
final rune = runes[end];
if (_isAlphanumeric(rune)) return false;
}
return true;
}

static bool _isAlphanumeric(int charCode) {
return _unicodeLetterOrDigit.hasMatch(String.fromCharCode(charCode));
static bool _isAlphanumeric(int rune) {
return _unicodeLetterOrDigit.hasMatch(String.fromCharCode(rune));
}
}

Expand Down
2 changes: 1 addition & 1 deletion pubspec.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: safe_text
description: A Flutter package for filtering out bad words from text inputs and detecting phone numbers in various formats including multiplier words.
version: 2.1.3
version: 2.1.4
homepage: https://github.com/master-wayne7/safe_text
repository: https://github.com/master-wayne7/safe_text
issue_tracker: https://github.com/master-wayne7/safe_text/issues
Expand Down
78 changes: 78 additions & 0 deletions test/aho_corasick_test.dart
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import 'package:flutter_test/flutter_test.dart';
import 'package:safe_text/src/aho_corasick.dart';

/// Unit tests for [AhoCorasick.search] with rune-based (Unicode code point)
/// indexing.
///
/// Every key asserted on the returned map is the rune index at which a
/// matched word ends, so a non-BMP character such as an emoji advances the
/// index by one rather than by its two UTF-16 code units.
void main() {
  group('AhoCorasick Unicode Search', () {
    late AhoCorasick ac;

    // A fresh automaton per test keeps dictionaries from leaking between
    // cases.
    setUp(() {
      ac = AhoCorasick();
    });

    test('finds basic ASCII words', () {
      ac.addWord('apple');
      ac.addWord('banana');
      ac.buildFailureLinks();

      final matches = ac.search('I like apple and banana');

      // 'apple' ends at rune index 11
      // 'banana' ends at rune index 22
      expect(matches[11], contains('apple'));
      expect(matches[22], contains('banana'));
    });

    test('finds words with non-BMP characters (emojis)', () {
      // 💩 is U+1F4A9
      // 🖕 is U+1F595
      ac.addWord('💩');
      ac.addWord('🖕');
      ac.buildFailureLinks();

      final text = 'You are a 💩 and a 🖕';
      final matches = ac.search(text);

      // Rune indices:
      // Y(0) o(1) u(2) (3) a(4) r(5) e(6) (7) a(8) (9) 💩(10) (11) a(12)
      // n(13) d(14) (15) a(16) (17) 🖕(18)

      expect(matches.containsKey(10), isTrue);
      expect(matches[10], contains('💩'));

      expect(matches.containsKey(18), isTrue);
      expect(matches[18], contains('🖕'));
    });

    test('handles mixed emojis and text', () {
      ac.addWord('bad💩');
      ac.addWord('word');
      ac.buildFailureLinks();

      final text = 'This is a bad💩 word';
      final matches = ac.search(text);

      // bad💩 ends at rune index 13
      // word ends at rune index 18
      expect(matches[13], contains('bad💩'));
      expect(matches[18], contains('word'));
    });

    test('case-insensitive search with Unicode', () {
      ac.addWord('poop');
      ac.addWord('💩');
      ac.buildFailureLinks();

      // The input is lower-cased before matching, so the upper-case 'POOP'
      // must hit the lower-case dictionary entry. The emoji has no case; it
      // checks that lower-casing leaves the non-BMP rune and the indices
      // after the cased word intact.
      final matches = ac.search('POOP 💩');
      expect(matches[3], contains('poop'));
      expect(matches[5], contains('💩'));
    });

    test('handles overlapping matches with runes', () {
      ac.addWord('ab');
      ac.addWord('abc');
      ac.buildFailureLinks();

      // 'ab' is a prefix of 'abc'; both must be reported, each at its own
      // end index.
      final matches = ac.search('abcd');
      expect(matches[1], contains('ab'));
      expect(matches[2], contains('abc'));
    });
  });
}
10 changes: 10 additions & 0 deletions test/profanity_filter_test.dart
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,16 @@ void main() {
expect(filtered, contains("******"));
expect(filtered, contains("*****"));
});

// Regression test for characters outside the Basic Multilingual Plane: each
// emoji in the input is a single rune but two UTF-16 code units, and the
// expected output contains exactly one mask character per emoji.
test("filters foul emojis and characters outside BMP correctly", () async {
// NOTE(review): init() is a static call, so the loaded word list is presumably
// shared with the other tests in this file — confirm none of them relies on a
// different language being active afterwards.
await SafeTextFilter.init(language: Language.english);

// 🖕 and 💩 are in en.txt
final text = "You are a 💩 and a 🖕";
// No mask options are passed, so this presumably exercises the default mask
// strategy and obscure symbol — TODO confirm against filterText's defaults.
final filtered = SafeTextFilter.filterText(text: text);

// Surrounding text must be reproduced unchanged; only the two emojis are
// replaced, one '*' each.
expect(filtered, "You are a * and a *");
});
});

group("Performance Benchmark (Aho-Corasick vs Legacy Loop)", () {
Expand Down
Loading