Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 77 additions & 4 deletions lib/src/aho_corasick.dart
Original file line number Diff line number Diff line change
@@ -1,12 +1,56 @@
/// One vertex of the trie that backs [AhoCorasick].
///
/// A node stands for a prefix shared by one or more of the patterns
/// registered through [AhoCorasick.addWord].
class TrieNode {
  /// Patterns that end at this node.
  ///
  /// Once [AhoCorasick.buildFailureLinks] has run, the list additionally
  /// holds patterns inherited from nodes reachable through [fail] links (the
  /// "dictionary suffix links" of the classic algorithm).
  final outputs = <String>[];

  /// Fallback link: the node for the longest proper suffix of this node's
  /// path that is itself a prefix present in the trie.
  ///
  /// Assigned to every non-root node by [AhoCorasick.buildFailureLinks];
  /// `null` until then.
  TrieNode? fail;

  /// Child nodes, keyed by the Unicode code point of the connecting
  /// character.
  final children = <int, TrieNode>{};
}

/// An implementation of the Aho-Corasick multi-pattern string search algorithm.
///
/// Aho-Corasick finds all occurrences of a set of patterns in a text in a
/// single linear pass — O(n + m + z), where n is the text length, m is the
/// total length of all patterns, and z is the number of matches. This makes
/// it well-suited for profanity filtering with large word lists.
///
/// ## Usage
///
/// ```dart
/// final ac = AhoCorasick();
/// ac.addWord('bad');
/// ac.addWord('worse');
/// ac.buildFailureLinks(); // must be called before search
///
/// final matches = ac.search('this is bad and worse');
/// // {10: ['bad'], 20: ['worse']}
/// ```
///
/// **Important:** always call [buildFailureLinks] after adding all words and
/// before calling [search]. Omitting this step produces incorrect results.
class AhoCorasick {
final TrieNode _root = TrieNode();

/// Inserts [word] into the trie.
///
/// The word is lowercased before insertion so that [search] can operate on
/// pre-lowercased input. Empty strings are silently ignored.
///
/// Call this for every pattern you want to detect, then call
/// [buildFailureLinks] once before any calls to [search].
void addWord(String word) {
if (word.isEmpty) return;
TrieNode current = _root;
Expand All @@ -17,6 +61,15 @@ class AhoCorasick {
current.outputs.add(word.toLowerCase());
}

/// Constructs failure links for all nodes in the trie using a BFS traversal.
///
/// This is the preprocessing phase of the Aho-Corasick algorithm. It must
/// be called **once**, after all words have been added via [addWord] and
/// before any calls to [search].
///
/// Failure links allow the search to fall back to the longest matching
/// suffix instead of restarting from the root on a mismatch, which keeps
/// the search complexity linear in the length of the input.
void buildFailureLinks() {
final queue = <TrieNode>[];

Expand Down Expand Up @@ -49,14 +102,34 @@ class AhoCorasick {
}
}

/// Finds all matches in the text.
/// Returns a map where the key is the string index where the match ENDS
/// and the value is a list of matching words.
/// Searches [text] for all patterns previously added via [addWord].
///
/// Returns a [Map] where each key is the **zero-based end index** (inclusive)
/// of a match within the lowercased [text], counted in Unicode code points
/// (runes) rather than UTF-16 code units, and the corresponding value is the
/// list of pattern strings that end at that position.
///
/// The search is case-insensitive — [text] is lowercased internally before
/// matching.
///
/// [buildFailureLinks] must have been called before invoking this method.
///
/// ```dart
/// final ac = AhoCorasick()
/// ..addWord('he')
/// ..addWord('she')
/// ..addWord('hers')
/// ..buildFailureLinks();
///
/// final result = ac.search('ushers');
/// // Keys are end indices; values are the words ending at that position:
/// // 'she' and 'he' both end at index 3, 'hers' ends at index 5.
/// ```
///
/// Returns an empty map if no patterns match.
Map<int, List<String>> search(String text) {
final matches = <int, List<String>>{};
TrieNode? current = _root;
final textLower = text.toLowerCase();
final units = textLower.codeUnits;
final units = textLower.runes.toList();

for (int i = 0; i < units.length; i++) {
final rune = units[i];
Expand Down
111 changes: 73 additions & 38 deletions lib/src/safe_text_filter.dart
Original file line number Diff line number Diff line change
Expand Up @@ -101,14 +101,19 @@ class SafeTextFilter {
/// Normalizes [text] by lowercasing it and replacing leet-speak characters
/// with their standard alphabet equivalents.
///
/// Returns [text] unchanged when it is empty.
static String normalizeText(String text) {
  if (text.isEmpty) return text;
  return String.fromCharCodes(_normalizeToRunes(text));
}

/// Lowercases [text] and returns its Unicode code points, with every code
/// point that has an entry in [_normalizationMap] replaced by the mapped
/// value.
///
/// Operating on runes instead of UTF-16 code units keeps each code point
/// outside the Basic Multilingual Plane as a single list element, so indices
/// into the result count code points, not surrogate halves.
static List<int> _normalizeToRunes(String text) {
  final runes = text.toLowerCase().runes.toList();
  for (var i = 0; i < runes.length; i++) {
    final replacement = _normalizationMap[runes[i]];
    if (replacement != null) {
      runes[i] = replacement;
    }
  }
  return runes;
}

/// Static method to check if a string contains any bad words.
Expand All @@ -120,30 +125,34 @@ class SafeTextFilter {
}) async {
if (text.isEmpty) return false;

final normalized = normalizeText(text);
final normalizedRunes = _normalizeToRunes(text);

// Optimized sync check if initialized
if (_isInitialized && useDefaultWords) {
final normalized = String.fromCharCodes(normalizedRunes);
final matches = _trie!.search(normalized);
for (final entry in matches.entries) {
final endIndex = entry.key;
for (final word in entry.value) {
if (excludedWords != null && excludedWords.contains(word)) continue;
final wordRuneLength = word.runes.length;
if (_isWordBoundary(
normalized, endIndex - word.length + 1, endIndex + 1)) {
normalizedRunes, endIndex - wordRuneLength + 1, endIndex + 1)) {
return true;
}
}
}
} else if (useDefaultWords) {
// Fallback or legacy path
final normalized = String.fromCharCodes(normalizedRunes);
for (final word in badWords) {
if (excludedWords != null && excludedWords.contains(word)) continue;
if (_hasMatch(normalized, word)) return true;
}
}

if (extraWords != null) {
final normalized = String.fromCharCodes(normalizedRunes);
for (final word in extraWords) {
if (excludedWords != null && excludedWords.contains(word)) continue;
if (_hasMatch(normalized, word)) return true;
Expand All @@ -155,12 +164,22 @@ class SafeTextFilter {

/// Returns whether [word] occurs in [normalizedText] as a whole word.
///
/// [word] is lowercased and compared against [normalizedText] code point by
/// code point. A candidate occurrence only counts when [_isWordBoundary]
/// accepts the span it covers. An empty [word] never matches.
static bool _hasMatch(String normalizedText, String word) {
  final wordRunes = word.toLowerCase().runes.toList();
  // An empty pattern compares equal at every offset and would be reported as
  // a hit wherever the boundary check happens to pass, so reject it up front.
  if (wordRunes.isEmpty) return false;

  final normalizedRunes = normalizedText.runes.toList();
  // Last offset at which the whole pattern still fits; negative when the
  // pattern is longer than the text, in which case the loop does not run.
  final lastStart = normalizedRunes.length - wordRunes.length;

  for (var i = 0; i <= lastStart; i++) {
    var match = true;
    for (var j = 0; j < wordRunes.length; j++) {
      if (normalizedRunes[i + j] != wordRunes[j]) {
        match = false;
        break;
      }
    }

    if (match && _isWordBoundary(normalizedRunes, i, i + wordRunes.length)) {
      return true;
    }
  }
  return false;
}
Expand Down Expand Up @@ -202,32 +221,35 @@ class SafeTextFilter {

if (text.isEmpty) return text;

final normalizedText = normalizeText(text);
final textRunes = text.runes.toList();
final normalizedRunes = _normalizeToRunes(text);
final List<_Range> matchRanges = [];

// Step 1: Collect match ranges
if (_isInitialized && useDefaultWords) {
final trieMatches = _trie!.search(normalizedText);
final normalized = String.fromCharCodes(normalizedRunes);
final trieMatches = _trie!.search(normalized);
trieMatches.forEach((endIndex, words) {
for (final word in words) {
if (excludedWords != null && excludedWords.contains(word)) continue;
final startIndex = endIndex - word.length + 1;
if (_isWordBoundary(normalizedText, startIndex, endIndex + 1)) {
final wordRuneLength = word.runes.length;
final startIndex = endIndex - wordRuneLength + 1;
if (_isWordBoundary(normalizedRunes, startIndex, endIndex + 1)) {
matchRanges.add(_Range(startIndex, endIndex + 1));
}
}
});
} else if (useDefaultWords) {
for (final word in badWords) {
if (excludedWords != null && excludedWords.contains(word)) continue;
_addMatchesForWord(normalizedText, word, matchRanges);
_addMatchesForWord(normalizedRunes, word, matchRanges);
}
}

if (extraWords != null) {
for (final word in extraWords) {
if (excludedWords != null && excludedWords.contains(word)) continue;
_addMatchesForWord(normalizedText, word, matchRanges);
_addMatchesForWord(normalizedRunes, word, matchRanges);
}
}

Expand Down Expand Up @@ -256,7 +278,8 @@ class SafeTextFilter {
int lastAppended = 0;

for (final range in merged) {
buffer.write(text.substring(lastAppended, range.start));
buffer.write(
String.fromCharCodes(textRunes.sublist(lastAppended, range.start)));

final matchLength = range.end - range.start;
switch (maskStrategy) {
Expand All @@ -273,9 +296,11 @@ class SafeTextFilter {
final showLast = matchLength >= 4;
final maskCount = matchLength - 1 - (showLast ? 1 : 0);
buffer
..write(text[range.start])
..write(String.fromCharCode(textRunes[range.start]))
..write(obscureSymbol * maskCount);
if (showLast) buffer.write(text[range.end - 1]);
if (showLast) {
buffer.write(String.fromCharCode(textRunes[range.end - 1]));
}
}

// Custom: replace entire word with the replacement string
Expand All @@ -285,40 +310,50 @@ class SafeTextFilter {
lastAppended = range.end;
}

if (lastAppended < text.length) {
buffer.write(text.substring(lastAppended));
if (lastAppended < textRunes.length) {
buffer.write(String.fromCharCodes(textRunes.sublist(lastAppended)));
}

return buffer.toString();
}

/// Appends to [matches] one [_Range] for every whole-word occurrence of
/// [word] in [normalizedRunes].
///
/// [word] is lowercased and compared code point by code point. Each recorded
/// range starts at the first matching rune index and ends one past the last
/// (start inclusive, end exclusive). Occurrences rejected by
/// [_isWordBoundary] are not recorded. An empty [word] adds nothing.
static void _addMatchesForWord(
    List<int> normalizedRunes, String word, List<_Range> matches) {
  final wordRunes = word.toLowerCase().runes.toList();
  if (wordRunes.isEmpty) return;

  // Last offset at which the whole pattern still fits; negative when the
  // pattern is longer than the text, in which case the loop does not run.
  final lastStart = normalizedRunes.length - wordRunes.length;

  for (var i = 0; i <= lastStart; i++) {
    var match = true;
    for (var j = 0; j < wordRunes.length; j++) {
      if (normalizedRunes[i + j] != wordRunes[j]) {
        match = false;
        break;
      }
    }
    if (!match) continue;

    final endIndex = i + wordRunes.length;
    if (_isWordBoundary(normalizedRunes, i, endIndex)) {
      matches.add(_Range(i, endIndex));
    }
  }
}

/// Returns whether the span from [start] (inclusive) to [end] (exclusive) in
/// [runes] stands alone as a word.
///
/// The span qualifies when the rune immediately before [start] and the rune
/// at [end] are each either absent (the span touches the edge of [runes]) or
/// rejected by [_isAlphanumeric].
static bool _isWordBoundary(List<int> runes, int start, int end) {
  // A letter or digit directly before the span means it is the tail of a
  // longer word.
  if (start > 0 && _isAlphanumeric(runes[start - 1])) return false;
  // A letter or digit directly after the span means it is the head of a
  // longer word.
  if (end < runes.length && _isAlphanumeric(runes[end])) return false;
  return true;
}

/// Whether the Unicode code point [rune] is matched by
/// [_unicodeLetterOrDigit].
///
/// [rune] is a whole code point, so [String.fromCharCode] yields a complete
/// character (a surrogate pair when needed) rather than a lone surrogate
/// half.
static bool _isAlphanumeric(int rune) {
  return _unicodeLetterOrDigit.hasMatch(String.fromCharCode(rune));
}
}

Expand Down
Loading
Loading