diff --git a/plugin/action/hash/normalize/README.md b/plugin/action/hash/normalize/README.md index 2324d616b..74873f115 100644 --- a/plugin/action/hash/normalize/README.md +++ b/plugin/action/hash/normalize/README.md @@ -26,7 +26,7 @@ We support a set of patterns out of the box. | 11 | uuid | `` | 7c1811ed-e98f-4c9c-a9f9-58c757ff494f | | 12 | hash | `` | 48757ec9f04efe7faacec8722f3476339b125a6b6172b8a69ff3aa329e0bd0ff
a94a8fe5ccb19ba61c4c0873d391e987982fbbd3
098f6bcd4621d373cade4e832627b4f6 | | 13 | datetime | `` | 2025-01-13T10:20:40.999999Z
2025-01-13T10:20:40+04:00
2025-01-13 10:20:40
2025-01-13
10:20:40 | -| 14 | ip | `` | 1.2.3.4
01.102.103.104 | +| 14 | ip | `` | **IPv4:** 1.2.3.4
**IPv6:** 2001:db8:3333:4444:5555:6666:1.2.3.4 | | 15 | duration | `` | -1m5s
1w2d3h4m5s6ms7us8ns | | 16 | hex | `` | 0x13eb85e69dfbc0758b12acdaae36287d
0X553026A59C | | 17 | float | `` | 100.23
-4.56 | diff --git a/plugin/action/hash/normalize/token_normalizer.go b/plugin/action/hash/normalize/token_normalizer.go index 526b12331..51070370b 100644 --- a/plugin/action/hash/normalize/token_normalizer.go +++ b/plugin/action/hash/normalize/token_normalizer.go @@ -3,6 +3,7 @@ package normalize import ( "errors" "fmt" + "net" "slices" "strings" @@ -208,7 +209,12 @@ func initTokens(lexer *lexmachine.Lexer, addTokens := func(patterns []TokenPattern) { for _, p := range patterns { if p.mask == 0 || builtinPatterns&p.mask != 0 { - lexer.Add([]byte(p.RE), newToken(p.Placeholder)) + switch p.mask { + case pIp: + lexer.Add([]byte(p.RE), newIpToken(p.Placeholder)) + default: + lexer.Add([]byte(p.RE), newToken(p.Placeholder)) + } } } } @@ -264,6 +270,84 @@ func newToken(placeholder string) lexmachine.Action { } } +func newIpToken(placeholder string) lexmachine.Action { + return func(s *lexmachine.Scanner, m *machines.Match) (any, error) { + // skip `\w\w` + if m.TC > 0 && isWord(s.Text[m.TC-1]) || + m.TC+len(m.Bytes) < len(s.Text) && isWord(s.Text[m.TC+len(m.Bytes)]) { + return nil, nil + } + + // Fallback IP parser. + // Scans for IP-like patterns until end, then validates with net.ParseIP. + // Necessary because lexer's own pattern matching can be incomplete. 
+ begin, end := m.TC, m.TC + + for end < len(s.Text) { + if !isIPChar(s.Text[end]) { + break + } + end++ + } + + candidate := string(s.Text[begin:end]) + trimmedCandidate := strings.TrimSuffix(candidate, ":") + // classic ip (IPv4+IPv6) + if ip := net.ParseIP(candidate); ip != nil { + return token{ + placeholder: placeholder, + begin: begin, + end: end, + }, nil + } + + if strings.Count(trimmedCandidate, ":") >= 2 { + // IPv6+: + if ip := net.ParseIP(trimmedCandidate); ip != nil { + return token{ + placeholder: placeholder, + begin: begin, + end: end - 1, + }, nil + } + } else { + // IPv4+: + if ip := net.ParseIP(trimmedCandidate); ip != nil { + return token{ + placeholder: placeholder, + begin: begin, + end: end - 1, + }, nil + } + + // IPv4:port + host, _, err := net.SplitHostPort(candidate) + if err == nil { + if ip := net.ParseIP(host); ip != nil { + return token{ + placeholder: placeholder, + begin: begin, + end: end, + }, nil + } + } + + // IPv4:port+: + host, _, err = net.SplitHostPort(trimmedCandidate) + if err == nil { + if ip := net.ParseIP(host); ip != nil { + return token{ + placeholder: placeholder, + begin: begin, + end: end - 1, + }, nil + } + } + } + return nil, nil + } +} + func (n *tokenNormalizer) normalizeByScanner(out []byte, scanner *lexmachine.Scanner) []byte { prevEnd := 0 for tokRaw, err, eos := scanner.Next(); !eos; tokRaw, err, eos = scanner.Next() { @@ -457,6 +541,13 @@ func isWord(c byte) bool { c == '_' } +func isIPChar(c byte) bool { + return (c >= '0' && c <= '9') || + (c >= 'a' && c <= 'f') || + (c >= 'A' && c <= 'F') || + c == ':' || c == '.' 
+} + // [lexmachine] pkg doesn't support 'exactly' re syntax (a{3}, a{3,6}), // so we use [strings.Repeat] instead var builtinTokenPatterns = []TokenPattern{ @@ -523,9 +614,11 @@ var builtinTokenPatterns = []TokenPattern{ mask: pDatetime, }, { - // IPv4 only Placeholder: placeholderByPattern[pIp], - RE: strings.TrimSuffix(strings.Repeat(`(25[0-5]|(2[0-4]|1?[0-9])?[0-9])\.`, 4), `\.`), + RE: fmt.Sprintf(`%s|%s`, + strings.TrimSuffix(strings.Repeat(`(25[0-5]|(2[0-4]|1?[0-9])?[0-9])\.`, 4), `\.`), + `[0-9a-fA-F:]*:[0-9a-fA-F:]*`, + ), mask: pIp, }, diff --git a/plugin/action/hash/normalize/token_normalizer_test.go b/plugin/action/hash/normalize/token_normalizer_test.go index 552e8ee8d..b7176c593 100644 --- a/plugin/action/hash/normalize/token_normalizer_test.go +++ b/plugin/action/hash/normalize/token_normalizer_test.go @@ -280,27 +280,40 @@ func TestTokenNormalizerBuiltin(t *testing.T) { name: "ip", inputs: []string{ "some 1.2.3.4 here", - "some 01.102.103.104 here", + "some 101.102.103.104 here", + "some 10.234.121.44:34850 here", // IPv6 Normal - //"some 2001:db8:3333:4444:5555:DDDD:EEEE:FFFF here", - //"some :: here", - //"some 2001:db8:: here", - //"some ::1234:5678 here", - //"some 2001:0db8:0001:0000:0000:0ab9:C0A8:0102 here", - //"some 2001:db8::1234:5678 here", + "some 2001:db8:3333:4444:5555:DDDD:EEEE:FFFF here", + "some :: here", + "some ::1 here", + "some 2001:db8:: here", + "some ::1234:5678 here", + "some 2001:0db8:0001:0000:0000:0ab9:C0A8:0102 here", + "some 2001:db8::1234:5678 here", // IPv6 Dual - //"some 2001:db8:3333:4444:5555:6666:1.2.3.4 here", - //"some ::11.22.33.44 here", - //"some 2001:db8::123.123.123.123 here", - //"some ::1234:5678:91.123.4.56 here", - //"some ::1234:5678:1.2.3.4 here", - //"some 2001:db8::1234:5678:5.6.7.8 here", + "some 2001:db8:3333:4444:5555:6666:1.2.3.4 here", + "some ::11.22.33.44 here", + "some 2001:db8::123.123.123.123 here", + "some ::1234:5678:91.123.4.56 here", + "some ::1234:5678:1.2.3.4 here", + "some 2001:db8::1234:5678:5.6.7.8 here", }, patterns: "ip", want: "some here", }, + { + name: "ip_with_colon", + inputs: []string{ + "some 10.234.121.44:34850: here", + "some 10.234.121.44: here", + "some 2001:db8:3333:4444:5555:6666:1.2.3.4: here", + "some ::11.22.33.44: here", + }, + patterns: "ip", + want: "some : here", + }, { name: "duration", inputs: []string{