Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion plugin/action/hash/normalize/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ We support a set of patterns out of the box.
| 11 | uuid | `<uuid>` | 7c1811ed-e98f-4c9c-a9f9-58c757ff494f |
| 12 | hash | `<hash>` | 48757ec9f04efe7faacec8722f3476339b125a6b6172b8a69ff3aa329e0bd0ff<br>a94a8fe5ccb19ba61c4c0873d391e987982fbbd3<br>098f6bcd4621d373cade4e832627b4f6 |
| 13 | datetime | `<datetime>` | 2025-01-13T10:20:40.999999Z<br>2025-01-13T10:20:40+04:00<br>2025-01-13 10:20:40<br>2025-01-13<br>10:20:40 |
| 14 | ip | `<ip>` | 1.2.3.4<br>01.102.103.104 |
| 14 | ip | `<ip>` | **IPv4:** 1.2.3.4<br>**IPv6:** 2001:db8:3333:4444:5555:6666:1.2.3.4 |
| 15 | duration | `<duration>` | -1m5s<br>1w2d3h4m5s6ms7us8ns |
| 16 | hex | `<hex>` | 0x13eb85e69dfbc0758b12acdaae36287d<br>0X553026A59C |
| 17 | float | `<float>` | 100.23<br>-4.56 |
Expand Down
99 changes: 96 additions & 3 deletions plugin/action/hash/normalize/token_normalizer.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package normalize
import (
"errors"
"fmt"
"net"
"slices"
"strings"

Expand Down Expand Up @@ -208,7 +209,12 @@ func initTokens(lexer *lexmachine.Lexer,
addTokens := func(patterns []TokenPattern) {
for _, p := range patterns {
if p.mask == 0 || builtinPatterns&p.mask != 0 {
lexer.Add([]byte(p.RE), newToken(p.Placeholder))
switch p.mask {
case pIp:
lexer.Add([]byte(p.RE), newIpToken(p.Placeholder))
default:
lexer.Add([]byte(p.RE), newToken(p.Placeholder))
}
}
}
}
Expand Down Expand Up @@ -264,6 +270,84 @@ func newToken(placeholder string) lexmachine.Action {
}
}

// newIpToken returns a lexer action that recognizes IP addresses (IPv4,
// IPv6, and IPv6 with an embedded IPv4 tail), optionally followed by a
// ":port" suffix and/or a single trailing colon that is excluded from the
// emitted token.
//
// The lexer's own IP regexp (see builtinTokenPatterns) is deliberately
// loose, so this action re-scans the full run of IP-like characters and
// validates it with [net.ParseIP]. Returning (nil, nil) skips a non-IP
// match without raising a scanner error.
func newIpToken(placeholder string) lexmachine.Action {
	return func(s *lexmachine.Scanner, m *machines.Match) (any, error) {
		// skip `\w<match>\w` — the match is embedded in a larger word
		if m.TC > 0 && isWord(s.Text[m.TC-1]) ||
			m.TC+len(m.Bytes) < len(s.Text) && isWord(s.Text[m.TC+len(m.Bytes)]) {
			return nil, nil
		}

		// Extend the match to the full run of IP-like characters;
		// the lexer's own pattern matching can stop short of the token end.
		begin, end := m.TC, m.TC
		for end < len(s.Text) && isIPChar(s.Text[end]) {
			end++
		}

		// tok builds the result token for the half-open range [begin, e).
		tok := func(e int) (any, error) {
			return token{
				placeholder: placeholder,
				begin:       begin,
				end:         e,
			}, nil
		}

		candidate := string(s.Text[begin:end])

		// classic ip (IPv4+IPv6)
		if net.ParseIP(candidate) != nil {
			return tok(end)
		}

		// Address with a single trailing colon (e.g. "1.2.3.4:" or
		// "2001:db8:::"): the colon is not part of the token.
		trimmedCandidate := strings.TrimSuffix(candidate, ":")
		if trimmedCandidate != candidate && net.ParseIP(trimmedCandidate) != nil {
			return tok(end - 1)
		}

		// "host:port" is only unambiguous for IPv4 — an unbracketed IPv6
		// address itself contains two or more colons — so only try
		// SplitHostPort when the trimmed candidate has fewer than two.
		if strings.Count(trimmedCandidate, ":") < 2 {
			// IPv4:port
			if host, _, err := net.SplitHostPort(candidate); err == nil && net.ParseIP(host) != nil {
				return tok(end)
			}
			// IPv4:port with a trailing colon
			if host, _, err := net.SplitHostPort(trimmedCandidate); err == nil && net.ParseIP(host) != nil {
				return tok(end - 1)
			}
		}
		return nil, nil
	}
}

func (n *tokenNormalizer) normalizeByScanner(out []byte, scanner *lexmachine.Scanner) []byte {
prevEnd := 0
for tokRaw, err, eos := scanner.Next(); !eos; tokRaw, err, eos = scanner.Next() {
Expand Down Expand Up @@ -457,6 +541,13 @@ func isWord(c byte) bool {
c == '_'
}

// isIPChar reports whether c may appear in an IPv4/IPv6 address:
// a decimal digit, a hex digit (either case), a dot, or a colon.
func isIPChar(c byte) bool {
	switch {
	case '0' <= c && c <= '9':
		return true
	case 'a' <= c && c <= 'f', 'A' <= c && c <= 'F':
		return true
	default:
		return c == '.' || c == ':'
	}
}

// [lexmachine] pkg doesn't support 'exactly' re syntax (a{3}, a{3,6}),
// so we use [strings.Repeat] instead
var builtinTokenPatterns = []TokenPattern{
Expand Down Expand Up @@ -523,9 +614,11 @@ var builtinTokenPatterns = []TokenPattern{
mask: pDatetime,
},
{
// IPv4 only
Placeholder: placeholderByPattern[pIp],
RE: strings.TrimSuffix(strings.Repeat(`(25[0-5]|(2[0-4]|1?[0-9])?[0-9])\.`, 4), `\.`),
RE: fmt.Sprintf(`%s|%s`,
strings.TrimSuffix(strings.Repeat(`(25[0-5]|(2[0-4]|1?[0-9])?[0-9])\.`, 4), `\.`),
`[0-9a-fA-F:]*:[0-9a-fA-F:]*`,
),

mask: pIp,
},
Expand Down
39 changes: 26 additions & 13 deletions plugin/action/hash/normalize/token_normalizer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -280,27 +280,40 @@ func TestTokenNormalizerBuiltin(t *testing.T) {
name: "ip",
inputs: []string{
"some 1.2.3.4 here",
"some 01.102.103.104 here",
"some 101.102.103.104 here",
"some 10.234.121.44:34850 here",

// IPv6 Normal
//"some 2001:db8:3333:4444:5555:DDDD:EEEE:FFFF here",
//"some :: here",
//"some 2001:db8:: here",
//"some ::1234:5678 here",
//"some 2001:0db8:0001:0000:0000:0ab9:C0A8:0102 here",
//"some 2001:db8::1234:5678 here",
"some 2001:db8:3333:4444:5555:DDDD:EEEE:FFFF here",
"some :: here",
"some ::1 here",
"some 2001:db8:: here",
"some ::1234:5678 here",
"some 2001:0db8:0001:0000:0000:0ab9:C0A8:0102 here",
"some 2001:db8::1234:5678 here",

// IPv6 Dual
//"some 2001:db8:3333:4444:5555:6666:1.2.3.4 here",
//"some ::11.22.33.44 here",
//"some 2001:db8::123.123.123.123 here",
//"some ::1234:5678:91.123.4.56 here",
//"some ::1234:5678:1.2.3.4 here",
//"some 2001:db8::1234:5678:5.6.7.8 here",
"some 2001:db8:3333:4444:5555:6666:1.2.3.4 here",
"some ::11.22.33.44 here",
"some 2001:db8::123.123.123.123 here",
"some ::1234:5678:91.123.4.56 here",
"some ::1234:5678:1.2.3.4 here",
"some 2001:db8::1234:5678:5.6.7.8 here",
},
patterns: "ip",
want: "some <ip> here",
},
{
name: "ip_with_colon",
inputs: []string{
"some 10.234.121.44:34850: here",
"some 10.234.121.44: here",
"some 2001:db8:3333:4444:5555:6666:1.2.3.4: here",
"some ::11.22.33.44: here",
},
patterns: "ip",
want: "some <ip>: here",
},
{
name: "duration",
inputs: []string{
Expand Down
Loading