diff --git a/dd-java-agent/agent-iast/build.gradle b/dd-java-agent/agent-iast/build.gradle index 55d0fbbe9e8..2433fd496b0 100644 --- a/dd-java-agent/agent-iast/build.gradle +++ b/dd-java-agent/agent-iast/build.gradle @@ -45,6 +45,7 @@ dependencies { implementation libs.moshi implementation libs.bundles.asm implementation libs.instrument.java + implementation libs.re2j testImplementation project(':utils:test-utils') testImplementation project(':dd-java-agent:agent-bootstrap') diff --git a/dd-java-agent/agent-iast/src/jmh/java/com/datadog/iast/sensitive/SensitiveTokenizerBenchmark.java b/dd-java-agent/agent-iast/src/jmh/java/com/datadog/iast/sensitive/SensitiveTokenizerBenchmark.java new file mode 100644 index 00000000000..c3d86674e09 --- /dev/null +++ b/dd-java-agent/agent-iast/src/jmh/java/com/datadog/iast/sensitive/SensitiveTokenizerBenchmark.java @@ -0,0 +1,231 @@ +package com.datadog.iast.sensitive; + +import static datadog.trace.api.iast.sink.SqlInjectionModule.DATABASE_PARAMETER; +import static java.util.concurrent.TimeUnit.MICROSECONDS; +import static java.util.concurrent.TimeUnit.MILLISECONDS; + +import com.datadog.iast.model.Evidence; +import com.datadog.iast.sensitive.SensitiveHandler.Tokenizer; +import java.util.Arrays; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; + +/** Tracks the cost of the IAST evidence-redaction "sensitive analyzer" tokenizers. */ +@Warmup(iterations = 2, time = 250, timeUnit = MILLISECONDS) +@Measurement(iterations = 3, time = 250, timeUnit = MILLISECONDS) +@Fork(1) +@OutputTimeUnit(MICROSECONDS) +@BenchmarkMode(Mode.AverageTime) +@State(Scope.Benchmark) +public class SensitiveTokenizerBenchmark { + + /** Each scenario pairs a malformed payload shape with the tokenizer that processes it. */ + public enum Scenario { + /** LDAP filter opened, never closed, packed with operators — quadratic: {@code "(" + "="*n}. */ + LDAP_UNCLOSED_FILTER { + @Override + String payload(final int n) { + return "(" + repeat('=', n - 1); + } + + @Override + Tokenizer tokenizer(final String payload) { + return new LdapRegexTokenizer(new Evidence(payload)); + } + }, + /** Repeated open-group + operator — CUBIC, the worst found: {@code "(="*n}. */ + LDAP_NESTED_OPEN_EQ { + @Override + String payload(final int n) { + return repeatUnit("(=", n); + } + + @Override + Tokenizer tokenizer(final String payload) { + return new LdapRegexTokenizer(new Evidence(payload)); + } + }, + /** ANSI SQL string literal opened but never closed — stack overflow: {@code "'" + "a"*n}. */ + SQL_ANSI_UNTERMINATED_STRING { + @Override + String payload(final int n) { + return "'" + repeat('a', n - 1); + } + + @Override + Tokenizer tokenizer(final String payload) { + return sql(payload, null); + } + }, + /** Oracle {@code q' ...} escaped literal with no matching close — stack overflow. */ + SQL_ORACLE_ESCAPED_LITERAL { + @Override + String payload(final int n) { + return "q'~" + repeat('a', n - 3); + } + + @Override + Tokenizer tokenizer(final String payload) { + return sql(payload, "oracle"); + } + }, + /** MySQL double-quoted string literal opened but never closed — stack overflow. */ + SQL_MYSQL_UNTERMINATED_STRING { + @Override + String payload(final int n) { + return "\"" + repeat('a', n - 1); + } + + @Override + Tokenizer tokenizer(final String payload) { + return sql(payload, "mysql"); + } + }, + /** URL query separator + long key, no {@code =} value — linear baseline. */ + URL_QUERY { + @Override + String payload(final int n) { + return "http://h/p?" + repeat('a', n - 11); + } + + @Override + Tokenizer tokenizer(final String payload) { + return new UrlRegexpTokenizer(new Evidence(payload)); + } + }, + /** Run of {@code ?} (also matched by {@code [^=&;]}) — quadratic: {@code "?"*n}. */ + URL_QUESTION_RUN { + @Override + String payload(final int n) { + return repeat('?', n); + } + + @Override + Tokenizer tokenizer(final String payload) { + return new UrlRegexpTokenizer(new Evidence(payload)); + } + }, + /** URL authority started with {@code //}, no {@code @} terminator — linear baseline. */ + URL_AUTHORITY { + @Override + String payload(final int n) { + return "//" + repeat('a', n - 2); + } + + @Override + Tokenizer tokenizer(final String payload) { + return new UrlRegexpTokenizer(new Evidence(payload)); + } + }, + /** Single command + long argument — linear baseline. */ + COMMAND_SINGLE_TOKEN { + @Override + String payload(final int n) { + return "cmd " + repeat('a', n - 4); + } + + @Override + Tokenizer tokenizer(final String payload) { + return new CommandRegexpTokenizer(new Evidence(payload)); + } + }, + /** + * Blank lines exploit MULTILINE {@code ^} + {@code \s*} backtracking — quadratic: {@code + * "\n"*n}. + */ + COMMAND_BLANK_LINES { + @Override + String payload(final int n) { + return repeat('\n', n); + } + + @Override + Tokenizer tokenizer(final String payload) { + return new CommandRegexpTokenizer(new Evidence(payload)); + } + }; + + abstract String payload(int sizeBytes); + + abstract Tokenizer tokenizer(String payload); + + static Tokenizer sql(final String payload, final String dialect) { + final Evidence evidence = new Evidence(payload); + if (dialect != null) { + evidence.getContext().put(DATABASE_PARAMETER, dialect); + } + return new SqlRegexpTokenizer(evidence); + } + + static String repeat(final char c, final int count) { + final int n = Math.max(count, 0); + final char[] chars = new char[n]; + Arrays.fill(chars, c); + return new String(chars); + } + + static String repeatUnit(final String unit, final int totalLen) { + final int n = Math.max(totalLen, 0); + final StringBuilder sb = new StringBuilder(n); + while (sb.length() < n) { + sb.append(unit); + } + sb.setLength(n); + return sb.toString(); + } + } + + @Param({ + "LDAP_UNCLOSED_FILTER", + "LDAP_NESTED_OPEN_EQ", + "SQL_ANSI_UNTERMINATED_STRING", + "SQL_ORACLE_ESCAPED_LITERAL", + "SQL_MYSQL_UNTERMINATED_STRING", + "URL_QUERY", + "URL_QUESTION_RUN", + "URL_AUTHORITY", + "COMMAND_SINGLE_TOKEN", + "COMMAND_BLANK_LINES" + }) + Scenario scenario; + + @Param({"512", "1024", "2048"}) + int sizeBytes; + + private String payload; + + @Setup(Level.Trial) + public void setup() { + payload = scenario.payload(sizeBytes); + } + + /** + * Builds the tokenizer and fully drains it, exactly as evidence redaction does. Returns the + * number of tokens (consumed by JMH). A pathological pattern may overflow the stack; we catch it + * so the run stays stable and report {@code -1} — see the class javadoc. + */ + @Benchmark + public long tokenize() { + try { + final Tokenizer tokenizer = scenario.tokenizer(payload); + long count = 0; + while (tokenizer.next()) { + tokenizer.current(); + count++; + } + return count; + } catch (final Throwable pathological) { + return -1; + } + } +} diff --git a/dd-java-agent/agent-iast/src/main/java/com/datadog/iast/model/json/EvidenceAdapter.java b/dd-java-agent/agent-iast/src/main/java/com/datadog/iast/model/json/EvidenceAdapter.java index 9aa13db2e56..666d5c7c0c8 100644 --- a/dd-java-agent/agent-iast/src/main/java/com/datadog/iast/model/json/EvidenceAdapter.java +++ b/dd-java-agent/agent-iast/src/main/java/com/datadog/iast/model/json/EvidenceAdapter.java @@ -446,9 +446,12 @@ public void write(final Context ctx, final JsonWriter writer) throws IOException private List split(final RedactionContext redaction) { final List parts = new ArrayList<>(); + // Identical sensitive chunks redact to the same pattern (the first occurrence in the source), + // so cache chunk -> offset to avoid an O(sourceLength) indexOf per repeated occurrence. + final Map matchingOffsets = new HashMap<>(); if (redaction.isSensitive()) { // redact the full tainted value as the source is sensitive (password, certificate, ...) - addValuePart(0, value.length(), redaction, true, parts); + addValuePart(0, value.length(), redaction, matchingOffsets, true, parts); } else { // redact only sensitive parts int index = 0; @@ -456,13 +459,13 @@ private List split(final RedactionContext redaction) { final int start = sensitive.getStart(); final int end = sensitive.getStart() + sensitive.getLength(); // append previous tainted chunk (if any) - addValuePart(index, start, redaction, false, parts); + addValuePart(index, start, redaction, matchingOffsets, false, parts); // append current sensitive tainted chunk - addValuePart(start, end, redaction, true, parts); + addValuePart(start, end, redaction, matchingOffsets, true, parts); index = end; } // append last tainted chunk (if any) - addValuePart(index, value.length(), redaction, false, parts); + addValuePart(index, value.length(), redaction, matchingOffsets, false, parts); } return parts; } @@ -471,6 +474,7 @@ private void addValuePart( final int start, final int end, final RedactionContext ctx, + final Map matchingOffsets, final boolean redact, final List valueParts) { if (start < end) { @@ -484,7 +488,9 @@ private void addValuePart( final int length = chunk.length(); final String sourceValue = source.getValue(); final String redactedValue = ctx.getRedactedValue(); - final int matching = (sourceValue == null) ? -1 : sourceValue.indexOf(chunk); + final int matching = + matchingOffsets.computeIfAbsent( + chunk, c -> sourceValue == null ? -1 : sourceValue.indexOf(c)); final String pattern; if (matching >= 0 && redactedValue != null) { // if matches append the matching part from the redacted value diff --git a/dd-java-agent/agent-iast/src/main/java/com/datadog/iast/sensitive/AbstractRegexTokenizer.java b/dd-java-agent/agent-iast/src/main/java/com/datadog/iast/sensitive/AbstractRegexTokenizer.java index 54c73669be3..f43f00df26f 100644 --- a/dd-java-agent/agent-iast/src/main/java/com/datadog/iast/sensitive/AbstractRegexTokenizer.java +++ b/dd-java-agent/agent-iast/src/main/java/com/datadog/iast/sensitive/AbstractRegexTokenizer.java @@ -1,9 +1,9 @@ package com.datadog.iast.sensitive; import com.datadog.iast.util.Ranged; +import com.google.re2j.Matcher; +import com.google.re2j.Pattern; import java.util.NoSuchElementException; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import javax.annotation.Nullable; public abstract class AbstractRegexTokenizer implements SensitiveHandler.Tokenizer { diff --git a/dd-java-agent/agent-iast/src/main/java/com/datadog/iast/sensitive/CommandRegexpTokenizer.java b/dd-java-agent/agent-iast/src/main/java/com/datadog/iast/sensitive/CommandRegexpTokenizer.java index 970239e0207..b9108d103e8 100644 --- a/dd-java-agent/agent-iast/src/main/java/com/datadog/iast/sensitive/CommandRegexpTokenizer.java +++ b/dd-java-agent/agent-iast/src/main/java/com/datadog/iast/sensitive/CommandRegexpTokenizer.java @@ -2,7 +2,7 @@ import com.datadog.iast.model.Evidence; import com.datadog.iast.util.Ranged; -import java.util.regex.Pattern; +import com.google.re2j.Pattern; public class CommandRegexpTokenizer extends AbstractRegexTokenizer { diff --git a/dd-java-agent/agent-iast/src/main/java/com/datadog/iast/sensitive/HeaderRegexpTokenizer.java b/dd-java-agent/agent-iast/src/main/java/com/datadog/iast/sensitive/HeaderRegexpTokenizer.java index dd2f1921365..ae3645879b3 100644 --- a/dd-java-agent/agent-iast/src/main/java/com/datadog/iast/sensitive/HeaderRegexpTokenizer.java +++ b/dd-java-agent/agent-iast/src/main/java/com/datadog/iast/sensitive/HeaderRegexpTokenizer.java @@ -2,8 +2,8 @@ import com.datadog.iast.model.Evidence; import com.datadog.iast.util.Ranged; +import com.google.re2j.Pattern; import java.util.NoSuchElementException; -import java.util.regex.Pattern; import javax.annotation.Nullable; public class HeaderRegexpTokenizer implements SensitiveHandler.Tokenizer { diff --git a/dd-java-agent/agent-iast/src/main/java/com/datadog/iast/sensitive/LdapRegexTokenizer.java b/dd-java-agent/agent-iast/src/main/java/com/datadog/iast/sensitive/LdapRegexTokenizer.java index 72ec3e70d86..5e3f84adafa 100644 --- a/dd-java-agent/agent-iast/src/main/java/com/datadog/iast/sensitive/LdapRegexTokenizer.java +++ b/dd-java-agent/agent-iast/src/main/java/com/datadog/iast/sensitive/LdapRegexTokenizer.java @@ -2,7 +2,7 @@ import com.datadog.iast.model.Evidence; import com.datadog.iast.util.Ranged; -import java.util.regex.Pattern; +import com.google.re2j.Pattern; /** * @see Lightweight Directory Access Protocol @@ -14,7 +14,7 @@ public class LdapRegexTokenizer extends AbstractRegexTokenizer { private static final Pattern LDAP_PATTERN = Pattern.compile( - String.format("\\(.*?(?:~=|=|<=|>=)(?<%s>[^)]+)\\)", LITERAL_GROUP), Pattern.MULTILINE); + String.format("\\(.*?(?:~=|=|<=|>=)(?P<%s>[^)]+)\\)", LITERAL_GROUP), Pattern.MULTILINE); public LdapRegexTokenizer(final Evidence evidence) { super(LDAP_PATTERN, evidence.getValue()); diff --git a/dd-java-agent/agent-iast/src/main/java/com/datadog/iast/sensitive/SensitiveHandlerImpl.java b/dd-java-agent/agent-iast/src/main/java/com/datadog/iast/sensitive/SensitiveHandlerImpl.java index c0a500e9de6..3aaaa67a775 100644 --- a/dd-java-agent/agent-iast/src/main/java/com/datadog/iast/sensitive/SensitiveHandlerImpl.java +++ b/dd-java-agent/agent-iast/src/main/java/com/datadog/iast/sensitive/SensitiveHandlerImpl.java @@ -3,21 +3,28 @@ import static com.datadog.iast.util.CharUtils.fillCharArray; import static com.datadog.iast.util.CharUtils.newCharArray; import static com.datadog.iast.util.CharUtils.newString; -import static java.util.regex.Pattern.CASE_INSENSITIVE; -import static java.util.regex.Pattern.MULTILINE; +import static com.google.re2j.Pattern.CASE_INSENSITIVE; +import static com.google.re2j.Pattern.MULTILINE; +import static datadog.trace.api.ConfigDefaults.DEFAULT_IAST_REDACTION_NAME_PATTERN; +import static datadog.trace.api.ConfigDefaults.DEFAULT_IAST_REDACTION_VALUE_PATTERN; import com.datadog.iast.model.Evidence; import com.datadog.iast.model.Source; import com.datadog.iast.model.VulnerabilityType; +import com.google.re2j.Pattern; +import com.google.re2j.PatternSyntaxException; import datadog.trace.api.Config; import java.util.HashMap; import java.util.Map; -import java.util.regex.Pattern; import javax.annotation.Nonnull; import javax.annotation.Nullable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class SensitiveHandlerImpl implements SensitiveHandler { + private static final Logger LOG = LoggerFactory.getLogger(SensitiveHandlerImpl.class); + static final SensitiveHandler INSTANCE = new SensitiveHandlerImpl(); private static final char[] REDACTED_SENSITIVE_BUFFER = newCharArray(16, '*'); @@ -35,10 +42,17 @@ public class SensitiveHandlerImpl implements SensitiveHandler { private final Map tokenizers; public SensitiveHandlerImpl() { - final Config config = Config.get(); - namePattern = Pattern.compile(config.getIastRedactionNamePattern(), CASE_INSENSITIVE); + this(Config.get().getIastRedactionNamePattern(), Config.get().getIastRedactionValuePattern()); + } + + SensitiveHandlerImpl(final String configuredNamePattern, final String configuredValuePattern) { + namePattern = + safeCompile(configuredNamePattern, DEFAULT_IAST_REDACTION_NAME_PATTERN, CASE_INSENSITIVE); valuePattern = - Pattern.compile(config.getIastRedactionValuePattern(), CASE_INSENSITIVE | MULTILINE); + safeCompile( + configuredValuePattern, + DEFAULT_IAST_REDACTION_VALUE_PATTERN, + CASE_INSENSITIVE | MULTILINE); tokenizers = new HashMap<>(); tokenizers.put(VulnerabilityType.SQL_INJECTION, SqlRegexpTokenizer::new); tokenizers.put(VulnerabilityType.LDAP_INJECTION, LdapRegexTokenizer::new); @@ -75,8 +89,8 @@ public String redactString(final String value) { @Override public Tokenizer tokenizeEvidence( @Nonnull final VulnerabilityType type, @Nonnull final Evidence evidence) { - final TokenizerSupplier supplier = tokenizers.computeIfAbsent(type, t -> emptyTokenizer()); - return supplier.tokenizerFor(evidence); + final TokenizerSupplier supplier = tokenizers.get(type); + return supplier == null ? Tokenizer.EMPTY : supplier.tokenizerFor(evidence); } private int computeLength(@Nullable final String value) { @@ -93,8 +107,18 @@ private int computeLength(@Nullable final String value) { return size; } - private TokenizerSupplier emptyTokenizer() { - return evidence -> Tokenizer.EMPTY; + private static Pattern safeCompile( + final String configured, final String fallback, final int flags) { + try { + return Pattern.compile(configured, flags); + } catch (final PatternSyntaxException e) { + LOG.error( + "Could not compile IAST redaction pattern with RE2J, falling back to the default: {} (configured: {})", + fallback, + configured, + e); + return Pattern.compile(fallback, flags); + } } private interface TokenizerSupplier { diff --git a/dd-java-agent/agent-iast/src/main/java/com/datadog/iast/sensitive/SqlRegexpTokenizer.java b/dd-java-agent/agent-iast/src/main/java/com/datadog/iast/sensitive/SqlRegexpTokenizer.java index 6c87aaf7a87..a52eedb402a 100644 --- a/dd-java-agent/agent-iast/src/main/java/com/datadog/iast/sensitive/SqlRegexpTokenizer.java +++ b/dd-java-agent/agent-iast/src/main/java/com/datadog/iast/sensitive/SqlRegexpTokenizer.java @@ -4,23 +4,25 @@ import com.datadog.iast.model.Evidence; import com.datadog.iast.util.Ranged; +import com.google.re2j.Matcher; +import com.google.re2j.Pattern; import java.util.*; +import java.util.concurrent.ConcurrentHashMap; import java.util.function.Predicate; import java.util.function.Supplier; -import java.util.regex.Pattern; +import javax.annotation.Nullable; -public class SqlRegexpTokenizer extends AbstractRegexTokenizer { +public class SqlRegexpTokenizer implements SensitiveHandler.Tokenizer { private static final String STRING_LITERAL = "'(?:''|[^'])*'"; - private static final String ORACLE_ESCAPED_LITERAL = - "q'<.*?>'|q'\\(.*?\\)'|q'\\{.*?\\}'|q'\\[.*?\\]'|q'(?.).*?\\k'"; - private static final String POSTGRESQL_ESCAPED_LITERAL = - "\\$(?[^$]*?)\\$.*?\\$\\k\\$"; + private static final String ORACLE_ESCAPED_LITERAL = buildOracleEscapedLiteral(); + // $$ or $tag$ where tag is a SQL identifier + private static final String POSTGRESQL_ESCAPED_LITERAL = "\\$(?:[a-zA-Z_]\\w*)?\\$"; private static final String MYSQL_STRING_LITERAL = "\"(?:\\\"|[^\"])*\"|'(?:\\'|[^'])*'"; private static final String LINE_COMMENT = "--.*$"; private static final String BLOCK_COMMENT = "/\\*[\\s\\S]*\\*/"; private static final String EXPONENT = "(?:E[-+]?\\d+[fd]?)?"; - private static final String INTEGER_NUMBER = "(? PATTERNS = new EnumMap<>(Dialect.class); + private static final Map PATTERNS = new ConcurrentHashMap<>(); private final String sql; + private final Matcher matcher; + private int searchFrom; + @Nullable private Ranged current; + // Lazily built (Postgres only): every "$tag$" occurrence indexed by tag, so the matching close + // can be located with a binary search instead of an O(n) scan per opener. + @Nullable private Map dollarTagPositions; public SqlRegexpTokenizer(final Evidence evidence) { - super( - PATTERNS.computeIfAbsent(Dialect.fromEvidence(evidence), Dialect::buildPattern), - evidence.getValue()); this.sql = evidence.getValue(); + this.matcher = + PATTERNS + .computeIfAbsent(Dialect.fromEvidence(evidence), Dialect::buildPattern) + .matcher(sql); } @Override - protected Ranged buildNext() { - int start = matcher.start(); - int end = matcher.end(); - final char startChar = sql.charAt(start); - if (startChar == '\'' || startChar == '"') { - start++; - end--; - } else if (end > start + 1) { - final char nextChar = sql.charAt(start + 1); - if (startChar == '/' && nextChar == '*') { - start += 2; - end -= 2; - } else if (startChar == '-' && startChar == nextChar) { - start += 2; - } else if (Character.toLowerCase(startChar) == 'q' && nextChar == '\'') { - start += 3; - end -= 2; - } else if (startChar == '$') { - final String match = matcher.group(); - final int size = match.indexOf('$', 1) + 1; - if (size > 1) { - start += size; - end -= size; + public boolean next() { + while (matcher.find(searchFrom)) { + final int start = matcher.start(); + int end = matcher.end(); + int rangeStart = start; + int rangeEnd = end; + final char startChar = sql.charAt(start); + if (startChar == '$') { + // Postgres dollar-quoting: the regex matched the opening "$tag$"; find the matching close. + final String tag = sql.substring(start, end); + final int close = nextDollarTag(tag, end); + if (close < 0) { + // No matching close tag: not a dollar-quoted literal. Skip past the whole opener we + // already matched (not just one char) so find() does not re-scan it. + searchFrom = end; + continue; } + end = close + tag.length(); + rangeStart = start + tag.length(); + rangeEnd = close; + } else if (startChar == '\'' || startChar == '"') { + rangeStart++; + rangeEnd--; + } else if (end > start + 1) { + final char nextChar = sql.charAt(start + 1); + if (startChar == '/' && nextChar == '*') { + rangeStart += 2; + rangeEnd -= 2; + } else if (startChar == '-' && startChar == nextChar) { + rangeStart += 2; + } else if (Character.toLowerCase(startChar) == 'q' && nextChar == '\'') { + rangeStart += 3; + rangeEnd -= 2; + } + } + searchFrom = end; + current = Ranged.build(rangeStart, rangeEnd - rangeStart); + return true; + } + current = null; + return false; + } + + @Override + public Ranged current() { + if (current == null) { + throw new NoSuchElementException(); + } + return current; + } + + /** + * Returns the start offset of the first {@code "$tag$"} occurrence at or after {@code from}, or + * {@code -1} if there is none. Equivalent to {@code sql.indexOf(tag, from)} for dollar-quote tags + * but backed by a precomputed index so the whole tokenization stays near-linear instead of + * scanning to end-of-string once per opener. + */ + private int nextDollarTag(final String tag, final int from) { + final int[] positions = dollarTagPositions().get(tag); + if (positions == null) { + return -1; + } + // first position >= from + int lo = 0; + int hi = positions.length; + while (lo < hi) { + final int mid = (lo + hi) >>> 1; + if (positions[mid] < from) { + lo = mid + 1; + } else { + hi = mid; + } + } + return lo < positions.length ? positions[lo] : -1; + } + + private Map dollarTagPositions() { + if (dollarTagPositions == null) { + dollarTagPositions = buildDollarTagPositions(sql); + } + return dollarTagPositions; + } + + /** + * Single left-to-right pass collecting the start offset of every {@code "$tag$"} token (empty tag + * or a SQL identifier), keyed by the token text. Each character is visited once: the optional + * identifier run after a {@code '$'} contains no {@code '$'}, so runs from distinct openers never + * overlap, making the scan O(n). + */ + private static Map buildDollarTagPositions(final String sql) { + final Map> positions = new HashMap<>(); + final int length = sql.length(); + for (int i = 0; i < length; i++) { + if (sql.charAt(i) != '$') { + continue; + } + int end = i + 1; + if (end < length && isIdentifierStart(sql.charAt(end))) { + end++; + while (end < length && isIdentifierPart(sql.charAt(end))) { + end++; + } + } + if (end < length && sql.charAt(end) == '$') { + final String tag = sql.substring(i, end + 1); + positions.computeIfAbsent(tag, k -> new ArrayList<>()).add(i); + } + } + final Map result = new HashMap<>(positions.size() * 2); + for (final Map.Entry> entry : positions.entrySet()) { + final List list = entry.getValue(); + final int[] array = new int[list.size()]; + for (int i = 0; i < array.length; i++) { + array[i] = list.get(i); + } + result.put(entry.getKey(), array); + } + return result; + } + + private static boolean isIdentifierStart(final char c) { + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_'; + } + + private static boolean isIdentifierPart(final char c) { + return isIdentifierStart(c) || (c >= '0' && c <= '9'); + } + + /** + * Builds the Oracle {@code q'…'} alternation: the four bracket-paired delimiters plus one branch + * per other printable single-character delimiter (RE2J has no back-reference to require the same + * closing char, so the finite delimiter alphabet is enumerated). + * + *

The enumeration is intentionally restricted to the printable ASCII range {@code 0x21..0x7e}. + * Oracle forbids whitespace (space, tab, carriage return) as a {@code q'} delimiter, so excluding + * those characters matches Oracle's own rules rather than dropping valid literals. Multi-byte + * (non-ASCII) delimiters, which Oracle does allow, are not enumerated; such literals fall back to + * being tokenized by the generic {@code STRING_LITERAL} branch. + */ + private static String buildOracleEscapedLiteral() { + final List alternatives = new ArrayList<>(); + alternatives.add("q'<.*?>'"); + alternatives.add("q'\\(.*?\\)'"); + alternatives.add("q'\\{.*?\\}'"); + alternatives.add("q'\\[.*?\\]'"); + for (char delim = 0x21; delim <= 0x7e; delim++) { + // brackets handled above; ' is ambiguous with the surrounding quotes. + if ("<>(){}[]'".indexOf(delim) >= 0) { + continue; } + final String escaped = escapeDelimiter(delim); + alternatives.add("q'" + escaped + ".*?" + escaped + "'"); } - return Ranged.build(start, end - start); + return String.join("|", alternatives); + } + + private static String escapeDelimiter(final char delim) { + return "\\.+*?^$|".indexOf(delim) >= 0 ? "\\" + delim : String.valueOf(delim); } private enum Dialect { diff --git a/dd-java-agent/agent-iast/src/main/java/com/datadog/iast/sensitive/UrlRegexpTokenizer.java b/dd-java-agent/agent-iast/src/main/java/com/datadog/iast/sensitive/UrlRegexpTokenizer.java index 30371a6d0b5..06e1001a637 100644 --- a/dd-java-agent/agent-iast/src/main/java/com/datadog/iast/sensitive/UrlRegexpTokenizer.java +++ b/dd-java-agent/agent-iast/src/main/java/com/datadog/iast/sensitive/UrlRegexpTokenizer.java @@ -2,7 +2,7 @@ import com.datadog.iast.model.Evidence; import com.datadog.iast.util.Ranged; -import java.util.regex.Pattern; +import com.google.re2j.Pattern; /** * @see [^@]+)@", AUTHORITY_GROUP); + String.format("^(?:[^:]+:)?//(?P<%s>[^@]+)@", AUTHORITY_GROUP); private static final String QUERY_FRAGMENT = - String.format("[?#&]([^=&;]+)=(?<%s>[^?#&]+)", QUERY_FRAGMENT_GROUP); + String.format("[?#&]([^=&;]+)=(?P<%s>[^?#&]+)", QUERY_FRAGMENT_GROUP); private static final Pattern PATTERN = Pattern.compile(String.join("|", AUTHORITY, QUERY_FRAGMENT)); diff --git a/dd-java-agent/agent-iast/src/test/groovy/com/datadog/iast/sensitive/SensitiveHandlerTest.groovy b/dd-java-agent/agent-iast/src/test/groovy/com/datadog/iast/sensitive/SensitiveHandlerTest.groovy deleted file mode 100644 index 19dfdf1999e..00000000000 --- a/dd-java-agent/agent-iast/src/test/groovy/com/datadog/iast/sensitive/SensitiveHandlerTest.groovy +++ /dev/null @@ -1,34 +0,0 @@ -package com.datadog.iast.sensitive - -import datadog.trace.test.util.DDSpecification - -/** - * Most of the testing is done via {@link com.datadog.iast.model.json.EvidenceRedactionTest} - */ -class SensitiveHandlerTest extends DDSpecification { - - void 'test that empty tokenizer returns nothing'() { - given: - final tokenizer = SensitiveHandler.Tokenizer.EMPTY - - when: - final next = tokenizer.next() - - then: - !next - - when: - tokenizer.current() - - then: - thrown(NoSuchElementException) - } - - void 'test that current instance has a value'() { - when: - final current = SensitiveHandler.get() - - then: - current != null - } -} diff --git a/dd-java-agent/agent-iast/src/test/java/com/datadog/iast/model/json/EvidenceAdapterTest.java b/dd-java-agent/agent-iast/src/test/java/com/datadog/iast/model/json/EvidenceAdapterTest.java new file mode 100644 index 00000000000..a3189181c7f --- /dev/null +++ b/dd-java-agent/agent-iast/src/test/java/com/datadog/iast/model/json/EvidenceAdapterTest.java @@ -0,0 +1,62 @@ +package com.datadog.iast.model.json; + +import static org.junit.jupiter.api.Assumptions.assumeTrue; + +import com.datadog.iast.model.Evidence; +import com.datadog.iast.model.Location; +import com.datadog.iast.model.Range; +import com.datadog.iast.model.Source; +import com.datadog.iast.model.Vulnerability; +import com.datadog.iast.model.VulnerabilityBatch; +import com.datadog.iast.model.VulnerabilityType; +import datadog.trace.api.Config; +import datadog.trace.api.iast.SourceTypes; +import datadog.trace.api.iast.VulnerabilityMarks; +import org.junit.jupiter.api.Test; +import org.skyscreamer.jsonassert.JSONAssert; +import org.skyscreamer.jsonassert.JSONCompareMode; + +class EvidenceAdapterTest { + + @Test + void repeatedSensitiveLiteralsRedactToTheSamePattern() throws Exception { + assumeTrue(Config.get().isIastRedactionEnabled(), "redaction must be enabled"); + + // The two 'abc' string literals are detected as sensitive ranges within the same tainted value. + // Both must map to the first occurrence of "abc" in the source (index 8 -> "ijk"), so they + // render with an identical pattern. This is the behavior preserved by the chunk -> offset + // memoization in EvidenceAdapter#addValuePart. + final String sql = "select 'abc' or 'abc'"; + final Source source = new Source(SourceTypes.REQUEST_PARAMETER_VALUE, "query", sql); + final Range range = new Range(0, sql.length(), source, VulnerabilityMarks.NOT_MARKED); + final Evidence evidence = new Evidence(sql, new Range[] {range}); + final Vulnerability vulnerability = + new Vulnerability( + VulnerabilityType.SQL_INJECTION, + Location.forClassAndMethodAndLine("Test", "test", 1), + evidence); + final VulnerabilityBatch batch = new VulnerabilityBatch(); + batch.add(vulnerability); + + final String json = VulnerabilityEncoding.toJson(batch); + + final String expected = + "{" + + " \"sources\": [" + + " { \"origin\": \"http.request.parameter\", \"name\": \"query\"," + + " \"redacted\": true, \"pattern\": \"abcdefghijklmnopqrstu\" }" + + " ]," + + " \"vulnerabilities\": [" + + " { \"type\": \"SQL_INJECTION\", \"evidence\": { \"valueParts\": [" + + " { \"source\": 0, \"value\": \"select '\" }," + + " { \"source\": 0, \"redacted\": true, \"pattern\": \"ijk\" }," + + " { \"source\": 0, \"value\": \"' or '\" }," + + " { \"source\": 0, \"redacted\": true, \"pattern\": \"ijk\" }," + + " { \"source\": 0, \"value\": \"'\" }" + + " ] } }" + + " ]" + + "}"; + + JSONAssert.assertEquals(expected, json, JSONCompareMode.LENIENT); + } +} diff --git a/dd-java-agent/agent-iast/src/test/java/com/datadog/iast/sensitive/CommandRegexpTokenizerTest.java b/dd-java-agent/agent-iast/src/test/java/com/datadog/iast/sensitive/CommandRegexpTokenizerTest.java new file mode 100644 index 00000000000..d43b3495713 --- /dev/null +++ b/dd-java-agent/agent-iast/src/test/java/com/datadog/iast/sensitive/CommandRegexpTokenizerTest.java @@ -0,0 +1,44 @@ +package com.datadog.iast.sensitive; + +import static java.util.Arrays.asList; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.params.provider.Arguments.arguments; + +import com.datadog.iast.model.Evidence; +import com.datadog.iast.sensitive.SensitiveHandler.Tokenizer; +import com.datadog.iast.util.Ranged; +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Stream; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +class CommandRegexpTokenizerTest { + + @ParameterizedTest(name = "{0}") + @MethodSource("redactsCommandArgumentsArguments") + void redactsCommandArguments( + final String description, final String command, final List expected) { + assertEquals(expected, tokenize(command)); + } + + static Stream redactsCommandArgumentsArguments() { + return Stream.of( + arguments("plain command keeps its arguments", "ls -la /tmp", asList("-la /tmp")), + arguments("sudo prefix is skipped", "sudo rm -rf /", asList("-rf /")), + arguments("doas prefix is skipped", "doas cat /etc/passwd", asList("/etc/passwd")), + arguments( + "everything after the binary is captured", "echo hello world", asList("hello world"))); + } + + private static List tokenize(String command) { + Tokenizer tokenizer = new CommandRegexpTokenizer(new Evidence(command)); + List tokens = new ArrayList<>(); + while (tokenizer.next()) { + Ranged range = tokenizer.current(); + tokens.add(command.substring(range.getStart(), range.getStart() + range.getLength())); + } + return tokens; + } +} diff --git a/dd-java-agent/agent-iast/src/test/java/com/datadog/iast/sensitive/HeaderRegexpTokenizerTest.java b/dd-java-agent/agent-iast/src/test/java/com/datadog/iast/sensitive/HeaderRegexpTokenizerTest.java new file mode 100644 index 00000000000..c24a2d4dd61 --- /dev/null +++ b/dd-java-agent/agent-iast/src/test/java/com/datadog/iast/sensitive/HeaderRegexpTokenizerTest.java @@ -0,0 +1,58 @@ +package com.datadog.iast.sensitive; + +import static com.google.re2j.Pattern.CASE_INSENSITIVE; +import static java.util.Collections.emptyList; +import static java.util.Collections.singletonList; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.params.provider.Arguments.arguments; + +import com.datadog.iast.model.Evidence; +import com.datadog.iast.sensitive.SensitiveHandler.Tokenizer; +import com.datadog.iast.util.Ranged; +import com.google.re2j.Pattern; +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Stream; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +class HeaderRegexpTokenizerTest { + + private static final Pattern NAME_PATTERN = + Pattern.compile("password|authorization", CASE_INSENSITIVE); + private static final Pattern VALUE_PATTERN = Pattern.compile("bearer\\s", CASE_INSENSITIVE); + + @ParameterizedTest(name = "{0}") + @MethodSource("redactsSensitiveHeadersArguments") + void redactsSensitiveHeaders( + final String description, final String header, final List expected) { + assertEquals(expected, tokenize(header)); + } + + static Stream redactsSensitiveHeadersArguments() { + return Stream.of( + arguments( + "sensitive name redacts the value", + "Authorization: Bearer xyz", + singletonList("Bearer xyz")), + arguments( + "sensitive value redacts the value", + "X-Auth: Bearer secret", + singletonList("Bearer secret")), + arguments("non-sensitive header is ignored", "Accept: text/html", emptyList()), + arguments("missing separator is ignored", "NoColonHeader", emptyList()), + arguments("missing value is ignored", "Empty:", emptyList())); + } + + private static List tokenize(String header) { + Tokenizer tokenizer = + new HeaderRegexpTokenizer(new Evidence(header), NAME_PATTERN, VALUE_PATTERN); + List tokens = new ArrayList<>(); + while (tokenizer.next()) { + Ranged range = tokenizer.current(); + tokens.add(header.substring(range.getStart(), range.getStart() + range.getLength())); + } + return tokens; + } +} diff --git a/dd-java-agent/agent-iast/src/test/java/com/datadog/iast/sensitive/LdapRegexTokenizerTest.java b/dd-java-agent/agent-iast/src/test/java/com/datadog/iast/sensitive/LdapRegexTokenizerTest.java new file mode 100644 index 00000000000..4060e00eca1 --- /dev/null +++ b/dd-java-agent/agent-iast/src/test/java/com/datadog/iast/sensitive/LdapRegexTokenizerTest.java @@ -0,0 +1,45 @@ +package com.datadog.iast.sensitive; + +import static java.util.Arrays.asList; +import static java.util.Collections.singletonList; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.params.provider.Arguments.arguments; + +import com.datadog.iast.model.Evidence; +import com.datadog.iast.sensitive.SensitiveHandler.Tokenizer; +import com.datadog.iast.util.Ranged; +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Stream; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +class LdapRegexTokenizerTest { + + @ParameterizedTest(name = "{0}") + @MethodSource("redactsFilterLiteralsArguments") + void redactsFilterLiterals( + final String description, final String filter, final List expected) { + assertEquals(expected, tokenize(filter)); + } + + static Stream redactsFilterLiteralsArguments() { + return Stream.of( + arguments("equality literal", "(cn=John Doe)", singletonList("John Doe")), + arguments("nested filter literals", "(&(uid=bob)(role=admin))", asList("bob", "admin")), + arguments("greater-or-equal operator", "(age>=21)", singletonList("21")), + arguments("less-or-equal operator", "(score<=100)", singletonList("100")), + arguments("approximate operator", "(attr~=approx)", singletonList("approx"))); + } + + private static List tokenize(String filter) { + Tokenizer tokenizer = new LdapRegexTokenizer(new Evidence(filter)); + List tokens = new ArrayList<>(); + while (tokenizer.next()) { + Ranged range = tokenizer.current(); + tokens.add(filter.substring(range.getStart(), range.getStart() + range.getLength())); + } + return tokens; + } +} diff --git a/dd-java-agent/agent-iast/src/test/java/com/datadog/iast/sensitive/SensitiveHandlerTest.java b/dd-java-agent/agent-iast/src/test/java/com/datadog/iast/sensitive/SensitiveHandlerTest.java new file mode 100644 index 00000000000..6f296eafb4e --- /dev/null +++ b/dd-java-agent/agent-iast/src/test/java/com/datadog/iast/sensitive/SensitiveHandlerTest.java @@ -0,0 +1,52 @@ +package com.datadog.iast.sensitive; + +import static datadog.trace.api.ConfigDefaults.DEFAULT_IAST_REDACTION_NAME_PATTERN; +import static datadog.trace.api.ConfigDefaults.DEFAULT_IAST_REDACTION_VALUE_PATTERN; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import com.datadog.iast.sensitive.SensitiveHandler.Tokenizer; +import java.util.NoSuchElementException; +import org.junit.jupiter.api.Test; + +/** Most of the testing is done via {@link com.datadog.iast.model.json.EvidenceRedactionTest}. */ +class SensitiveHandlerTest { + + // Valid under java.util.regex (named group + backreference) but rejected by RE2J, so + // SensitiveHandlerImpl must fall back to the default pattern instead of failing to compile. + private static final String RE2J_INCOMPATIBLE_PATTERN = "(?secret)\\k"; + + @Test + void emptyTokenizerReturnsNothing() { + final Tokenizer tokenizer = Tokenizer.EMPTY; + assertFalse(tokenizer.next()); + assertThrows(NoSuchElementException.class, tokenizer::current); + } + + @Test + void currentInstanceHasValue() { + assertNotNull(SensitiveHandler.get()); + } + + @Test + void incompatibleNamePatternFallsBackToDefault() { + final SensitiveHandlerImpl handler = + new SensitiveHandlerImpl(RE2J_INCOMPATIBLE_PATTERN, DEFAULT_IAST_REDACTION_VALUE_PATTERN); + + // the default name pattern is used instead of failing to compile + assertTrue(handler.isSensitiveName("password")); + assertFalse(handler.isSensitiveName("username")); + } + + @Test + void incompatibleValuePatternFallsBackToDefault() { + final SensitiveHandlerImpl handler = + new SensitiveHandlerImpl(DEFAULT_IAST_REDACTION_NAME_PATTERN, RE2J_INCOMPATIBLE_PATTERN); + + // the default value pattern is used instead of failing to compile + assertTrue(handler.isSensitiveValue("bearer abc123def456")); + assertFalse(handler.isSensitiveValue("not a secret value")); + } +} diff --git a/dd-java-agent/agent-iast/src/test/java/com/datadog/iast/sensitive/SqlRegexpTokenizerTest.java b/dd-java-agent/agent-iast/src/test/java/com/datadog/iast/sensitive/SqlRegexpTokenizerTest.java new file mode 100644 index 00000000000..4268b95539b --- /dev/null +++ b/dd-java-agent/agent-iast/src/test/java/com/datadog/iast/sensitive/SqlRegexpTokenizerTest.java @@ -0,0 +1,160 @@ +package com.datadog.iast.sensitive; + +import static datadog.trace.api.iast.sink.SqlInjectionModule.DATABASE_PARAMETER; +import static java.util.Arrays.asList; +import static java.util.Collections.emptyList; +import static java.util.Collections.singletonList; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTimeoutPreemptively; +import static org.junit.jupiter.params.provider.Arguments.arguments; + +import com.datadog.iast.model.Evidence; +import com.datadog.iast.sensitive.SensitiveHandler.Tokenizer; +import com.datadog.iast.util.Ranged; +import java.time.Duration; +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Stream; +import javax.annotation.Nullable; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +class SqlRegexpTokenizerTest { + + @ParameterizedTest(name = "{0}") + @MethodSource("tokenizesSqlLiteralsArguments") + void tokenizesSqlLiterals( + final String description, + @Nullable final String dialect, + final String sql, + final List expected) { + assertEquals(expected, tokenize(dialect, sql)); + } + + static Stream tokenizesSqlLiteralsArguments() { + return Stream.of( + // ANSI (default dialect when no database is provided) + arguments( + "ansi single-quoted string", + null, + "SELECT name FROM u WHERE name = 'john'", + singletonList("john")), + arguments( + "ansi escaped single quote", null, "SELECT 'O''Brien'", singletonList("O''Brien")), + arguments("ansi integer literal", null, "SELECT 12345", singletonList("12345")), + arguments("ansi decimal literal", null, "SELECT 3.14", singletonList("3.14")), + arguments("ansi hex literal", null, "SELECT 0x1aF", singletonList("0x1aF")), + arguments("ansi line comment", null, "SELECT a -- bye", singletonList(" bye")), + arguments("ansi block comment", null, "SELECT /* hidden */ a", singletonList(" hidden ")), + arguments("ansi ignores double quotes", null, "SELECT \"x\" FROM t", emptyList()), + // MySQL family treats double-quoted strings as literals + arguments( + "mysql double-quoted string", + "mysql", + "SELECT \"secret\" FROM t", + singletonList("secret")), + // Oracle q'...' escaped literals (bracket pairs + enumerated single-char delimiters) + arguments( + "oracle q bracket literal", + "oracle", + "SELECT q'[hello]' FROM dual", + singletonList("hello")), + arguments( + "oracle q paren literal", + "oracle", + "SELECT q'(hello)' FROM dual", + singletonList("hello")), + arguments( + "oracle q custom delimiter", + "oracle", + "SELECT q'#hello#' FROM dual", + singletonList("hello")), + // Oracle forbids whitespace as a q' delimiter, so a space-delimited q' is intentionally + // NOT recognized as a q-literal; its content is still captured by the generic + // single-quoted-string branch (so the secret is still redacted, just not q-unwrapped). + arguments( + "oracle q whitespace delimiter is not a q-literal", + "oracle", + "SELECT q' secret ' FROM dual", + singletonList(" secret ")), + // PostgreSQL dollar-quoting + arguments( + "postgres dollar quote", + "postgresql", + "SELECT $tag$secret$tag$", + singletonList("secret")), + arguments( + "postgres empty-tag dollar quote", + "postgresql", + "SELECT $$secret$$", + singletonList("secret")), + arguments("postgres overlapping tags", "postgresql", "SELECT $a$x$a$", singletonList("x")), + arguments( + "postgres unterminated tag is skipped", "postgresql", "SELECT $tag$value", emptyList()), + // Boundary cases around buildDollarTagPositions: the scan must not read past end-of-string. + arguments("postgres lone dollar at end of string", "postgresql", "SELECT a$", emptyList()), + arguments( + "postgres identifier tag without closing dollar at end of string", + "postgresql", + "SELECT $tag", + emptyList()), + arguments( + "postgres empty-tag opener without close at end of string", + "postgresql", + "SELECT $$secret", + emptyList()), + // Parameter placeholders ($1, $2) must NOT be treated as dollar-quote openers; only their + // digits are tokenized as numeric literals. + arguments( + "postgres placeholders are not dollar quotes", + "postgresql", + "SELECT * FROM t WHERE a = $1 AND b = $2", + asList("1", "2"))); + } + + @Test + void manyUnterminatedDollarTagsRunInLinearTime() { + // Each "$tN$" is a distinct, valid but unterminated dollar-quote opener. The previous + // indexOf-per-opener implementation scanned to end-of-string once per opener (O(n^2)) and would + // not finish anywhere near this budget; the precomputed tag index keeps it near-linear. + StringBuilder builder = new StringBuilder(); + for (int i = 0; i < 60_000; i++) { + builder.append('$').append('t').append(i).append('$'); + } + String sql = builder.toString(); + + assertTimeoutPreemptively( + Duration.ofSeconds(10), + () -> { + Tokenizer tokenizer = new SqlRegexpTokenizer(postgresEvidence(sql)); + // None of the openers has a matching close, so tokenization yields nothing and must + // simply terminate quickly. + assertFalse(tokenizer.next()); + }); + } + + private static Evidence postgresEvidence(String sql) { + return evidence("postgresql", sql); + } + + private static Evidence evidence(@Nullable String dialect, String sql) { + Evidence evidence = new Evidence(sql); + if (dialect != null) { + evidence.getContext().put(DATABASE_PARAMETER, dialect); + } + return evidence; + } + + private static List tokenize(@Nullable String dialect, String sql) { + Tokenizer tokenizer = new SqlRegexpTokenizer(evidence(dialect, sql)); + List tokens = new ArrayList<>(); + while (tokenizer.next()) { + Ranged range = tokenizer.current(); + tokens.add(sql.substring(range.getStart(), range.getStart() + range.getLength())); + } + return tokens; + } +} diff --git a/dd-java-agent/agent-iast/src/test/java/com/datadog/iast/sensitive/UrlRegexpTokenizerTest.java b/dd-java-agent/agent-iast/src/test/java/com/datadog/iast/sensitive/UrlRegexpTokenizerTest.java new file mode 100644 index 00000000000..1e5fd780fd8 --- /dev/null +++ b/dd-java-agent/agent-iast/src/test/java/com/datadog/iast/sensitive/UrlRegexpTokenizerTest.java @@ -0,0 +1,44 @@ +package com.datadog.iast.sensitive; + +import static java.util.Arrays.asList; +import static java.util.Collections.singletonList; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.params.provider.Arguments.arguments; + +import com.datadog.iast.model.Evidence; +import com.datadog.iast.sensitive.SensitiveHandler.Tokenizer; +import com.datadog.iast.util.Ranged; +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Stream; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +class UrlRegexpTokenizerTest { + + @ParameterizedTest(name = "{0}") + @MethodSource("redactsUrlSecretsArguments") + void redactsUrlSecrets(final String description, final String url, final List expected) { + assertEquals(expected, tokenize(url)); + } + + static Stream redactsUrlSecretsArguments() { + return Stream.of( + arguments("userinfo authority", "https://user:pass@host/path", singletonList("user:pass")), + arguments("single user authority", "ftp://bob@server/file", singletonList("bob")), + arguments( + "query parameter values", "http://h/p?token=secret&id=42", asList("secret", "42")), + arguments("authority and query together", "https://user@host/p?q=v", asList("user", "v"))); + } + + private static List tokenize(String url) { + Tokenizer tokenizer = new UrlRegexpTokenizer(new Evidence(url)); + List tokens = new ArrayList<>(); + while (tokenizer.next()) { + Ranged range = tokenizer.current(); + tokens.add(url.substring(range.getStart(), range.getStart() + range.getLength())); + } + return tokens; + } +} diff --git a/dd-trace-api/src/main/java/datadog/trace/api/ConfigDefaults.java b/dd-trace-api/src/main/java/datadog/trace/api/ConfigDefaults.java index 4c9142eec2e..062687a458e 100644 --- a/dd-trace-api/src/main/java/datadog/trace/api/ConfigDefaults.java +++ b/dd-trace-api/src/main/java/datadog/trace/api/ConfigDefaults.java @@ -168,9 +168,9 @@ public final class ConfigDefaults { static final String DEFAULT_IAST_WEAK_CIPHER_ALGORITHMS = "^(?:PBEWITH(?:HMACSHA(?:2(?:24ANDAES_(?:128|256)|56ANDAES_(?:128|256))|384ANDAES_(?:128|256)|512ANDAES_(?:128|256)|1ANDAES_(?:128|256))|SHA1AND(?:RC(?:2_(?:128|40)|4_(?:128|40))|DESEDE)|MD5AND(?:TRIPLEDES|DES))|DES(?:EDE(?:WRAP)?)?|BLOWFISH|ARCFOUR|RC2).*$"; static final boolean DEFAULT_IAST_REDACTION_ENABLED = true; - static final String DEFAULT_IAST_REDACTION_NAME_PATTERN = + public static final String DEFAULT_IAST_REDACTION_NAME_PATTERN = "(?:p(?:ass)?w(?:or)?d|pass(?:_?phrase)?|secret|(?:api_?|private_?|public_?|access_?|secret_?)key(?:_?id)?|token|consumer_?(?:id|key|secret)|sign(?:ed|ature)?|auth(?:entication|orization)?)"; - static final String DEFAULT_IAST_REDACTION_VALUE_PATTERN = + public static final String DEFAULT_IAST_REDACTION_VALUE_PATTERN = "(?:bearer\\s+[a-z0-9\\._\\-]+|glpat-[\\w\\-]{20}|gh[opsu]_[0-9a-zA-Z]{36}|ey[I-L][\\w=\\-]+\\.ey[I-L][\\w=\\-]+(?:\\.[\\w.+/=\\-]+)?|(?:[\\-]{5}BEGIN[a-z\\s]+PRIVATE\\sKEY[\\-]{5}[^\\-]+[\\-]{5}END[a-z\\s]+PRIVATE\\sKEY[\\-]{5}|ssh-rsa\\s*[a-z0-9/\\.+]{100,}))"; public static final int DEFAULT_IAST_MAX_RANGE_COUNT = 10; static final boolean DEFAULT_IAST_STACKTRACE_LEAK_SUPPRESS = false; diff --git a/dd-trace-core/build.gradle b/dd-trace-core/build.gradle index 47199dab774..3df31b0388f 100644 --- a/dd-trace-core/build.gradle +++ b/dd-trace-core/build.gradle @@ -83,8 +83,7 @@ dependencies { implementation libs.slf4j implementation libs.moshi implementation libs.jctools - - implementation group: 'com.google.re2j', name: 're2j', version: '1.7' + implementation libs.re2j // sketches-java is shared compileOnly group: 'com.datadoghq', name: 'sketches-java', version: '0.8.3' diff --git a/dd-trace-core/src/main/java/datadog/trace/core/tagprocessor/QueryObfuscator.java b/dd-trace-core/src/main/java/datadog/trace/core/tagprocessor/QueryObfuscator.java index a1594e33eb6..2c99c50244b 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/tagprocessor/QueryObfuscator.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/tagprocessor/QueryObfuscator.java @@ -8,7 +8,6 @@ import datadog.trace.bootstrap.instrumentation.api.AppendableSpanLinks; import datadog.trace.bootstrap.instrumentation.api.Tags; import datadog.trace.core.DDSpanContext; -import datadog.trace.util.Strings; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -47,13 +46,20 @@ public QueryObfuscator(String regex) { } private String obfuscate(String query) { - if (pattern != null) { - Matcher matcher = pattern.matcher(query); - while (matcher.find()) { - query = Strings.replace(query, matcher.group(), ""); - } + if (pattern == null) { + return query; + } + final Matcher matcher = pattern.matcher(query); + if (!matcher.find()) { + return query; } - return query; + // TODO consider an upstream length cap too + final StringBuffer sb = new StringBuffer(query.length()); + do { + matcher.appendReplacement(sb, ""); + } while (matcher.find()); + matcher.appendTail(sb); + return sb.toString(); } @Override diff --git a/gradle/dependencies.gradle b/gradle/dependencies.gradle index ae4aa311ce1..e8daf61cd03 100644 --- a/gradle/dependencies.gradle +++ b/gradle/dependencies.gradle @@ -69,6 +69,9 @@ final class CachedData { // snakeyaml-engine and its transitives exclude(dependency('org.snakeyaml:snakeyaml-engine')) + + // re2j and its transitives + exclude(dependency('com.google.re2j:re2j')) } ] } @@ -95,7 +98,8 @@ CachedData.deps.shared = [ libs.moshi, libs.jctools, libs.lz4, - libs.aircompressor + libs.aircompressor, + libs.re2j ] ext { diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 601f72be16e..39a030b31ea 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -56,6 +56,7 @@ guava = "[16.0,20.0]" # Last version to support Java 7 javaparser = "3.24.4" jctools = "4.0.6" lz4 = "1.11.0" +re2j = "1.8" # Logging slf4j = "1.7.30" @@ -140,6 +141,7 @@ javaparser = {module = "com.github.javaparser:javaparser-core", version.ref = "j javaparser-symbol-solver = {module = "com.github.javaparser:javaparser-symbol-solver-core", version.ref = "javaparser"} jctools = { module = "org.jctools:jctools-core-jdk11", version.ref = "jctools" } lz4 = { module = "at.yawk.lz4:lz4-java", version.ref = "lz4" } +re2j = { module = "com.google.re2j:re2j", version.ref = "re2j" } # Logging logback-classic = { module = "ch.qos.logback:logback-classic", version.ref = "logback" }