diff --git a/runtime/drivers/clickhouse/dialect.go b/runtime/drivers/clickhouse/dialect.go index e9f38f1e679..d2510f4243e 100644 --- a/runtime/drivers/clickhouse/dialect.go +++ b/runtime/drivers/clickhouse/dialect.go @@ -81,6 +81,12 @@ func (d *dialect) RequiresArrayContainsForInOperator() bool { return true } func (d *dialect) GetArrayContainsFunction() (string, error) { return "hasAny", nil } +// ArrayContainsSubqueryExpr uses arrayExists so the match is evaluated per row. +// hasAny only accepts a literal array, and unnesting via LEFT ARRAY JOIN would duplicate rows whose array contains multiple matching values. +func (d *dialect) ArrayContainsSubqueryExpr(arrExpr, subqueryExpr string) (string, error) { + return fmt.Sprintf("arrayExists(x -> x IN %s, (%s))", subqueryExpr, arrExpr), nil +} + func (d *dialect) CastToDataType(typ runtimev1.Type_Code) (string, error) { switch typ { case runtimev1.Type_CODE_TIMESTAMP: diff --git a/runtime/drivers/dialect.go b/runtime/drivers/dialect.go index 3f3632a89ed..61f7acb2f05 100644 --- a/runtime/drivers/dialect.go +++ b/runtime/drivers/dialect.go @@ -49,6 +49,7 @@ type Dialect interface { GetRegexMatchFunction() (string, error) RequiresArrayContainsForInOperator() bool GetArrayContainsFunction() (string, error) + ArrayContainsSubqueryExpr(arrExpr, subqueryExpr string) (string, error) DimensionSelect(escapeTable string, dim *runtimev1.MetricsViewSpec_Dimension) (dimSelect, unnestClause string, err error) LateralUnnest(expr, tableAlias, colName string) (tbl string, tupleStyle, auto bool, err error) UnnestSQLSuffix(tbl string) string @@ -228,6 +229,10 @@ func (b *BaseDialect) GetArrayContainsFunction() (string, error) { return "", fmt.Errorf("array contains not supported for %s dialect", b.String()) } +func (b *BaseDialect) ArrayContainsSubqueryExpr(_, _ string) (string, error) { + return "", fmt.Errorf("array contains subquery not supported for %s dialect", b.String()) +} + func (b *BaseDialect) MetricsViewDimensionExpression(dimension *runtimev1.MetricsViewSpec_Dimension) (string, error) { if dimension.LookupTable != "" { return "", fmt.Errorf("lookup tables are not supported for %s dialect", b.String()) diff --git a/runtime/metricsview/astexpr.go b/runtime/metricsview/astexpr.go index 16363c730e3..744dea40923 100644 --- a/runtime/metricsview/astexpr.go +++ b/runtime/metricsview/astexpr.go @@ -81,6 +81,18 @@ func (b *sqlExprBuilder) writeValue(val any) error { } func (b *sqlExprBuilder) writeSubquery(sub *Subquery) error { + expr, args, err := b.subqueryExpr(sub) + if err != nil { + return err + } + b.writeString(expr) + b.args = append(b.args, args...) + return nil +} + +// subqueryExpr builds the SQL for a subquery and returns it as a parenthesized scalar subquery +// selecting the subquery's dimension, along with its bound args. It does not write to the builder. +func (b *sqlExprBuilder) subqueryExpr(sub *Subquery) (string, []any, error) { // We construct a Query that combines the parent Query's contextual info with that of the Subquery. outer := b.ast.Query inner := &Query{ @@ -110,21 +122,15 @@ func (b *sqlExprBuilder) writeSubquery(sub *Subquery) error { } innerAST, err := NewAST(b.ast.MetricsView, innerSecurity, inner, b.ast.Dialect) if err != nil { - return fmt.Errorf("failed to create AST for subquery: %w", err) + return "", nil, fmt.Errorf("failed to create AST for subquery: %w", err) } sql, args, err := innerAST.SQL() if err != nil { - return fmt.Errorf("failed to generate SQL for subquery: %w", err) + return "", nil, fmt.Errorf("failed to generate SQL for subquery: %w", err) } // Output: (SELECT FROM ()) - b.writeString("(SELECT ") - b.writeString(b.ast.Dialect.EscapeIdentifier(sub.Dimension.Name)) - b.writeString(" FROM (") - b.writeString(sql) - b.writeString("))") - b.args = append(b.args, args...) - return nil + return fmt.Sprintf("(SELECT %s FROM (%s))", b.ast.Dialect.EscapeIdentifier(sub.Dimension.Name), sql), args, nil } func (b *sqlExprBuilder) writeCondition(cond *Condition) error { @@ -279,7 +285,8 @@ func (b *sqlExprBuilder) writeBinaryCondition(exprs []*Expression, op Operator) // For IN/NIN on unnest dimensions backed by DuckDB or ClickHouse, use native array-contains // functions (list_has_any / hasAny). This avoids double-counting when a row's array contains multiple matching values. - if (op == OperatorIn || op == OperatorNin) && b.ast.Dialect.RequiresArrayContainsForInOperator() { + // These functions only accept a literal list, so a subquery right-hand side is handled separately below. + if (op == OperatorIn || op == OperatorNin) && right.Value != nil && b.ast.Dialect.RequiresArrayContainsForInOperator() { return b.writeArrayContainsCondition(leftExpr, right, op == OperatorNin) } @@ -293,6 +300,29 @@ func (b *sqlExprBuilder) writeBinaryCondition(exprs []*Expression, op Operator) // Means the DB automatically unnests, so we can treat it as a normal value return b.writeBinaryConditionInner(nil, right, leftExpr, op) } + + // For IN/NIN with a subquery right-hand side on a dialect that unnests via an array join (tupleStyle == false, e.g. ClickHouse), + // adding the join would double-count rows whose array contains multiple matching values. Use a per-row array-contains expression instead. + // On dialects that unnest via a lateral EXISTS semi-join (tupleStyle == true, e.g. DuckDB), the fall-through below already avoids duplication. + if !tupleStyle && (op == OperatorIn || op == OperatorNin) && right.Subquery != nil && b.ast.Dialect.RequiresArrayContainsForInOperator() { + subExpr, subArgs, err := b.subqueryExpr(right.Subquery) + if err != nil { + return err + } + containsExpr, err := b.ast.Dialect.ArrayContainsSubqueryExpr(leftExpr, subExpr) + if err != nil { + return err + } + b.writeByte('(') + if op == OperatorNin { + b.writeString("NOT ") + } + b.writeString(containsExpr) + b.writeByte(')') + b.args = append(b.args, subArgs...) + return nil + } + var unnestColAlias string if tupleStyle { unnestColAlias = b.ast.Dialect.EscapeMember(unnestTableAlias, left.Name) diff --git a/runtime/metricsview/astexpr_test.go b/runtime/metricsview/astexpr_test.go index 5abe4b87c43..1172d691b1e 100644 --- a/runtime/metricsview/astexpr_test.go +++ b/runtime/metricsview/astexpr_test.go @@ -138,6 +138,75 @@ func TestArrayContainsCondition(t *testing.T) { wantSQL: `(hasAny(("tags"), [?,?]))`, wantArgs: []any{nil, "a"}, }, + { + name: "clickhouse: in on unnest dim with subquery uses arrayExists", + dialect: clickhouse.DialectClickhouse, + where: &Expression{Condition: &Condition{ + Operator: OperatorIn, + Expressions: []*Expression{ + {Name: "tags"}, + {Subquery: &Subquery{ + Dimension: Dimension{Name: "tags"}, + Measures: []Measure{{Name: "count"}}, + Having: &Expression{Condition: &Condition{ + Operator: OperatorGt, + Expressions: []*Expression{ + {Name: "count"}, + {Value: 1}, + }, + }}, + }}, + }, + }}, + wantSQL: `(arrayExists(x -> x IN (SELECT "tags" FROM (SELECT ("t2"."tags") AS "tags", ("t2"."count") AS "count" FROM (SELECT ("tags") AS "tags", (count(*)) AS "count" FROM "test_table" LEFT ARRAY JOIN "tags" AS "tags" GROUP BY 1) t2 WHERE (("t2"."count") > ?))), ("tags")))`, + wantArgs: []any{1}, + }, + { + name: "clickhouse: nin on unnest dim with subquery uses NOT arrayExists", + dialect: clickhouse.DialectClickhouse, + where: &Expression{Condition: &Condition{ + Operator: OperatorNin, + Expressions: []*Expression{ + {Name: "tags"}, + {Subquery: &Subquery{ + Dimension: Dimension{Name: "tags"}, + Measures: []Measure{{Name: "count"}}, + Having: &Expression{Condition: &Condition{ + Operator: OperatorGt, + Expressions: []*Expression{ + {Name: "count"}, + {Value: 1}, + }, + }}, + }}, + }, + }}, + wantSQL: `(NOT arrayExists(x -> x IN (SELECT "tags" FROM (SELECT ("t2"."tags") AS "tags", ("t2"."count") AS "count" FROM (SELECT ("tags") AS "tags", (count(*)) AS "count" FROM "test_table" LEFT ARRAY JOIN "tags" AS "tags" GROUP BY 1) t2 WHERE (("t2"."count") > ?))), ("tags")))`, + wantArgs: []any{1}, + }, + { + name: "duckdb: in on unnest dim with subquery uses lateral EXISTS", + dialect: duckdb.DialectDuckDB, + where: &Expression{Condition: &Condition{ + Operator: OperatorIn, + Expressions: []*Expression{ + {Name: "tags"}, + {Subquery: &Subquery{ + Dimension: Dimension{Name: "tags"}, + Measures: []Measure{{Name: "count"}}, + Having: &Expression{Condition: &Condition{ + Operator: OperatorGt, + Expressions: []*Expression{ + {Name: "count"}, + {Value: 1}, + }, + }}, + }}, + }, + }}, + wantSQL: `EXISTS (SELECT 1 FROM LATERAL UNNEST("tags") t2("tags") WHERE (("t2"."tags") IN (SELECT "tags" FROM (SELECT ("t2"."tags") AS "tags", ("t2"."count") AS "count" FROM (SELECT ("t0"."tags") AS "tags", (count(*)) AS "count" FROM "test_table", LATERAL UNNEST("tags") t0("tags") GROUP BY 1) t2 WHERE (("t2"."count") > ?)))))`, + wantArgs: []any{1}, + }, { name: "duckdb: in on non-unnest dim uses normal IN", dialect: duckdb.DialectDuckDB,