From 0e0c90686cef289638053f86f2036e2fcd3d4080 Mon Sep 17 00:00:00 2001 From: Abhishek Balaji Radhakrishnan Date: Sun, 26 Apr 2026 22:02:24 -0700 Subject: [PATCH 1/7] feat: add a new scan transform for realtime ingestion (similar to unnest in MSQ) --- docs/ingestion/ingestion-spec.md | 37 +- .../indexing/KafkaScanTransformTest.java | 358 ++++++++++++++ .../segment/transform/ScanTransform.java | 268 ++++++++++ .../druid/segment/transform/Transform.java | 29 +- .../transform/TransformedInputRow.java | 6 + .../druid/segment/transform/Transformer.java | 73 ++- .../TransformingInputEntityReader.java | 8 + .../TransformingInputSourceReader.java | 8 + .../segment/transform/ScanTransformTest.java | 460 ++++++++++++++++++ .../segment/transform/TransformerTest.java | 35 ++ 10 files changed, 1277 insertions(+), 5 deletions(-) create mode 100644 embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaScanTransformTest.java create mode 100644 processing/src/main/java/org/apache/druid/segment/transform/ScanTransform.java create mode 100644 processing/src/test/java/org/apache/druid/segment/transform/ScanTransformTest.java diff --git a/docs/ingestion/ingestion-spec.md b/docs/ingestion/ingestion-spec.md index d1f901cc9538..f93175131a97 100644 --- a/docs/ingestion/ingestion-spec.md +++ b/docs/ingestion/ingestion-spec.md @@ -368,7 +368,7 @@ Transforms do have some limitations. They can only refer to fields present in th they cannot refer to other transforms. And they cannot remove fields, only add them. However, they can shadow a field with another field containing all nulls, which will act similarly to removing the field. -Druid currently includes one kind of built-in transform, the expression transform. It has the following syntax: +Druid includes two kinds of built-in transforms: expression transforms and [scan transforms](#scan-transform). The expression transform has the following syntax: ``` { @@ -387,6 +387,41 @@ The `expression` is a [Druid query expression](../querying/math-expr.md). your ingestion spec. ::: +#### Scan transform + +The scan transform unnests array-valued columns during ingestion, producing multiple output rows from a single input row. This allows streaming ingestion (Kafka, Kinesis) to explode arrays into individual rows at ingest time, rather than at query time. + +```json +{ + "type": "scan", + "name": "tag", + "unnestColumn": { + "type": "expression", + "name": "tag", + "expression": "\"tags\"", + "outputType": "STRING" + }, + "unnestFilter": { + "type": "selector", + "dimension": "tag", + "value": "sports" + } +} +``` + +|Property|Description|Required| +|--------|-----------|--------| +|`type`|Must be `"scan"`.|Yes| +|`name`|Output name for this transform.|Yes| +|`unnestColumn`|A [virtual column](../querying/virtual-columns.md) that defines which column to unnest and the output column name. Use `outputType` of `"STRING"` for string arrays or `"COMPLEX"` for arrays of objects.|Yes| +|`unnestFilter`|An optional [filter](../querying/filters.md) applied to the unnested output column. Only array elements matching this filter produce output rows.|No| + +You can define multiple scan transforms in the `transforms` list. They are applied sequentially, producing a cross join. For example, unnesting both `tags` (2 elements) and `services` (3 elements) produces 6 rows per input row. + +If the unnest column is missing or the array is empty, the input row passes through with the unnest output column set to null. + +Expression transforms are applied before scan transforms. 
The `transformSpec` filter is also applied before any unnesting, so it operates on the original input row. + #### Filter The `filter` conditionally filters input rows during ingestion. Only rows that pass the filter will be diff --git a/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaScanTransformTest.java b/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaScanTransformTest.java new file mode 100644 index 000000000000..cde856e1f38f --- /dev/null +++ b/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaScanTransformTest.java @@ -0,0 +1,358 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.testing.embedded.indexing; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.google.common.collect.ImmutableList; +import org.apache.druid.data.input.impl.DimensionsSpec; +import org.apache.druid.data.input.impl.TimestampSpec; +import org.apache.druid.indexer.granularity.UniformGranularitySpec; +import org.apache.druid.indexing.kafka.KafkaIndexTaskModule; +import org.apache.druid.indexing.kafka.simulate.KafkaResource; +import org.apache.druid.indexing.kafka.supervisor.KafkaSupervisorSpec; +import org.apache.druid.indexing.kafka.supervisor.KafkaSupervisorSpecBuilder; +import org.apache.druid.java.util.common.StringUtils; +import org.apache.druid.java.util.common.granularity.Granularities; +import org.apache.druid.math.expr.ExprMacroTable; +import org.apache.druid.query.DruidMetrics; +import org.apache.druid.segment.TestHelper; +import org.apache.druid.segment.column.ColumnType; +import org.apache.druid.segment.transform.ScanTransform; +import org.apache.druid.segment.transform.TransformSpec; +import org.apache.druid.segment.virtual.ExpressionVirtualColumn; +import org.apache.druid.testing.embedded.EmbeddedBroker; +import org.apache.druid.testing.embedded.EmbeddedClusterApis; +import org.apache.druid.testing.embedded.EmbeddedCoordinator; +import org.apache.druid.testing.embedded.EmbeddedDruidCluster; +import org.apache.druid.testing.embedded.EmbeddedIndexer; +import org.apache.druid.testing.embedded.EmbeddedOverlord; +import org.apache.druid.testing.embedded.junit5.EmbeddedClusterTestBase; +import org.joda.time.Period; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; + +/** + * Verifies ScanTransform unnests array columns during Kafka ingestion. 
+ * Uses two scan transforms to unnest both "tags" (string array) and "services" (object array) + * into a single datasource, producing a cross join of tag x service for each input row. + */ +public class KafkaScanTransformTest extends EmbeddedClusterTestBase +{ + // alice: 2 tags x 2 services = 4, bob: 1 tag x 3 services = 3 = 7 unnested rows + // carol (null arrays) and dave (missing columns) each produce 1 passthrough row = 2 + // total: 9 + private static final int EXPECTED_ROWS = 9; + + private final KafkaResource kafka = new KafkaResource(); + private final EmbeddedBroker broker = new EmbeddedBroker(); + private final EmbeddedIndexer indexer = new EmbeddedIndexer(); + private final EmbeddedOverlord overlord = new EmbeddedOverlord(); + private final EmbeddedCoordinator coordinator = new EmbeddedCoordinator(); + + private String topic; + + @Override + public EmbeddedDruidCluster createCluster() + { + coordinator.addProperty("druid.manager.segments.useIncrementalCache", "always"); + + indexer.setServerMemory(300_000_000) + .addProperty("druid.segment.handoff.pollDuration", "PT0.1s") + .addProperty("druid.processing.numThreads", "2") + .addProperty("druid.worker.capacity", "2"); + + return EmbeddedDruidCluster + .withEmbeddedDerbyAndZookeeper() + .addExtension(KafkaIndexTaskModule.class) + .addResource(kafka) + .addCommonProperty("druid.monitoring.emissionPeriod", "PT0.1s") + .useLatchableEmitter() + .useDefaultTimeoutForLatchableEmitter(30) + .addServer(coordinator) + .addServer(overlord) + .addServer(broker) + .addServer(indexer); + } + + @Override + protected void refreshDatasourceName() + { + // Do not refresh — datasource is set once in setupAll + } + + @BeforeAll + void setupAll() throws JsonProcessingException + { + topic = EmbeddedClusterApis.createTestDatasourceName(); + kafka.createTopicWithPartitions(topic, 1); + + super.refreshDatasourceName(); + submitSupervisor(); + publishTestData(); + + indexer.latchableEmitter().waitForEventAggregate( + event -> event.hasMetricName("ingest/events/processed") + .hasDimension(DruidMetrics.DATASOURCE, dataSource), + agg -> agg.hasSumAtLeast(EXPECTED_ROWS) + ); + } + + private void submitSupervisor() + { + final TransformSpec transformSpec = new TransformSpec( + null, + ImmutableList.of( + new ScanTransform( + "tag", + new ExpressionVirtualColumn("tag", "\"tags\"", ColumnType.STRING, ExprMacroTable.nil()), + null + ), + new ScanTransform( + "svc", + new ExpressionVirtualColumn("svc", "\"services\"", ColumnType.NESTED_DATA, ExprMacroTable.nil()), + null + ) + ) + ); + + final KafkaSupervisorSpec spec = new KafkaSupervisorSpecBuilder() + .withDataSchema( + schema -> schema + .withTimestamp(new TimestampSpec("__time", "auto", null)) + .withGranularity(new UniformGranularitySpec(Granularities.DAY, null, null)) + .withDimensions(DimensionsSpec.builder().useSchemaDiscovery(true).build()) + .withTransform(transformSpec) + ) + .withIoConfig( + ioConfig -> ioConfig + .withJsonInputFormat() + .withTaskCount(1) + .withTaskDuration(Period.hours(1)) + .withConsumerProperties(kafka.consumerProperties()) + .withStartDelay(Period.millis(10)) + .withSupervisorRunPeriod(Period.millis(500)) + .withUseEarliestSequenceNumber(true) + .withCompletionTimeout(Period.seconds(5)) + ) + .build(dataSource, topic); + + Assertions.assertEquals( + dataSource, + cluster.callApi().postSupervisor(spec) + ); + } + + private void publishTestData() throws JsonProcessingException + { + // alice: 2 tags x 2 services = 4 rows + // bob: 1 tag x 3 services = 3 rows + // carol: null 
    //        tags x null services => 1 passthrough row
    // dave: missing tags & services columns => 1 passthrough row
    // total: 9 rows
    final List<Map<String, Object>> records = new ArrayList<>();
    records.add(Map.of(
        "__time", "2024-01-01T00:00:00Z",
        "user", "alice",
        "tags", List.of("sports", "news"),
        "services", List.of(
            Map.of("type", "web", "dc", "us-east1"),
            Map.of("type", "api", "dc", "us-west2")
        )
    ));
    records.add(Map.of(
        "__time", "2024-01-01T00:01:00Z",
        "user", "bob",
        "tags", List.of("music"),
        "services", List.of(
            Map.of("type", "cdn", "dc", "eu-west1"),
            Map.of("type", "cache", "dc", "eu-west1"),
            Map.of("type", "db", "dc", "us-east1")
        )
    ));

    // carol: explicit null values for both array columns
    final HashMap<String, Object> carolRecord = new HashMap<>();
    carolRecord.put("__time", "2024-01-01T00:02:00Z");
    carolRecord.put("user", "carol");
    carolRecord.put("tags", null);
    carolRecord.put("services", null);
    records.add(carolRecord);

    // dave: columns not present at all
    records.add(Map.of(
        "__time", "2024-01-01T00:03:00Z",
        "user", "dave"
    ));

    final List<byte[]> recordBytes = new ArrayList<>();
    for (Map<String, Object> record : records) {
      recordBytes.add(TestHelper.JSON_MAPPER.writeValueAsBytes(record));
    }
    kafka.publishRecordsToTopic(topic, recordBytes);
  }

  @Test
  @Timeout(60)
  public void test_countRows()
  {
    final long count = Long.parseLong(cluster.runSql(
        StringUtils.format("SELECT COUNT(*) FROM \"%s\"", dataSource)
    ).trim());
    // Exactly 9: alice(4) + bob(3) + carol(1 passthrough) + dave(1 passthrough).
    Assertions.assertEquals(EXPECTED_ROWS, count);
  }

  @Test
  @Timeout(60)
  public void test_crossJoinUnnest()
  {
    // Use GROUP BY to get deterministic, order-independent results.
    // Each user+tag pair count reflects the number of services it was crossed with.
+ final String result = cluster.runSql( + StringUtils.format( + "SELECT \"user\", \"tag\", COUNT(*) AS cnt FROM \"%s\" GROUP BY 1, 2 ORDER BY 1, 2", + dataSource + ) + ); + final Set actual = new TreeSet<>(List.of(result.trim().split("\n"))); + final Set expected = new TreeSet<>(List.of( + "alice,news,2", // news x 2 services (web, api) + "alice,sports,2", // sports x 2 services (web, api) + "bob,music,3", // music x 3 services (cdn, cache, db) + "carol,,1", // passthrough (null tag, null svc) + "dave,,1" // passthrough (missing tag, missing svc) + )); + Assertions.assertEquals(expected, actual); + } + + @Test + @Timeout(60) + public void test_groupByTag() + { + final String result = cluster.runSql( + StringUtils.format( + "SELECT \"tag\", COUNT(*) AS cnt FROM \"%s\" WHERE \"tag\" IS NOT NULL GROUP BY 1 ORDER BY 1", + dataSource + ) + ); + // music: 1 x 3 services = 3, news: 1 x 2 services = 2, sports: 1 x 2 services = 2 + // carol/dave have null tags so they don't appear in this grouping + Assertions.assertEquals( + "music,3\nnews,2\nsports,2", + result.trim() + ); + } + + @Test + @Timeout(60) + public void test_groupByUser() + { + final String result = cluster.runSql( + StringUtils.format( + "SELECT \"user\", COUNT(*) AS cnt FROM \"%s\" GROUP BY 1 ORDER BY 1", + dataSource + ) + ); + Assertions.assertEquals( + "alice,4\nbob,3\ncarol,1\ndave,1", + result.trim() + ); + } + + @Test + @Timeout(60) + public void test_groupByServiceType() + { + // Extract the "type" field from the unnested service objects using JSON_VALUE + final String result = cluster.runSql( + StringUtils.format( + "SELECT JSON_VALUE(\"svc\", '$.type'), COUNT(*) AS cnt" + + " FROM \"%s\"" + + " WHERE \"svc\" IS NOT NULL" + + " GROUP BY 1 ORDER BY 1", + dataSource + ) + ); + // alice has 2 tags so each of her services appears twice (cross join) + // bob has 1 tag so each of his services appears once + final Set actual = new TreeSet<>(List.of(result.trim().split("\n"))); + final Set expected = new TreeSet<>(List.of( + "api,2", // alice: api x (sports, news) + "cache,1", // bob: cache x music + "cdn,1", // bob: cdn x music + "db,1", // bob: db x music + "web,2" // alice: web x (sports, news) + )); + Assertions.assertEquals(expected, actual); + } + + @Test + @Timeout(60) + public void test_groupByServiceDc() + { + // Extract the "dc" field from the unnested service objects + final String result = cluster.runSql( + StringUtils.format( + "SELECT JSON_VALUE(\"svc\", '$.dc'), COUNT(*) AS cnt" + + " FROM \"%s\"" + + " WHERE \"svc\" IS NOT NULL" + + " GROUP BY 1 ORDER BY 1", + dataSource + ) + ); + final Set actual = new TreeSet<>(List.of(result.trim().split("\n"))); + final Set expected = new TreeSet<>(List.of( + "eu-west1,2", // bob: cdn + cache (both eu-west1) x 1 tag + "us-east1,3", // alice: web(us-east1) x 2 tags + bob: db(us-east1) x 1 tag + "us-west2,2" // alice: api(us-west2) x 2 tags + )); + Assertions.assertEquals(expected, actual); + } + + @Test + @Timeout(60) + public void test_filterByServiceType() + { + // Filter to only rows where the service type is "web" + final String result = cluster.runSql( + StringUtils.format( + "SELECT \"user\", \"tag\", JSON_VALUE(\"svc\", '$.type'), JSON_VALUE(\"svc\", '$.dc')" + + " FROM \"%s\"" + + " WHERE JSON_VALUE(\"svc\", '$.type') = 'web'", + dataSource + ) + ); + final Set actual = new TreeSet<>(List.of(result.trim().split("\n"))); + final Set expected = new TreeSet<>(List.of( + "alice,news,web,us-east1", + "alice,sports,web,us-east1" + )); + Assertions.assertEquals(expected, 
actual); + } +} diff --git a/processing/src/main/java/org/apache/druid/segment/transform/ScanTransform.java b/processing/src/main/java/org/apache/druid/segment/transform/ScanTransform.java new file mode 100644 index 000000000000..f36f3e0922ce --- /dev/null +++ b/processing/src/main/java/org/apache/druid/segment/transform/ScanTransform.java @@ -0,0 +1,268 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.segment.transform; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import org.apache.druid.data.input.InputRow; +import org.apache.druid.data.input.MapBasedInputRow; +import org.apache.druid.java.util.common.Intervals; +import org.apache.druid.java.util.common.guava.Sequences; +import org.apache.druid.query.filter.DimFilter; +import org.apache.druid.segment.BaseObjectColumnValueSelector; +import org.apache.druid.segment.ColumnSelectorFactory; +import org.apache.druid.segment.Cursor; +import org.apache.druid.segment.CursorBuildSpec; +import org.apache.druid.segment.CursorFactory; +import org.apache.druid.segment.CursorHolder; +import org.apache.druid.segment.RowAdapters; +import org.apache.druid.segment.RowBasedSegment; +import org.apache.druid.segment.UnnestCursorFactory; +import org.apache.druid.segment.VirtualColumn; +import org.apache.druid.segment.column.ColumnHolder; +import org.apache.druid.segment.column.ColumnType; +import org.apache.druid.segment.column.RowSignature; + +import javax.annotation.Nullable; +import java.util.ArrayList; +import java.util.LinkedHashSet; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; + +/** + * A multi-row transform that unnests array columns during ingestion using the cursor-based unnest machinery. + * Each input row is wrapped in a single-row segment, the unnest cursor iterates over array elements, + * and each element becomes a separate output row. + * + * If the unnest column is missing or the array is empty, the input row passes through with the + * unnest output column set to null. 
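 *
 * For example (an illustrative sketch, not tied to any particular schema): unnesting a {@code tags} column
 * holding {@code ["a", "b"]} turns one input row into two output rows, one per array element, with all
 * other columns copied onto each output row.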
 */
public class ScanTransform implements Transform
{
  private final String name;
  private final VirtualColumn unnestColumn;
  @Nullable
  private final DimFilter unnestFilter;

  @JsonCreator
  public ScanTransform(
      @JsonProperty("name") final String name,
      @JsonProperty("unnestColumn") final VirtualColumn unnestColumn,
      @JsonProperty("unnestFilter") @Nullable final DimFilter unnestFilter
  )
  {
    this.name = name;
    this.unnestColumn = unnestColumn;
    this.unnestFilter = unnestFilter;
  }

  @Override
  @JsonProperty
  public String getName()
  {
    return name;
  }

  @Override
  @Nullable
  public RowFunction getRowFunction()
  {
    return null;
  }

  @JsonProperty
  public VirtualColumn getUnnestColumn()
  {
    return unnestColumn;
  }

  @JsonProperty
  @Nullable
  public DimFilter getUnnestFilter()
  {
    return unnestFilter;
  }

  @Override
  public Set<String> getRequiredColumns()
  {
    return Set.copyOf(unnestColumn.requiredColumns());
  }

  @Override
  public boolean isMultiRow()
  {
    return true;
  }

  @Override
  public List<InputRow> applyMultiRow(final InputRow inputRow)
  {
    final List<String> columns = getColumnsForProcessing(inputRow);
    final String unnestOutputName = unnestColumn.getOutputName();
    if (!columns.contains(unnestOutputName)) {
      columns.add(unnestOutputName);
    }

    final List<String> dimensionColumns = new ArrayList<>(inputRow.getDimensions());
    if (!dimensionColumns.contains(unnestOutputName)) {
      dimensionColumns.add(unnestOutputName);
    }

    final RowSignature.Builder signatureBuilder = RowSignature.builder();
    signatureBuilder.add(ColumnHolder.TIME_COLUMN_NAME, ColumnType.LONG);
    for (final String column : columns) {
      if (!ColumnHolder.TIME_COLUMN_NAME.equals(column)) {
        signatureBuilder.add(column, ColumnType.NESTED_DATA);
      }
    }
    final RowSignature inputSignature = signatureBuilder.build();

    final RowBasedSegment<InputRow> segment = new RowBasedSegment<>(
        Sequences.simple(List.of(inputRow)),
        RowAdapters.standardRow(),
        inputSignature
    );

    final CursorFactory baseCursorFactory = segment.as(CursorFactory.class);
    final CursorBuildSpec cursorBuildSpec = CursorBuildSpec.builder().setInterval(Intervals.ETERNITY).build();
    try (final CursorHolder cursorHolder = makeUnnestCursorFactory(baseCursorFactory, unnestFilter).makeCursorHolder(cursorBuildSpec)) {
      final Cursor cursor = cursorHolder.asCursor();
      if (cursor == null) {
        return List.of();
      }

      final ColumnSelectorFactory factory = cursor.getColumnSelectorFactory();
      final List<BaseObjectColumnValueSelector> selectors = new ArrayList<>(columns.size());
      for (final String column : columns) {
        selectors.add(factory.makeColumnValueSelector(column));
      }

      final List<InputRow> result = new ArrayList<>();

      while (!cursor.isDone()) {
        final Map<String, Object> event = new LinkedHashMap<>();
        for (int i = 0; i < columns.size(); i++) {
          final Object value = selectors.get(i).getObject();
          if (value != null) {
            event.put(columns.get(i), value);
          }
        }

        result.add(new MapBasedInputRow(inputRow.getTimestampFromEpoch(), dimensionColumns, event));
        cursor.advance();
      }

      if (result.isEmpty()) {
        if (unnestFilter != null && hasAnyUnnestValues(baseCursorFactory, cursorBuildSpec)) {
          return List.of();
        }

        final Map<String, Object> passthroughEvent = new LinkedHashMap<>();
        for (final String column : columns) {
          if (!ColumnHolder.TIME_COLUMN_NAME.equals(column)) {
            passthroughEvent.put(column, inputRow.getRaw(column));
          }
        }
        passthroughEvent.put(unnestOutputName, null);
        result.add(new MapBasedInputRow(inputRow.getTimestampFromEpoch(), dimensionColumns, passthroughEvent));
      }

      return result;
    }
  }

  private List<String> getColumnsForProcessing(final InputRow inputRow)
  {
    final LinkedHashSet<String> columns = new LinkedHashSet<>();
    columns.add(ColumnHolder.TIME_COLUMN_NAME);
    columns.addAll(inputRow.getDimensions());

    final MapBasedInputRow mapBasedInputRow = getMapBasedInputRow(inputRow);
    if (mapBasedInputRow != null) {
      columns.addAll(mapBasedInputRow.getEvent().keySet());
    }

    if (inputRow instanceof TransformedInputRow) {
      columns.addAll(((TransformedInputRow) inputRow).getTransformedColumns());
    }

    return new ArrayList<>(columns);
  }

  @Nullable
  private static MapBasedInputRow getMapBasedInputRow(final InputRow inputRow)
  {
    if (inputRow instanceof MapBasedInputRow) {
      return (MapBasedInputRow) inputRow;
    }
    if (inputRow instanceof TransformedInputRow) {
      return getMapBasedInputRow(((TransformedInputRow) inputRow).getBaseRow());
    }
    return null;
  }

  private UnnestCursorFactory makeUnnestCursorFactory(final CursorFactory baseCursorFactory, @Nullable final DimFilter filter)
  {
    return new UnnestCursorFactory(baseCursorFactory, unnestColumn, filter);
  }

  private boolean hasAnyUnnestValues(final CursorFactory baseCursorFactory, final CursorBuildSpec cursorBuildSpec)
  {
    try (final CursorHolder unfilteredCursorHolder = makeUnnestCursorFactory(baseCursorFactory, null).makeCursorHolder(cursorBuildSpec)) {
      final Cursor unfilteredCursor = unfilteredCursorHolder.asCursor();
      return unfilteredCursor != null && !unfilteredCursor.isDone();
    }
  }

  @Override
  public boolean equals(final Object o)
  {
    if (this == o) {
      return true;
    }
    if (o == null || getClass() != o.getClass()) {
      return false;
    }
    final ScanTransform that = (ScanTransform) o;
    return Objects.equals(name, that.name)
           && Objects.equals(unnestColumn, that.unnestColumn)
           && Objects.equals(unnestFilter, that.unnestFilter);
  }

  @Override
  public int hashCode()
  {
    return Objects.hash(name, unnestColumn, unnestFilter);
  }

  @Override
  public String toString()
  {
    return "ScanTransform{" +
           "name=" + name +
           ", unnestColumn=" + unnestColumn +
           ", unnestFilter=" + unnestFilter +
           '}';
  }
}
diff --git a/processing/src/main/java/org/apache/druid/segment/transform/Transform.java b/processing/src/main/java/org/apache/druid/segment/transform/Transform.java
index 8b6f75fa2d81..79a9d7f27c66 100644
--- a/processing/src/main/java/org/apache/druid/segment/transform/Transform.java
+++ b/processing/src/main/java/org/apache/druid/segment/transform/Transform.java
@@ -21,8 +21,11 @@
 
 import com.fasterxml.jackson.annotation.JsonSubTypes;
 import com.fasterxml.jackson.annotation.JsonTypeInfo;
+import org.apache.druid.data.input.InputRow;
 import org.apache.druid.guice.annotations.ExtensionPoint;
 
+import javax.annotation.Nullable;
+import java.util.List;
 import java.util.Set;
 
 /**
@@ -36,11 +39,15 @@
  * Transforms do have some limitations. They can only refer to fields present in the actual input rows; in particular,
  * they cannot refer to other transforms. And they cannot remove fields, only add them. However, they can shadow a
  * field with another field containing all nulls, which will act similarly to removing the field.
+ *
+ * Multi-row transforms (like {@link ScanTransform}) can produce multiple output rows from a single input row.
+ * These are applied after all single-row transforms.
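+ * For example, a multi-row transform that unnests a three-element array column returns three output rows
+ * from {@link #applyMultiRow} for a single input row.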
 */
 @ExtensionPoint
 @JsonTypeInfo(use = JsonTypeInfo.Id.NAME, property = "type")
 @JsonSubTypes(value = {
-    @JsonSubTypes.Type(name = "expression", value = ExpressionTransform.class)
+    @JsonSubTypes.Type(name = "expression", value = ExpressionTransform.class),
+    @JsonSubTypes.Type(name = "scan", value = ScanTransform.class)
 })
 public interface Transform
 {
@@ -51,12 +58,30 @@ public interface Transform
 
   /**
    * Returns the function for this transform. The RowFunction takes an entire row as input and returns a column value
-   * as output.
+   * as output. Multi-row transforms may return null here.
    */
+  @Nullable
   RowFunction getRowFunction();
 
   /**
    * Returns the names of all columns that this transform is going to read.
    */
   Set<String> getRequiredColumns();
+
+  /**
+   * Whether this transform can produce multiple output rows from a single input row.
+   */
+  default boolean isMultiRow()
+  {
+    return false;
+  }
+
+  /**
+   * For multi-row transforms, applies this transform to a single input row and returns zero or more output rows.
+   * Single-row transforms, for which {@link #isMultiRow()} is false, should not override this method.
+   */
+  default List<InputRow> applyMultiRow(InputRow inputRow)
+  {
+    return List.of(inputRow);
+  }
 }
diff --git a/processing/src/main/java/org/apache/druid/segment/transform/TransformedInputRow.java b/processing/src/main/java/org/apache/druid/segment/transform/TransformedInputRow.java
index 7d1db5ca479f..f0d235ecc0b9 100644
--- a/processing/src/main/java/org/apache/druid/segment/transform/TransformedInputRow.java
+++ b/processing/src/main/java/org/apache/druid/segment/transform/TransformedInputRow.java
@@ -30,6 +30,7 @@
 import java.util.List;
 import java.util.Map;
 import java.util.Objects;
+import java.util.Set;
 
 public class TransformedInputRow implements InputRow
 {
@@ -120,6 +121,11 @@ public InputRow getBaseRow()
     return row;
   }
 
+  Set<String> getTransformedColumns()
+  {
+    return Set.copyOf(transforms.keySet());
+  }
+
   @Override
   public boolean equals(final Object o)
   {
diff --git a/processing/src/main/java/org/apache/druid/segment/transform/Transformer.java b/processing/src/main/java/org/apache/druid/segment/transform/Transformer.java
index 2ff263a64738..bfc167d259a1 100644
--- a/processing/src/main/java/org/apache/druid/segment/transform/Transformer.java
+++ b/processing/src/main/java/org/apache/druid/segment/transform/Transformer.java
@@ -40,13 +40,18 @@ public class Transformer
 {
   private final Map<String, RowFunction> transforms = new HashMap<>();
+  private final List<Transform> multiRowTransforms = new ArrayList<>();
   private final ThreadLocal<Row> rowSupplierForValueMatcher = new ThreadLocal<>();
   private final ValueMatcher valueMatcher;
 
   Transformer(final TransformSpec transformSpec)
   {
     for (final Transform transform : transformSpec.getTransforms()) {
-      transforms.put(transform.getName(), transform.getRowFunction());
+      if (transform.isMultiRow()) {
+        multiRowTransforms.add(transform);
+      } else {
+        transforms.put(transform.getName(), transform.getRowFunction());
+      }
     }
 
     if (transformSpec.getFilter() != null) {
@@ -64,6 +69,14 @@ public class Transformer
     }
   }
 
+  /**
+   * Whether any multi-row transforms are configured.
+   */
+  public boolean hasMultiRowTransform()
+  {
+    return !multiRowTransforms.isEmpty();
+  }
+
   /**
    * Transforms an input row, or returns null if the row should be filtered out.
    *
@@ -94,6 +107,38 @@ public InputRow transform(@Nullable final InputRow row)
     return transformedRow;
   }
 
+  /**
+   * Transforms an input row, returning zero or more output rows.
+   * Applies single-row transforms and filtering first, then chains multi-row transforms sequentially.
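+   * Returns an empty list if the input row is filtered out by the transform spec's filter.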
+ */ + public List transformToList(@Nullable final InputRow row) + { + final InputRow singleRowResult = transform(row); + if (singleRowResult == null) { + return List.of(); + } + + return applyMultiRowTransforms(singleRowResult); + } + + private List applyMultiRowTransforms(final InputRow inputRow) + { + if (multiRowTransforms.isEmpty()) { + return List.of(inputRow); + } + + List current = List.of(inputRow); + for (final Transform multiRowTransform : multiRowTransforms) { + final List next = new ArrayList<>(); + for (final InputRow currentRow : current) { + next.addAll(multiRowTransform.applyMultiRow(currentRow)); + } + current = next; + } + + return current; + } + @Nullable public InputRowListPlusRawValues transform(@Nullable final InputRowListPlusRawValues row) { @@ -139,6 +184,30 @@ public InputRowListPlusRawValues transform(@Nullable final InputRowListPlusRawVa } } - return inputRowListPlusRawValues; + return applyMultiRowTransforms(inputRowListPlusRawValues); + } + + private InputRowListPlusRawValues applyMultiRowTransforms(final InputRowListPlusRawValues row) + { + if (multiRowTransforms.isEmpty() || row.getInputRows() == null) { + return row; + } + + final List inputRows = row.getInputRows(); + final List> inputRawValues = row.getRawValuesList(); + final List outputRows = new ArrayList<>(); + final List> outputRawValues = inputRawValues == null ? null : new ArrayList<>(); + + for (int i = 0; i < inputRows.size(); i++) { + final List expandedRows = applyMultiRowTransforms(inputRows.get(i)); + outputRows.addAll(expandedRows); + if (outputRawValues != null) { + for (int j = 0; j < expandedRows.size(); j++) { + outputRawValues.add(inputRawValues.get(i)); + } + } + } + + return InputRowListPlusRawValues.ofList(outputRawValues, outputRows, row.getParseException()); } } diff --git a/processing/src/main/java/org/apache/druid/segment/transform/TransformingInputEntityReader.java b/processing/src/main/java/org/apache/druid/segment/transform/TransformingInputEntityReader.java index 33bed4658691..0f01d5c89d29 100644 --- a/processing/src/main/java/org/apache/druid/segment/transform/TransformingInputEntityReader.java +++ b/processing/src/main/java/org/apache/druid/segment/transform/TransformingInputEntityReader.java @@ -22,9 +22,11 @@ import org.apache.druid.data.input.InputEntityReader; import org.apache.druid.data.input.InputRow; import org.apache.druid.data.input.InputRowListPlusRawValues; +import org.apache.druid.java.util.common.CloseableIterators; import org.apache.druid.java.util.common.parsers.CloseableIterator; import java.io.IOException; +import java.util.List; public class TransformingInputEntityReader implements InputEntityReader { @@ -40,6 +42,12 @@ public TransformingInputEntityReader(InputEntityReader delegate, Transformer tra @Override public CloseableIterator read() throws IOException { + if (transformer.hasMultiRowTransform()) { + return delegate.read().flatMap(row -> { + final List rows = transformer.transformToList(row); + return CloseableIterators.withEmptyBaggage(rows.iterator()); + }); + } return delegate.read().map(transformer::transform); } diff --git a/processing/src/main/java/org/apache/druid/segment/transform/TransformingInputSourceReader.java b/processing/src/main/java/org/apache/druid/segment/transform/TransformingInputSourceReader.java index fe1353c32d33..bad106755975 100644 --- a/processing/src/main/java/org/apache/druid/segment/transform/TransformingInputSourceReader.java +++ 
@@ -23,9 +23,11 @@
 import org.apache.druid.data.input.InputRowListPlusRawValues;
 import org.apache.druid.data.input.InputSourceReader;
 import org.apache.druid.data.input.InputStats;
+import org.apache.druid.java.util.common.CloseableIterators;
 import org.apache.druid.java.util.common.parsers.CloseableIterator;
 
 import java.io.IOException;
+import java.util.List;
 
 public class TransformingInputSourceReader implements InputSourceReader
 {
@@ -41,6 +43,12 @@ public class TransformingInputSourceReader implements InputSourceReader
   @Override
   public CloseableIterator<InputRow> read(InputStats inputStats) throws IOException
   {
+    if (transformer.hasMultiRowTransform()) {
+      return delegate.read(inputStats).flatMap(row -> {
+        final List<InputRow> rows = transformer.transformToList(row);
+        return CloseableIterators.withEmptyBaggage(rows.iterator());
+      });
+    }
     return delegate.read(inputStats).map(transformer::transform);
   }
 
diff --git a/processing/src/test/java/org/apache/druid/segment/transform/ScanTransformTest.java b/processing/src/test/java/org/apache/druid/segment/transform/ScanTransformTest.java
new file mode 100644
index 000000000000..2625052503c1
--- /dev/null
+++ b/processing/src/test/java/org/apache/druid/segment/transform/ScanTransformTest.java
@@ -0,0 +1,460 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.druid.segment.transform;

import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.druid.data.input.InputRow;
import org.apache.druid.data.input.MapBasedInputRow;
import org.apache.druid.java.util.common.DateTimes;
import org.apache.druid.math.expr.ExprMacroTable;
import org.apache.druid.query.expression.TestExprMacroTable;
import org.apache.druid.query.filter.SelectorDimFilter;
import org.apache.druid.segment.TestHelper;
import org.apache.druid.segment.column.ColumnHolder;
import org.apache.druid.segment.column.ColumnType;
import org.apache.druid.segment.virtual.ExpressionVirtualColumn;
import org.apache.druid.testing.InitializedNullHandlingTest;
import org.junit.Assert;
import org.junit.Test;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

public class ScanTransformTest extends InitializedNullHandlingTest
{
  private static final long TIMESTAMP = DateTimes.of("2024-01-01").getMillis();

  private static InputRow makeRow(Object... kvPairs)
  {
    final java.util.LinkedHashMap<String, Object> event = new java.util.LinkedHashMap<>();
    final List<String> dimensions = new ArrayList<>();
    for (int i = 0; i < kvPairs.length; i += 2) {
      final String key = (String) kvPairs[i];
      event.put(key, kvPairs[i + 1]);
      if (!ColumnHolder.TIME_COLUMN_NAME.equals(key)) {
        dimensions.add(key);
      }
    }
    return new MapBasedInputRow(TIMESTAMP, dimensions, event);
  }

  private static ScanTransform makeUnnestTransform(String inputColumn, String outputName)
  {
    return new ScanTransform(
        outputName,
        new ExpressionVirtualColumn(outputName, "\"" + inputColumn + "\"", ColumnType.STRING, ExprMacroTable.nil()),
        null
    );
  }

  @Test
  public void testBasicUnnest()
  {
    final ScanTransform transform = makeUnnestTransform("tags", "tag");
    final InputRow input = makeRow("user", "alice", "tags", List.of("a", "b", "c"));

    final List<InputRow> result = transform.applyMultiRow(input);
    Assert.assertEquals(3, result.size());

    Assert.assertEquals("a", result.get(0).getRaw("tag"));
    Assert.assertEquals("alice", result.get(0).getRaw("user"));
    Assert.assertEquals(TIMESTAMP, result.get(0).getTimestampFromEpoch());

    Assert.assertEquals("b", result.get(1).getRaw("tag"));
    Assert.assertEquals("c", result.get(2).getRaw("tag"));
  }

  @Test
  public void testUnnestEmptyArray()
  {
    final ScanTransform transform = makeUnnestTransform("tags", "tag");
    final InputRow input = makeRow("user", "alice", "tags", List.of());

    final List<InputRow> result = transform.applyMultiRow(input);
    Assert.assertEquals(1, result.size());
    Assert.assertEquals("alice", result.get(0).getRaw("user"));
    Assert.assertNull(result.get(0).getRaw("tag"));
  }

  @Test
  public void testUnnestMissingColumn()
  {
    final ScanTransform transform = makeUnnestTransform("services", "svc");
    final InputRow input = makeRow("user", "alice", "host", "web-01");

    final List<InputRow> result = transform.applyMultiRow(input);
    Assert.assertEquals(1, result.size());
    Assert.assertEquals("alice", result.get(0).getRaw("user"));
    Assert.assertEquals("web-01", result.get(0).getRaw("host"));
    Assert.assertNull(result.get(0).getRaw("svc"));
  }

  @Test
  public void testUnnestSingleElement()
  {
    final ScanTransform transform = makeUnnestTransform("tags", "tag");
    final InputRow input = makeRow("user", "alice", "tags", List.of("only"));

    final List<InputRow> result = transform.applyMultiRow(input);
    Assert.assertEquals(1, result.size());
    Assert.assertEquals("only", result.get(0).getRaw("tag"));
  }

  @Test
  public void testUnnestScalarValue()
  {
    final ScanTransform transform = makeUnnestTransform("tags", "tag");
    final InputRow input = makeRow("user", "alice", "tags", "scalar");

    final List<InputRow> result = transform.applyMultiRow(input);
    Assert.assertEquals(1, result.size());
    Assert.assertEquals("scalar", result.get(0).getRaw("tag"));
  }

  @Test
  public void testUnnestArrayOfJsonObjects()
  {
    final ScanTransform transform = new ScanTransform(
        "item",
        new ExpressionVirtualColumn("item", "\"items\"", ColumnType.NESTED_DATA, ExprMacroTable.nil()),
        null
    );

    final List<Map<String, Object>> items = List.of(
        Map.of("product", "shirt", "price", 25),
        Map.of("product", "pants", "price", 40),
        Map.of("product", "hat", "price", 15)
    );
    final InputRow input = makeRow("user", "alice", "items", items);

    final List<InputRow> result = transform.applyMultiRow(input);
    Assert.assertEquals(3, result.size());

    final Object item0 = result.get(0).getRaw("item");
    Assert.assertNotNull(item0);
    Assert.assertTrue("Expected a Map, got " + item0.getClass(), item0 instanceof Map);
    Assert.assertEquals("shirt", ((Map<?, ?>) item0).get("product"));

    final Object item2 = result.get(2).getRaw("item");
    Assert.assertTrue(item2 instanceof Map);
    Assert.assertEquals("hat", ((Map<?, ?>) item2).get("product"));
  }

  @Test
  public void testUnnestNestedArrays()
  {
    final ScanTransform transform = new ScanTransform(
        "element",
        new ExpressionVirtualColumn("element", "\"data\"", ColumnType.NESTED_DATA, ExprMacroTable.nil()),
        null
    );

    final InputRow input = makeRow(
        "user", "alice",
        "data", List.of(List.of(1, 2), List.of(3))
    );

    final List<InputRow> result = transform.applyMultiRow(input);

    // One level of unnest only: [[1,2], [3]] -> [1,2] and [3]
    Assert.assertEquals(2, result.size());

    final Object elem0 = result.get(0).getRaw("element");
    Assert.assertNotNull(elem0);
    Assert.assertTrue("Expected a List, got " + elem0.getClass(), elem0 instanceof List);
    Assert.assertEquals(List.of(1, 2), elem0);

    final Object elem1 = result.get(1).getRaw("element");
    Assert.assertTrue("Expected a List, got " + elem1.getClass(), elem1 instanceof List);
    Assert.assertEquals(List.of(3), elem1);

    Assert.assertEquals("alice", result.get(0).getRaw("user"));
    Assert.assertEquals("alice", result.get(1).getRaw("user"));
  }

  @Test
  public void testTimestampPreservation()
  {
    final ScanTransform transform = makeUnnestTransform("tags", "tag");
    final InputRow input = makeRow("tags", List.of("a", "b"));

    final List<InputRow> result = transform.applyMultiRow(input);
    for (final InputRow row : result) {
      Assert.assertEquals(TIMESTAMP, row.getTimestampFromEpoch());
    }
  }

  @Test
  public void testWithUnnestFilter()
  {
    final ScanTransform transform = new ScanTransform(
        "tag",
        new ExpressionVirtualColumn("tag", "\"tags\"", ColumnType.STRING, ExprMacroTable.nil()),
        new SelectorDimFilter("tag", "b", null)
    );
    final InputRow input = makeRow("user", "alice", "tags", List.of("a", "b", "c"));

    final List<InputRow> result = transform.applyMultiRow(input);
    Assert.assertEquals(1, result.size());
    Assert.assertEquals("b", result.get(0).getRaw("tag"));
  }

  @Test
  public void testWithUnnestFilterDropsRowWhenNoValuesMatch()
  {
    final ScanTransform transform = new ScanTransform(
        "tag",
        new ExpressionVirtualColumn("tag", "\"tags\"", ColumnType.STRING, ExprMacroTable.nil()),
        new SelectorDimFilter("tag", "z", null)
    );
    final InputRow input = makeRow("user", "alice", "tags", List.of("a", "b", "c"));

    final List<InputRow> result = transform.applyMultiRow(input);
    Assert.assertTrue(result.isEmpty());
  }

  @Test
  public void testIsMultiRow()
  {
    final ScanTransform transform = makeUnnestTransform("tags", "tag");
    Assert.assertTrue(transform.isMultiRow());
    Assert.assertNull(transform.getRowFunction());
  }

  @Test
  public void testGetRequiredColumns()
  {
    final ScanTransform transform = makeUnnestTransform("tags", "tag");
    Assert.assertTrue(transform.getRequiredColumns().contains("tags"));
  }

  // --- Transformer integration tests ---

  @Test
  public void testTransformerWithSingleScanTransform()
  {
    final TransformSpec spec = new TransformSpec(
        null,
        List.of(makeUnnestTransform("tags", "tag"))
    );

    final Transformer transformer = spec.toTransformer();
    Assert.assertTrue(transformer.hasMultiRowTransform());

    final InputRow input = makeRow("user", "alice", "tags", List.of("x", "y"));
    final List<InputRow> result = transformer.transformToList(input);

    Assert.assertEquals(2, result.size());
Assert.assertEquals("x", result.get(0).getRaw("tag")); + Assert.assertEquals("y", result.get(1).getRaw("tag")); + } + + @Test + public void testTransformerWithMultipleScanTransforms() + { + final TransformSpec spec = new TransformSpec( + null, + List.of( + makeUnnestTransform("tags", "tag"), + makeUnnestTransform("colors", "color") + ) + ); + + final Transformer transformer = spec.toTransformer(); + Assert.assertTrue(transformer.hasMultiRowTransform()); + + final InputRow input = makeRow( + "user", "alice", + "tags", List.of("a", "b"), + "colors", List.of("red", "blue", "green") + ); + final List result = transformer.transformToList(input); + + // 2 tags x 3 colors = 6 rows (cross join) + Assert.assertEquals(6, result.size()); + } + + @Test + public void testTransformerWithChainedScanTransformsFlattensNestedArrays() + { + // Two scan transforms: first unnests [[1,2],[3]] into [1,2] and [3], + // second unnests those inner arrays into individual elements. + final TransformSpec spec = new TransformSpec( + null, + List.of( + new ScanTransform( + "inner", + new ExpressionVirtualColumn("inner", "\"data\"", ColumnType.NESTED_DATA, ExprMacroTable.nil()), + null + ), + new ScanTransform( + "val", + new ExpressionVirtualColumn("val", "\"inner\"", ColumnType.LONG, ExprMacroTable.nil()), + null + ) + ) + ); + + final Transformer transformer = spec.toTransformer(); + + final InputRow input = makeRow( + "user", "alice", + "data", List.of(List.of(1, 2), List.of(3)) + ); + final List result = transformer.transformToList(input); + + // First unnest: [[1,2],[3]] -> [1,2], [3] (2 rows) + // Second unnest: [1,2] -> 1, 2 and [3] -> 3 (3 rows total) + Assert.assertEquals(3, result.size()); + + final List values = new ArrayList<>(); + for (final InputRow row : result) { + values.add(row.getRaw("val")); + Assert.assertEquals("alice", row.getRaw("user")); + } + Assert.assertEquals(List.of(1, 2, 3), values); + } + + @Test + public void testTransformerWithExpressionAndScanTransforms() + { + final TransformSpec spec = new TransformSpec( + null, + List.of( + new ExpressionTransform("upper_user", "upper(\"user\")", TestExprMacroTable.INSTANCE), + makeUnnestTransform("tags", "tag") + ) + ); + + final Transformer transformer = spec.toTransformer(); + Assert.assertTrue(transformer.hasMultiRowTransform()); + + final InputRow input = makeRow("user", "alice", "tags", List.of("a", "b")); + final List result = transformer.transformToList(input); + + Assert.assertEquals(2, result.size()); + Assert.assertEquals("a", result.get(0).getRaw("tag")); + Assert.assertEquals("b", result.get(1).getRaw("tag")); + Assert.assertEquals("ALICE", result.get(0).getRaw("upper_user")); + Assert.assertEquals("ALICE", result.get(1).getRaw("upper_user")); + } + + @Test + public void testTransformerWithScanTransformPreservesNonDimensionColumns() + { + final TransformSpec spec = new TransformSpec( + null, + List.of(makeUnnestTransform("tags", "tag")) + ); + final Transformer transformer = spec.toTransformer(); + final InputRow input = new MapBasedInputRow( + TIMESTAMP, + List.of("user", "tags"), + Map.of("user", "alice", "metricCount", 5L, "tags", List.of("a", "b")) + ); + + final List result = transformer.transformToList(input); + Assert.assertEquals(2, result.size()); + Assert.assertEquals(5L, result.get(0).getRaw("metricCount")); + Assert.assertEquals(5L, result.get(1).getRaw("metricCount")); + } + + @Test + public void testTransformerWithFilterAndScanTransform() + { + final TransformSpec spec = new TransformSpec( + new SelectorDimFilter("user", 
"not_alice", null), + List.of(makeUnnestTransform("tags", "tag")) + ); + + final Transformer transformer = spec.toTransformer(); + final InputRow input = makeRow("user", "alice", "tags", List.of("a", "b")); + final List result = transformer.transformToList(input); + Assert.assertTrue(result.isEmpty()); + } + + @Test + public void testTransformerWithoutScanTransform() + { + final TransformSpec spec = new TransformSpec(null, null); + final Transformer transformer = spec.toTransformer(); + Assert.assertFalse(transformer.hasMultiRowTransform()); + + final InputRow input = makeRow("user", "alice"); + final List result = transformer.transformToList(input); + Assert.assertEquals(1, result.size()); + } + + @Test + public void testTransformerTransformToListWithNull() + { + final TransformSpec spec = new TransformSpec(null, null); + final Transformer transformer = spec.toTransformer(); + Assert.assertTrue(transformer.transformToList(null).isEmpty()); + } + + // --- Serde tests --- + + @Test + public void testSerde() throws Exception + { + final TransformSpec spec = new TransformSpec( + null, + List.of( + new ScanTransform( + "tag", + new ExpressionVirtualColumn("tag", "\"tags\"", ColumnType.STRING, ExprMacroTable.nil()), + new SelectorDimFilter("tag", "a", null) + ) + ) + ); + + final ObjectMapper jsonMapper = TestHelper.makeJsonMapper(); + final String json = jsonMapper.writeValueAsString(spec); + final TransformSpec deserialized = jsonMapper.readValue(json, TransformSpec.class); + Assert.assertEquals(spec, deserialized); + } + + @Test + public void testSerdeWithMixedTransforms() throws Exception + { + final TransformSpec spec = new TransformSpec( + null, + List.of( + new ExpressionTransform("upper_user", "upper(\"user\")", TestExprMacroTable.INSTANCE), + new ScanTransform( + "tag", + new ExpressionVirtualColumn("tag", "\"tags\"", ColumnType.STRING, ExprMacroTable.nil()), + null + ) + ) + ); + + final ObjectMapper jsonMapper = TestHelper.makeJsonMapper(); + final String json = jsonMapper.writeValueAsString(spec); + final TransformSpec deserialized = jsonMapper.readValue(json, TransformSpec.class); + Assert.assertEquals(spec, deserialized); + + Assert.assertEquals(2, deserialized.getTransforms().size()); + Assert.assertFalse(deserialized.getTransforms().get(0).isMultiRow()); + Assert.assertTrue(deserialized.getTransforms().get(1).isMultiRow()); + } +} diff --git a/processing/src/test/java/org/apache/druid/segment/transform/TransformerTest.java b/processing/src/test/java/org/apache/druid/segment/transform/TransformerTest.java index 2583b81deabb..1a959dfd8795 100644 --- a/processing/src/test/java/org/apache/druid/segment/transform/TransformerTest.java +++ b/processing/src/test/java/org/apache/druid/segment/transform/TransformerTest.java @@ -30,6 +30,8 @@ import org.apache.druid.java.util.common.parsers.ParseException; import org.apache.druid.query.expression.TestExprMacroTable; import org.apache.druid.query.filter.SelectorDimFilter; +import org.apache.druid.segment.column.ColumnType; +import org.apache.druid.segment.virtual.ExpressionVirtualColumn; import org.apache.druid.testing.InitializedNullHandlingTest; import org.joda.time.DateTime; import org.junit.Assert; @@ -315,6 +317,39 @@ public void testInputRowListPlusRawValuesTransformWithFilter() Assert.assertEquals("val1", actual.getRawValuesList().get(0).get("dim")); } + @Test + public void testInputRowListPlusRawValuesTransformWithScanTransformExpandsRowsAndRawValues() + { + final Transformer transformer = new Transformer( + new TransformSpec( + 
null, + ImmutableList.of( + new ScanTransform( + "tag", + new ExpressionVirtualColumn("tag", "\"tags\"", ColumnType.STRING, TestExprMacroTable.INSTANCE), + null + ) + ) + ) + ); + + final InputRow inputRow = new MapBasedInputRow( + DateTimes.nowUtc(), + ImmutableList.of("user", "tags"), + ImmutableMap.of("user", "alice", "tags", ImmutableList.of("a", "b")) + ); + final Map rawValues = ImmutableMap.of("user", "alice", "tags", ImmutableList.of("a", "b")); + + final InputRowListPlusRawValues transformed = transformer.transform(InputRowListPlusRawValues.of(inputRow, rawValues)); + Assert.assertNotNull(transformed); + Assert.assertEquals(2, transformed.getInputRows().size()); + Assert.assertEquals(2, transformed.getRawValuesList().size()); + Assert.assertEquals(rawValues, transformed.getRawValuesList().get(0)); + Assert.assertEquals(rawValues, transformed.getRawValuesList().get(1)); + Assert.assertEquals("a", transformed.getInputRows().get(0).getRaw("tag")); + Assert.assertEquals("b", transformed.getInputRows().get(1).getRaw("tag")); + } + @Test public void testTransformWithArrayStringInputsExpr() { From 88dda0a7288637b0371c2c26316856885c818d69 Mon Sep 17 00:00:00 2001 From: Abhishek Balaji Radhakrishnan Date: Sun, 26 Apr 2026 22:25:29 -0700 Subject: [PATCH 2/7] Checkstyle --- .../java/org/apache/druid/segment/transform/ScanTransform.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/processing/src/main/java/org/apache/druid/segment/transform/ScanTransform.java b/processing/src/main/java/org/apache/druid/segment/transform/ScanTransform.java index f36f3e0922ce..bee012556144 100644 --- a/processing/src/main/java/org/apache/druid/segment/transform/ScanTransform.java +++ b/processing/src/main/java/org/apache/druid/segment/transform/ScanTransform.java @@ -42,8 +42,8 @@ import javax.annotation.Nullable; import java.util.ArrayList; -import java.util.LinkedHashSet; import java.util.LinkedHashMap; +import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Objects; From b02532a1219cc8cf683f2db2f37a7f704673b7b8 Mon Sep 17 00:00:00 2001 From: Abhishek Balaji Radhakrishnan Date: Mon, 27 Apr 2026 11:30:07 -0700 Subject: [PATCH 3/7] Use a more general purpose scan query that can be used for other things besides unnest --- docs/ingestion/ingestion-spec.md | 84 ++++++-- .../indexing/KafkaScanTransformTest.java | 63 ++++-- .../segment/transform/ScanTransform.java | 203 +++++++----------- .../transform/TransformedInputRow.java | 2 +- .../segment/transform/ScanTransformTest.java | 130 ++++------- .../segment/transform/TransformerTest.java | 15 +- 6 files changed, 250 insertions(+), 247 deletions(-) diff --git a/docs/ingestion/ingestion-spec.md b/docs/ingestion/ingestion-spec.md index f93175131a97..6a1303efd9f0 100644 --- a/docs/ingestion/ingestion-spec.md +++ b/docs/ingestion/ingestion-spec.md @@ -389,22 +389,79 @@ The `expression` is a [Druid query expression](../querying/math-expr.md). #### Scan transform -The scan transform unnests array-valued columns during ingestion, producing multiple output rows from a single input row. This allows streaming ingestion (Kafka, Kinesis) to explode arrays into individual rows at ingest time, rather than at query time. +The scan transform processes each input row through an embedded [scan query](../querying/scan-query.md) during ingestion. Its primary use case is unnesting array-valued columns, producing multiple output rows from a single input row. 
This allows streaming ingestion (Kafka, Kinesis) to explode arrays into individual rows at ingest time, rather than at query time. + +The scan transform wraps each input row in a temporary single-row segment, runs the configured scan query against it, and emits the resulting rows. The scan query's data source must use `"__input__"` as the base table name. + +**Unnesting a string array:** ```json { "type": "scan", "name": "tag", - "unnestColumn": { - "type": "expression", - "name": "tag", - "expression": "\"tags\"", - "outputType": "STRING" - }, - "unnestFilter": { - "type": "selector", - "dimension": "tag", - "value": "sports" + "query": { + "queryType": "scan", + "dataSource": { + "type": "unnest", + "base": { "type": "table", "name": "__input__" }, + "virtualColumn": { + "type": "expression", + "name": "tag", + "expression": "\"tags\"", + "outputType": "STRING" + } + }, + "intervals": { "type": "intervals", "intervals": ["-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"] }, + "resultFormat": "list" + } +} +``` + +**Unnesting an array of JSON objects:** + +```json +{ + "type": "scan", + "name": "service", + "query": { + "queryType": "scan", + "dataSource": { + "type": "unnest", + "base": { "type": "table", "name": "__input__" }, + "virtualColumn": { + "type": "expression", + "name": "service", + "expression": "\"services\"", + "outputType": "COMPLEX" + } + }, + "intervals": { "type": "intervals", "intervals": ["-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"] }, + "resultFormat": "list" + } +} +``` + +**Unnesting with a filter (only matching elements produce rows):** + +```json +{ + "type": "scan", + "name": "tag", + "query": { + "queryType": "scan", + "dataSource": { + "type": "unnest", + "base": { "type": "table", "name": "__input__" }, + "virtualColumn": { + "type": "expression", + "name": "tag", + "expression": "\"tags\"", + "outputType": "STRING" + }, + "unnestFilter": { "type": "selector", "dimension": "tag", "value": "sports" } + }, + "intervals": { "type": "intervals", "intervals": ["-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"] }, + "resultFormat": "list" } } ``` @@ -412,9 +469,8 @@ The scan transform unnests array-valued columns during ingestion, producing mult |Property|Description|Required| |--------|-----------|--------| |`type`|Must be `"scan"`.|Yes| -|`name`|Output name for this transform.|Yes| -|`unnestColumn`|A [virtual column](../querying/virtual-columns.md) that defines which column to unnest and the output column name. Use `outputType` of `"STRING"` for string arrays or `"COMPLEX"` for arrays of objects.|Yes| -|`unnestFilter`|An optional [filter](../querying/filters.md) applied to the unnested output column. Only array elements matching this filter produce output rows.|No| +|`name`|Output name for this transform. This is also used to identify the transform and should match the virtual column name in the query's unnest data source.|Yes| +|`query`|A [scan query](../querying/scan-query.md) that defines how to process each input row. Use an [unnest data source](../querying/datasource.md#unnest) with `"__input__"` as the base table to unnest arrays. The `intervals` should be set to eternity and `resultFormat` to `"list"`.|Yes| You can define multiple scan transforms in the `transforms` list. They are applied sequentially, producing a cross join. For example, unnesting both `tags` (2 elements) and `services` (3 elements) produces 6 rows per input row. 
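For example, a `transformSpec` that applies both of the unnest transforms shown above looks like the following sketch (the `tags` and `services` input columns are illustrative):

```json
{
  "transformSpec": {
    "transforms": [
      {
        "type": "scan",
        "name": "tag",
        "query": {
          "queryType": "scan",
          "dataSource": {
            "type": "unnest",
            "base": { "type": "table", "name": "__input__" },
            "virtualColumn": { "type": "expression", "name": "tag", "expression": "\"tags\"", "outputType": "STRING" }
          },
          "intervals": { "type": "intervals", "intervals": ["-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"] },
          "resultFormat": "list"
        }
      },
      {
        "type": "scan",
        "name": "service",
        "query": {
          "queryType": "scan",
          "dataSource": {
            "type": "unnest",
            "base": { "type": "table", "name": "__input__" },
            "virtualColumn": { "type": "expression", "name": "service", "expression": "\"services\"", "outputType": "COMPLEX" }
          },
          "intervals": { "type": "intervals", "intervals": ["-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"] },
          "resultFormat": "list"
        }
      }
    ]
  }
}
```

A row with two `tags` elements and three `services` elements produces the six cross-joined output rows described above.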
diff --git a/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaScanTransformTest.java b/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaScanTransformTest.java index cde856e1f38f..c08d765704da 100644 --- a/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaScanTransformTest.java +++ b/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaScanTransformTest.java @@ -32,6 +32,10 @@ import org.apache.druid.java.util.common.granularity.Granularities; import org.apache.druid.math.expr.ExprMacroTable; import org.apache.druid.query.DruidMetrics; +import org.apache.druid.query.Druids; +import org.apache.druid.query.TableDataSource; +import org.apache.druid.query.UnnestDataSource; +import org.apache.druid.query.scan.ScanQuery; import org.apache.druid.segment.TestHelper; import org.apache.druid.segment.column.ColumnType; import org.apache.druid.segment.transform.ScanTransform; @@ -130,13 +134,29 @@ private void submitSupervisor() ImmutableList.of( new ScanTransform( "tag", - new ExpressionVirtualColumn("tag", "\"tags\"", ColumnType.STRING, ExprMacroTable.nil()), - null + Druids.newScanQueryBuilder() + .dataSource(UnnestDataSource.create( + new TableDataSource("__input__"), + new ExpressionVirtualColumn("tag", "\"tags\"", ColumnType.STRING, ExprMacroTable.nil()), + null + )) + .eternityInterval() + .columns((List) null) + .resultFormat(ScanQuery.ResultFormat.RESULT_FORMAT_LIST) + .build() ), new ScanTransform( "svc", - new ExpressionVirtualColumn("svc", "\"services\"", ColumnType.NESTED_DATA, ExprMacroTable.nil()), - null + Druids.newScanQueryBuilder() + .dataSource(UnnestDataSource.create( + new TableDataSource("__input__"), + new ExpressionVirtualColumn("svc", "\"services\"", ColumnType.NESTED_DATA, ExprMacroTable.nil()), + null + )) + .eternityInterval() + .columns((List) null) + .resultFormat(ScanQuery.ResultFormat.RESULT_FORMAT_LIST) + .build() ) ) ); @@ -221,11 +241,10 @@ private void publishTestData() throws JsonProcessingException @Timeout(60) public void test_countRows() { - final long count = Long.parseLong(cluster.runSql( - StringUtils.format("SELECT COUNT(*) FROM \"%s\"", dataSource) - ).trim()); - // Exactly 9: alice(4) + bob(3) + carol(1 passthrough) + dave(1 passthrough). 
- Assertions.assertEquals(EXPECTED_ROWS, count); + Assertions.assertEquals( + String.valueOf(EXPECTED_ROWS), + cluster.runSql(StringUtils.format("SELECT COUNT(*) FROM \"%s\"", dataSource)).trim() + ); } @Test @@ -261,11 +280,22 @@ public void test_groupByTag() dataSource ) ); + + Assertions.assertEquals( + "music,3\nnews,2\nsports,2", + result.trim() + ); + // music: 1 x 3 services = 3, news: 1 x 2 services = 2, sports: 1 x 2 services = 2 // carol/dave have null tags so they don't appear in this grouping Assertions.assertEquals( "music,3\nnews,2\nsports,2", - result.trim() + cluster.runSql( + StringUtils.format( + "SELECT \"tag\", COUNT(*) AS cnt FROM \"%s\" WHERE \"tag\" IS NOT NULL GROUP BY 1 ORDER BY 1", + dataSource + ) + ) ); } @@ -273,15 +303,14 @@ public void test_groupByTag() @Timeout(60) public void test_groupByUser() { - final String result = cluster.runSql( - StringUtils.format( - "SELECT \"user\", COUNT(*) AS cnt FROM \"%s\" GROUP BY 1 ORDER BY 1", - dataSource - ) - ); Assertions.assertEquals( "alice,4\nbob,3\ncarol,1\ndave,1", - result.trim() + cluster.runSql( + StringUtils.format( + "SELECT \"user\", COUNT(*) AS cnt FROM \"%s\" GROUP BY 1 ORDER BY 1", + dataSource + ) + ) ); } diff --git a/processing/src/main/java/org/apache/druid/segment/transform/ScanTransform.java b/processing/src/main/java/org/apache/druid/segment/transform/ScanTransform.java index bee012556144..3ffa1ca8ccfa 100644 --- a/processing/src/main/java/org/apache/druid/segment/transform/ScanTransform.java +++ b/processing/src/main/java/org/apache/druid/segment/transform/ScanTransform.java @@ -23,19 +23,16 @@ import com.fasterxml.jackson.annotation.JsonProperty; import org.apache.druid.data.input.InputRow; import org.apache.druid.data.input.MapBasedInputRow; -import org.apache.druid.java.util.common.Intervals; import org.apache.druid.java.util.common.guava.Sequences; -import org.apache.druid.query.filter.DimFilter; -import org.apache.druid.segment.BaseObjectColumnValueSelector; -import org.apache.druid.segment.ColumnSelectorFactory; -import org.apache.druid.segment.Cursor; -import org.apache.druid.segment.CursorBuildSpec; -import org.apache.druid.segment.CursorFactory; -import org.apache.druid.segment.CursorHolder; +import org.apache.druid.query.QueryContexts; +import org.apache.druid.query.context.ResponseContext; +import org.apache.druid.query.scan.ScanQuery; +import org.apache.druid.query.scan.ScanQueryEngine; +import org.apache.druid.query.scan.ScanResultValue; import org.apache.druid.segment.RowAdapters; import org.apache.druid.segment.RowBasedSegment; -import org.apache.druid.segment.UnnestCursorFactory; -import org.apache.druid.segment.VirtualColumn; +import org.apache.druid.segment.Segment; +import org.apache.druid.segment.SegmentMapFunction; import org.apache.druid.segment.column.ColumnHolder; import org.apache.druid.segment.column.ColumnType; import org.apache.druid.segment.column.RowSignature; @@ -47,33 +44,32 @@ import java.util.List; import java.util.Map; import java.util.Objects; +import java.util.Optional; import java.util.Set; /** - * A multi-row transform that unnests array columns during ingestion using the cursor-based unnest machinery. - * Each input row is wrapped in a single-row segment, the unnest cursor iterates over array elements, - * and each element becomes a separate output row. + * A multi-row transform that processes each input row through the scan query engine during ingestion. 
+ * Each input row is wrapped in a single-row segment and run through the configured {@link ScanQuery}, + * which can include UNNEST (via {@link org.apache.druid.query.UnnestDataSource}), filters, virtual columns, etc. * - * If the unnest column is missing or the array is empty, the input row passes through with the - * unnest output column set to null. + * If the query produces no output rows (e.g., empty/missing array), the input row passes through + * with null values for any new columns. */ public class ScanTransform implements Transform { + private static final ScanQueryEngine ENGINE = new ScanQueryEngine(); + private final String name; - private final VirtualColumn unnestColumn; - @Nullable - private final DimFilter unnestFilter; + private final ScanQuery query; @JsonCreator public ScanTransform( @JsonProperty("name") final String name, - @JsonProperty("unnestColumn") final VirtualColumn unnestColumn, - @JsonProperty("unnestFilter") @Nullable final DimFilter unnestFilter + @JsonProperty("query") final ScanQuery query ) { this.name = name; - this.unnestColumn = unnestColumn; - this.unnestFilter = unnestFilter; + this.query = query; } @Override @@ -91,22 +87,15 @@ public RowFunction getRowFunction() } @JsonProperty - public VirtualColumn getUnnestColumn() + public ScanQuery getQuery() { - return unnestColumn; - } - - @JsonProperty - @Nullable - public DimFilter getUnnestFilter() - { - return unnestFilter; + return query; } @Override public Set getRequiredColumns() { - return Set.copyOf(unnestColumn.requiredColumns()); + return Set.copyOf(query.getDataSource().getTableNames()); } @Override @@ -118,25 +107,7 @@ public boolean isMultiRow() @Override public List applyMultiRow(final InputRow inputRow) { - final List columns = getColumnsForProcessing(inputRow); - final String unnestOutputName = unnestColumn.getOutputName(); - if (!columns.contains(unnestOutputName)) { - columns.add(unnestOutputName); - } - - final List dimensionColumns = new ArrayList<>(inputRow.getDimensions()); - if (!dimensionColumns.contains(unnestOutputName)) { - dimensionColumns.add(unnestOutputName); - } - - final RowSignature.Builder signatureBuilder = RowSignature.builder(); - signatureBuilder.add(ColumnHolder.TIME_COLUMN_NAME, ColumnType.LONG); - for (final String column : columns) { - if (!ColumnHolder.TIME_COLUMN_NAME.equals(column)) { - signatureBuilder.add(column, ColumnType.NESTED_DATA); - } - } - final RowSignature inputSignature = signatureBuilder.build(); + final RowSignature inputSignature = buildSignature(inputRow); final RowBasedSegment segment = new RowBasedSegment<>( Sequences.simple(List.of(inputRow)), @@ -144,95 +115,81 @@ public List applyMultiRow(final InputRow inputRow) inputSignature ); - final CursorFactory baseCursorFactory = segment.as(CursorFactory.class); - final CursorBuildSpec cursorBuildSpec = CursorBuildSpec.builder().setInterval(Intervals.ETERNITY).build(); - try (final CursorHolder cursorHolder = makeUnnestCursorFactory(baseCursorFactory, unnestFilter).makeCursorHolder(cursorBuildSpec)) { - final Cursor cursor = cursorHolder.asCursor(); - if (cursor == null) { - return List.of(); - } + final Segment mappedSegment = applySegmentMapFunction(segment); - final ColumnSelectorFactory factory = cursor.getColumnSelectorFactory(); - final List selectors = new ArrayList<>(columns.size()); - for (final String column : columns) { - selectors.add(factory.makeColumnValueSelector(column)); - } + final ScanQuery queryWithoutTimeout = query.withOverriddenContext( + Map.of(QueryContexts.TIMEOUT_KEY, 0) 
+ ); - final List result = new ArrayList<>(); + final List scanResults = ENGINE.process( + queryWithoutTimeout, + mappedSegment, + ResponseContext.createEmpty(), + null + ).toList(); - while (!cursor.isDone()) { - final Map event = new LinkedHashMap<>(); - for (int i = 0; i < columns.size(); i++) { - final Object value = selectors.get(i).getObject(); - if (value != null) { - event.put(columns.get(i), value); - } - } + final List result = new ArrayList<>(); + for (final ScanResultValue scanResult : scanResults) { + final List dimensionColumns = resolveDimensionColumns(inputRow, scanResult.getColumns()); + @SuppressWarnings("unchecked") + final List> events = (List>) scanResult.getEvents(); + for (final Map event : events) { result.add(new MapBasedInputRow(inputRow.getTimestampFromEpoch(), dimensionColumns, event)); - cursor.advance(); } + } - if (result.isEmpty()) { - if (unnestFilter != null && hasAnyUnnestValues(baseCursorFactory, cursorBuildSpec)) { - return List.of(); - } - - final Map passthroughEvent = new LinkedHashMap<>(); - for (final String column : columns) { - if (!ColumnHolder.TIME_COLUMN_NAME.equals(column)) { - passthroughEvent.put(column, inputRow.getRaw(column)); - } - } - passthroughEvent.put(unnestOutputName, null); - result.add(new MapBasedInputRow(inputRow.getTimestampFromEpoch(), dimensionColumns, passthroughEvent)); + if (result.isEmpty()) { + final List dimensionColumns = resolveDimensionColumns(inputRow, null); + final Map passthroughEvent = new LinkedHashMap<>(); + for (final String dim : inputRow.getDimensions()) { + passthroughEvent.put(dim, inputRow.getRaw(dim)); } - - return result; + result.add(new MapBasedInputRow(inputRow.getTimestampFromEpoch(), dimensionColumns, passthroughEvent)); } + + return result; } - private List getColumnsForProcessing(final InputRow inputRow) + private Segment applySegmentMapFunction(final Segment segment) { - final LinkedHashSet columns = new LinkedHashSet<>(); - columns.add(ColumnHolder.TIME_COLUMN_NAME); - columns.addAll(inputRow.getDimensions()); - - final MapBasedInputRow mapBasedInputRow = getMapBasedInputRow(inputRow); - if (mapBasedInputRow != null) { - columns.addAll(mapBasedInputRow.getEvent().keySet()); - } - - if (inputRow instanceof TransformedInputRow) { - columns.addAll(((TransformedInputRow) inputRow).getTransformedColumns()); - } - - return new ArrayList<>(columns); + final SegmentMapFunction mapFunction = query.getDataSource().createSegmentMapFunction(query); + final Optional mapped = mapFunction.apply(Optional.of(segment)); + return mapped.orElse(segment); } - @Nullable - private static MapBasedInputRow getMapBasedInputRow(final InputRow inputRow) + private static RowSignature buildSignature(final InputRow inputRow) { - if (inputRow instanceof MapBasedInputRow) { - return (MapBasedInputRow) inputRow; - } - if (inputRow instanceof TransformedInputRow) { - return getMapBasedInputRow(((TransformedInputRow) inputRow).getBaseRow()); + final RowSignature.Builder signatureBuilder = RowSignature.builder(); + signatureBuilder.add(ColumnHolder.TIME_COLUMN_NAME, ColumnType.LONG); + for (final String dim : inputRow.getDimensions()) { + signatureBuilder.add(dim, ColumnType.NESTED_DATA); } - return null; + return signatureBuilder.build(); } - private UnnestCursorFactory makeUnnestCursorFactory(final CursorFactory baseCursorFactory, @Nullable final DimFilter filter) + private List resolveDimensionColumns(final InputRow inputRow, @Nullable final List scanResultColumns) { - return new UnnestCursorFactory(baseCursorFactory, 
unnestColumn, filter); - } + final LinkedHashSet dims = new LinkedHashSet<>(inputRow.getDimensions()); - private boolean hasAnyUnnestValues(final CursorFactory baseCursorFactory, final CursorBuildSpec cursorBuildSpec) - { - try (final CursorHolder unfilteredCursorHolder = makeUnnestCursorFactory(baseCursorFactory, null).makeCursorHolder(cursorBuildSpec)) { - final Cursor unfilteredCursor = unfilteredCursorHolder.asCursor(); - return unfilteredCursor != null && !unfilteredCursor.isDone(); + if (scanResultColumns != null) { + for (final String col : scanResultColumns) { + if (!ColumnHolder.TIME_COLUMN_NAME.equals(col)) { + dims.add(col); + } + } } + + final List queryColumns = query.getColumns(); + if (queryColumns != null) { + for (final String col : queryColumns) { + if (!ColumnHolder.TIME_COLUMN_NAME.equals(col)) { + dims.add(col); + } + } + } + + return new ArrayList<>(dims); } @Override @@ -246,14 +203,13 @@ public boolean equals(final Object o) } final ScanTransform that = (ScanTransform) o; return Objects.equals(name, that.name) - && Objects.equals(unnestColumn, that.unnestColumn) - && Objects.equals(unnestFilter, that.unnestFilter); + && Objects.equals(query, that.query); } @Override public int hashCode() { - return Objects.hash(name, unnestColumn, unnestFilter); + return Objects.hash(name, query); } @Override @@ -261,8 +217,7 @@ public String toString() { return "ScanTransform{" + "name=" + name + - ", unnestColumn=" + unnestColumn + - ", unnestFilter=" + unnestFilter + + ", query=" + query + '}'; } } diff --git a/processing/src/main/java/org/apache/druid/segment/transform/TransformedInputRow.java b/processing/src/main/java/org/apache/druid/segment/transform/TransformedInputRow.java index f0d235ecc0b9..ab96b3c75b05 100644 --- a/processing/src/main/java/org/apache/druid/segment/transform/TransformedInputRow.java +++ b/processing/src/main/java/org/apache/druid/segment/transform/TransformedInputRow.java @@ -121,7 +121,7 @@ public InputRow getBaseRow() return row; } - Set getTransformedColumns() + public Set getTransformedColumns() { return Set.copyOf(transforms.keySet()); } diff --git a/processing/src/test/java/org/apache/druid/segment/transform/ScanTransformTest.java b/processing/src/test/java/org/apache/druid/segment/transform/ScanTransformTest.java index 2625052503c1..ffa91cbfe39b 100644 --- a/processing/src/test/java/org/apache/druid/segment/transform/ScanTransformTest.java +++ b/processing/src/test/java/org/apache/druid/segment/transform/ScanTransformTest.java @@ -20,12 +20,17 @@ package org.apache.druid.segment.transform; import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.base.Preconditions; import org.apache.druid.data.input.InputRow; import org.apache.druid.data.input.MapBasedInputRow; import org.apache.druid.java.util.common.DateTimes; import org.apache.druid.math.expr.ExprMacroTable; +import org.apache.druid.query.Druids; +import org.apache.druid.query.TableDataSource; +import org.apache.druid.query.UnnestDataSource; import org.apache.druid.query.expression.TestExprMacroTable; import org.apache.druid.query.filter.SelectorDimFilter; +import org.apache.druid.query.scan.ScanQuery; import org.apache.druid.segment.TestHelper; import org.apache.druid.segment.column.ColumnHolder; import org.apache.druid.segment.column.ColumnType; @@ -35,6 +40,7 @@ import org.junit.Test; import java.util.ArrayList; +import java.util.LinkedHashMap; import java.util.List; import java.util.Map; @@ -44,7 +50,8 @@ public class ScanTransformTest extends 
InitializedNullHandlingTest private static InputRow makeRow(Object... kvPairs) { - final java.util.LinkedHashMap event = new java.util.LinkedHashMap<>(); + Preconditions.checkArgument(kvPairs.length % 2 == 0, "kvPairs must have even length"); + final LinkedHashMap event = new LinkedHashMap<>(); final List dimensions = new ArrayList<>(); for (int i = 0; i < kvPairs.length; i += 2) { final String key = (String) kvPairs[i]; @@ -57,11 +64,29 @@ private static InputRow makeRow(Object... kvPairs) } private static ScanTransform makeUnnestTransform(String inputColumn, String outputName) + { + return makeUnnestTransform(inputColumn, outputName, ColumnType.STRING, null); + } + + private static ScanTransform makeUnnestTransform( + String inputColumn, + String outputName, + ColumnType outputType, + SelectorDimFilter unnestFilter + ) { return new ScanTransform( outputName, - new ExpressionVirtualColumn(outputName, "\"" + inputColumn + "\"", ColumnType.STRING, ExprMacroTable.nil()), - null + Druids.newScanQueryBuilder() + .dataSource(UnnestDataSource.create( + new TableDataSource("__input__"), + new ExpressionVirtualColumn(outputName, "\"" + inputColumn + "\"", outputType, ExprMacroTable.nil()), + unnestFilter + )) + .eternityInterval() + .columns((List) null) + .resultFormat(ScanQuery.ResultFormat.RESULT_FORMAT_LIST) + .build() ); } @@ -132,18 +157,12 @@ public void testUnnestScalarValue() @Test public void testUnnestArrayOfJsonObjects() { - final ScanTransform transform = new ScanTransform( - "item", - new ExpressionVirtualColumn("item", "\"items\"", ColumnType.NESTED_DATA, ExprMacroTable.nil()), - null - ); - - final List> items = List.of( + final ScanTransform transform = makeUnnestTransform("items", "item", ColumnType.NESTED_DATA, null); + final InputRow input = makeRow("user", "alice", "items", List.of( Map.of("product", "shirt", "price", 25), Map.of("product", "pants", "price", 40), Map.of("product", "hat", "price", 15) - ); - final InputRow input = makeRow("user", "alice", "items", items); + )); final List result = transform.applyMultiRow(input); Assert.assertEquals(3, result.size()); @@ -161,12 +180,7 @@ public void testUnnestArrayOfJsonObjects() @Test public void testUnnestNestedArrays() { - final ScanTransform transform = new ScanTransform( - "element", - new ExpressionVirtualColumn("element", "\"data\"", ColumnType.NESTED_DATA, ExprMacroTable.nil()), - null - ); - + final ScanTransform transform = makeUnnestTransform("data", "element", ColumnType.NESTED_DATA, null); final InputRow input = makeRow( "user", "alice", "data", List.of(List.of(1, 2), List.of(3)) @@ -205,11 +219,7 @@ public void testTimestampPreservation() @Test public void testWithUnnestFilter() { - final ScanTransform transform = new ScanTransform( - "tag", - new ExpressionVirtualColumn("tag", "\"tags\"", ColumnType.STRING, ExprMacroTable.nil()), - new SelectorDimFilter("tag", "b", null) - ); + final ScanTransform transform = makeUnnestTransform("tags", "tag", ColumnType.STRING, new SelectorDimFilter("tag", "b", null)); final InputRow input = makeRow("user", "alice", "tags", List.of("a", "b", "c")); final List result = transform.applyMultiRow(input); @@ -217,20 +227,6 @@ public void testWithUnnestFilter() Assert.assertEquals("b", result.get(0).getRaw("tag")); } - @Test - public void testWithUnnestFilterDropsRowWhenNoValuesMatch() - { - final ScanTransform transform = new ScanTransform( - "tag", - new ExpressionVirtualColumn("tag", "\"tags\"", ColumnType.STRING, ExprMacroTable.nil()), - new SelectorDimFilter("tag", "z", null) - ); 
- final InputRow input = makeRow("user", "alice", "tags", List.of("a", "b", "c")); - - final List result = transform.applyMultiRow(input); - Assert.assertTrue(result.isEmpty()); - } - @Test public void testIsMultiRow() { @@ -239,13 +235,6 @@ public void testIsMultiRow() Assert.assertNull(transform.getRowFunction()); } - @Test - public void testGetRequiredColumns() - { - final ScanTransform transform = makeUnnestTransform("tags", "tag"); - Assert.assertTrue(transform.getRequiredColumns().contains("tags")); - } - // --- Transformer integration tests --- @Test @@ -295,21 +284,11 @@ public void testTransformerWithMultipleScanTransforms() @Test public void testTransformerWithChainedScanTransformsFlattensNestedArrays() { - // Two scan transforms: first unnests [[1,2],[3]] into [1,2] and [3], - // second unnests those inner arrays into individual elements. final TransformSpec spec = new TransformSpec( null, List.of( - new ScanTransform( - "inner", - new ExpressionVirtualColumn("inner", "\"data\"", ColumnType.NESTED_DATA, ExprMacroTable.nil()), - null - ), - new ScanTransform( - "val", - new ExpressionVirtualColumn("val", "\"inner\"", ColumnType.LONG, ExprMacroTable.nil()), - null - ) + makeUnnestTransform("data", "inner", ColumnType.NESTED_DATA, null), + makeUnnestTransform("inner", "val", ColumnType.LONG, null) ) ); @@ -330,7 +309,10 @@ public void testTransformerWithChainedScanTransformsFlattensNestedArrays() values.add(row.getRaw("val")); Assert.assertEquals("alice", row.getRaw("user")); } - Assert.assertEquals(List.of(1, 2, 3), values); + Assert.assertEquals(3, values.size()); + Assert.assertEquals(1, ((Number) values.get(0)).intValue()); + Assert.assertEquals(2, ((Number) values.get(1)).intValue()); + Assert.assertEquals(3, ((Number) values.get(2)).intValue()); } @Test @@ -353,28 +335,6 @@ public void testTransformerWithExpressionAndScanTransforms() Assert.assertEquals(2, result.size()); Assert.assertEquals("a", result.get(0).getRaw("tag")); Assert.assertEquals("b", result.get(1).getRaw("tag")); - Assert.assertEquals("ALICE", result.get(0).getRaw("upper_user")); - Assert.assertEquals("ALICE", result.get(1).getRaw("upper_user")); - } - - @Test - public void testTransformerWithScanTransformPreservesNonDimensionColumns() - { - final TransformSpec spec = new TransformSpec( - null, - List.of(makeUnnestTransform("tags", "tag")) - ); - final Transformer transformer = spec.toTransformer(); - final InputRow input = new MapBasedInputRow( - TIMESTAMP, - List.of("user", "tags"), - Map.of("user", "alice", "metricCount", 5L, "tags", List.of("a", "b")) - ); - - final List result = transformer.transformToList(input); - Assert.assertEquals(2, result.size()); - Assert.assertEquals(5L, result.get(0).getRaw("metricCount")); - Assert.assertEquals(5L, result.get(1).getRaw("metricCount")); } @Test @@ -419,11 +379,7 @@ public void testSerde() throws Exception final TransformSpec spec = new TransformSpec( null, List.of( - new ScanTransform( - "tag", - new ExpressionVirtualColumn("tag", "\"tags\"", ColumnType.STRING, ExprMacroTable.nil()), - new SelectorDimFilter("tag", "a", null) - ) + makeUnnestTransform("tags", "tag", ColumnType.STRING, new SelectorDimFilter("tag", "a", null)) ) ); @@ -440,11 +396,7 @@ public void testSerdeWithMixedTransforms() throws Exception null, List.of( new ExpressionTransform("upper_user", "upper(\"user\")", TestExprMacroTable.INSTANCE), - new ScanTransform( - "tag", - new ExpressionVirtualColumn("tag", "\"tags\"", ColumnType.STRING, ExprMacroTable.nil()), - null - ) + 
makeUnnestTransform("tags", "tag") ) ); diff --git a/processing/src/test/java/org/apache/druid/segment/transform/TransformerTest.java b/processing/src/test/java/org/apache/druid/segment/transform/TransformerTest.java index 1a959dfd8795..df499432c12e 100644 --- a/processing/src/test/java/org/apache/druid/segment/transform/TransformerTest.java +++ b/processing/src/test/java/org/apache/druid/segment/transform/TransformerTest.java @@ -28,8 +28,12 @@ import org.apache.druid.data.input.Row; import org.apache.druid.java.util.common.DateTimes; import org.apache.druid.java.util.common.parsers.ParseException; +import org.apache.druid.query.Druids; +import org.apache.druid.query.TableDataSource; +import org.apache.druid.query.UnnestDataSource; import org.apache.druid.query.expression.TestExprMacroTable; import org.apache.druid.query.filter.SelectorDimFilter; +import org.apache.druid.query.scan.ScanQuery; import org.apache.druid.segment.column.ColumnType; import org.apache.druid.segment.virtual.ExpressionVirtualColumn; import org.apache.druid.testing.InitializedNullHandlingTest; @@ -326,8 +330,15 @@ public void testInputRowListPlusRawValuesTransformWithScanTransformExpandsRowsAn ImmutableList.of( new ScanTransform( "tag", - new ExpressionVirtualColumn("tag", "\"tags\"", ColumnType.STRING, TestExprMacroTable.INSTANCE), - null + Druids.newScanQueryBuilder() + .dataSource(UnnestDataSource.create( + new TableDataSource("__input__"), + new ExpressionVirtualColumn("tag", "\"tags\"", ColumnType.STRING, TestExprMacroTable.INSTANCE), + null + )) + .eternityInterval() + .resultFormat(ScanQuery.ResultFormat.RESULT_FORMAT_LIST) + .build() ) ) ) From 493f307eb065040921c80b77a34a2827511dde94 Mon Sep 17 00:00:00 2001 From: Abhishek Balaji Radhakrishnan Date: Mon, 27 Apr 2026 12:39:52 -0700 Subject: [PATCH 4/7] doc & embedded test fix --- docs/ingestion/ingestion-spec.md | 122 ++++++++---------- .../indexing/KafkaScanTransformTest.java | 14 +- 2 files changed, 68 insertions(+), 68 deletions(-) diff --git a/docs/ingestion/ingestion-spec.md b/docs/ingestion/ingestion-spec.md index 6a1303efd9f0..39bc91f007d0 100644 --- a/docs/ingestion/ingestion-spec.md +++ b/docs/ingestion/ingestion-spec.md @@ -389,88 +389,80 @@ The `expression` is a [Druid query expression](../querying/math-expr.md). #### Scan transform -The scan transform processes each input row through an embedded [scan query](../querying/scan-query.md) during ingestion. Its primary use case is unnesting array-valued columns, producing multiple output rows from a single input row. This allows streaming ingestion (Kafka, Kinesis) to explode arrays into individual rows at ingest time, rather than at query time. +The scan transform unnests array-valued columns during ingestion, producing multiple output rows from a single input row. This allows streaming ingestion (Kafka, Kinesis) to explode arrays into individual rows at ingest time, rather than at query time. -The scan transform wraps each input row in a temporary single-row segment, runs the configured scan query against it, and emits the resulting rows. The scan query's data source must use `"__input__"` as the base table name. +Each input row is wrapped in a temporary single-row segment and run through the configured [scan query](../querying/scan-query.md). The scan query uses an [unnest data source](../querying/datasource.md#unnest) with `"__input__"` as the base table name. 
-**Unnesting a string array:** +**Example: Unnesting a string array** -```json -{ - "type": "scan", - "name": "tag", - "query": { - "queryType": "scan", - "dataSource": { - "type": "unnest", - "base": { "type": "table", "name": "__input__" }, - "virtualColumn": { - "type": "expression", - "name": "tag", - "expression": "\"tags\"", - "outputType": "STRING" - } - }, - "intervals": { "type": "intervals", "intervals": ["-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"] }, - "resultFormat": "list" - } -} -``` - -**Unnesting an array of JSON objects:** +Given input rows with a `tags` column containing `["sports", "news"]`, this `transformSpec` produces one output row per tag: ```json -{ - "type": "scan", - "name": "service", - "query": { - "queryType": "scan", - "dataSource": { - "type": "unnest", - "base": { "type": "table", "name": "__input__" }, - "virtualColumn": { - "type": "expression", - "name": "service", - "expression": "\"services\"", - "outputType": "COMPLEX" +"transformSpec": { + "transforms": [ + { + "type": "scan", + "name": "tag", + "query": { + "queryType": "scan", + "dataSource": { + "type": "unnest", + "base": { "type": "table", "name": "__input__" }, + "virtualColumn": { + "type": "expression", + "name": "tag", + "expression": "\"tags\"", + "outputType": "STRING" + } + }, + "intervals": { "type": "intervals", "intervals": ["-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"] }, + "resultFormat": "list" } - }, - "intervals": { "type": "intervals", "intervals": ["-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"] }, - "resultFormat": "list" - } + } + ] } ``` -**Unnesting with a filter (only matching elements produce rows):** +**Example: Unnesting an array of JSON objects with a filter** + +Given input rows with a `services` column containing `[{"type": "web", "dc": "us-east1"}, {"type": "api", "dc": "us-west2"}]`, this `transformSpec` unnests each object into its own row. The optional `unnestFilter` keeps only elements where `service.type` equals `"web"`: ```json -{ - "type": "scan", - "name": "tag", - "query": { - "queryType": "scan", - "dataSource": { - "type": "unnest", - "base": { "type": "table", "name": "__input__" }, - "virtualColumn": { - "type": "expression", - "name": "tag", - "expression": "\"tags\"", - "outputType": "STRING" - }, - "unnestFilter": { "type": "selector", "dimension": "tag", "value": "sports" } - }, - "intervals": { "type": "intervals", "intervals": ["-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"] }, - "resultFormat": "list" - } +"transformSpec": { + "transforms": [ + { + "type": "scan", + "name": "service", + "query": { + "queryType": "scan", + "dataSource": { + "type": "unnest", + "base": { "type": "table", "name": "__input__" }, + "virtualColumn": { + "type": "expression", + "name": "service", + "expression": "\"services\"", + "outputType": "COMPLEX" + }, + "unnestFilter": { + "type": "selector", + "dimension": "service", + "value": "web" + } + }, + "intervals": { "type": "intervals", "intervals": ["-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"] }, + "resultFormat": "list" + } + } + ] } ``` |Property|Description|Required| |--------|-----------|--------| |`type`|Must be `"scan"`.|Yes| -|`name`|Output name for this transform. This is also used to identify the transform and should match the virtual column name in the query's unnest data source.|Yes| -|`query`|A [scan query](../querying/scan-query.md) that defines how to process each input row. 
Use an [unnest data source](../querying/datasource.md#unnest) with `"__input__"` as the base table to unnest arrays. The `intervals` should be set to eternity and `resultFormat` to `"list"`.|Yes| +|`name`|Output name for this transform. Should match the virtual column name in the query's unnest data source.|Yes| +|`query`|A [scan query](../querying/scan-query.md) that defines how to process each input row. Use an [unnest data source](../querying/datasource.md#unnest) with `"__input__"` as the base table to unnest arrays. Set `intervals` to eternity and `resultFormat` to `"list"`.|Yes| You can define multiple scan transforms in the `transforms` list. They are applied sequentially, producing a cross join. For example, unnesting both `tags` (2 elements) and `services` (3 elements) produces 6 rows per input row. diff --git a/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaScanTransformTest.java b/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaScanTransformTest.java index c08d765704da..ce9d39ea21fc 100644 --- a/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaScanTransformTest.java +++ b/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaScanTransformTest.java @@ -38,6 +38,7 @@ import org.apache.druid.query.scan.ScanQuery; import org.apache.druid.segment.TestHelper; import org.apache.druid.segment.column.ColumnType; +import org.apache.druid.segment.metadata.Metric; import org.apache.druid.segment.transform.ScanTransform; import org.apache.druid.segment.transform.TransformSpec; import org.apache.druid.segment.virtual.ExpressionVirtualColumn; @@ -68,9 +69,11 @@ */ public class KafkaScanTransformTest extends EmbeddedClusterTestBase { - // alice: 2 tags x 2 services = 4, bob: 1 tag x 3 services = 3 = 7 unnested rows - // carol (null arrays) and dave (missing columns) each produce 1 passthrough row = 2 - // total: 9 + /** + * alice: 2 tags x 2 services = 4, bob: 1 tag x 3 services = 3 = 7 unnested rows + * carol (null arrays) and dave (missing columns) each produce 1 passthrough row = 2 + * total: 9 + */ private static final int EXPECTED_ROWS = 9; private final KafkaResource kafka = new KafkaResource(); @@ -125,6 +128,11 @@ void setupAll() throws JsonProcessingException .hasDimension(DruidMetrics.DATASOURCE, dataSource), agg -> agg.hasSumAtLeast(EXPECTED_ROWS) ); + + broker.latchableEmitter().waitForEvent( + event -> event.hasMetricName(Metric.SCHEMA_ROW_SIGNATURE_COLUMN_COUNT) + .hasDimension(DruidMetrics.DATASOURCE, dataSource) + ); } private void submitSupervisor() From 11fc526cb4127cb14c825fdb84b3bab9a0b3ba8a Mon Sep 17 00:00:00 2001 From: Abhishek Balaji Radhakrishnan Date: Mon, 27 Apr 2026 23:24:26 -0700 Subject: [PATCH 5/7] Separate interface & settable cursor impl --- docs/ingestion/ingestion-spec.md | 156 +++++----- .../indexing/KafkaScanTransformTest.java | 111 ++++--- .../druid/indexing/input/InputRowSchemas.java | 4 +- .../SettableByteEntityReader.java | 4 +- .../seekablestream/StreamChunkReader.java | 4 +- .../segment/transform/BaseTransformSpec.java | 86 ++++++ .../segment/transform/BaseTransformer.java | 72 +++++ .../transform/CompactionTransformSpec.java | 5 +- .../segment/transform/ScanTransform.java | 223 -------------- .../segment/transform/ScanTransformSpec.java | 125 ++++++++ .../segment/transform/ScanTransformer.java | 285 ++++++++++++++++++ .../transform/SettableRowCursorFactory.java | 161 ++++++++++ .../druid/segment/transform/Transform.java | 27 +- 
.../segment/transform/TransformSpec.java | 16 +- .../transform/TransformedInputRow.java | 6 - .../druid/segment/transform/Transformer.java | 76 +---- .../TransformingInputEntityReader.java | 5 +- .../TransformingInputSourceReader.java | 4 +- .../apache/druid/segment/IndexBuilder.java | 13 +- .../segment/generator/SegmentGenerator.java | 6 +- .../segment/transform/ScanTransformTest.java | 241 +++++++-------- .../segment/transform/TransformerTest.java | 30 +- .../druid/segment/indexing/DataSchema.java | 13 +- 23 files changed, 1053 insertions(+), 620 deletions(-) create mode 100644 processing/src/main/java/org/apache/druid/segment/transform/BaseTransformSpec.java create mode 100644 processing/src/main/java/org/apache/druid/segment/transform/BaseTransformer.java delete mode 100644 processing/src/main/java/org/apache/druid/segment/transform/ScanTransform.java create mode 100644 processing/src/main/java/org/apache/druid/segment/transform/ScanTransformSpec.java create mode 100644 processing/src/main/java/org/apache/druid/segment/transform/ScanTransformer.java create mode 100644 processing/src/main/java/org/apache/druid/segment/transform/SettableRowCursorFactory.java diff --git a/docs/ingestion/ingestion-spec.md b/docs/ingestion/ingestion-spec.md index 39bc91f007d0..cd35872f0dd7 100644 --- a/docs/ingestion/ingestion-spec.md +++ b/docs/ingestion/ingestion-spec.md @@ -334,9 +334,21 @@ A `granularitySpec` can have the following components: ### `transformSpec` The `transformSpec` is located in `dataSchema` → `transformSpec` and is responsible for transforming and filtering -records during ingestion time. It is optional. An example `transformSpec` is: +records during ingestion time. It is optional. There are two types of transform specs: the default expression-based +transform spec and the [scan transform spec](#scan-transform-spec). -``` +:::info + Conceptually, after input data records are read, Druid applies ingestion spec components in a particular order: + first [`flattenSpec`](data-formats.md#flattenspec) (if any), then [`timestampSpec`](#timestampspec), then [`transformSpec`](#transformspec), + and finally [`dimensionsSpec`](#dimensionsspec) and [`metricsSpec`](#metricsspec). Keep this in mind when writing + your ingestion spec. +::: + +#### Expression transform spec + +The default `transformSpec` uses expression-based transforms and an optional filter: + +```json "transformSpec": { "transforms": [ { "type": "expression", "name": "countryUpper", "expression": "upper(country)" } @@ -349,14 +361,7 @@ records during ingestion time. It is optional. An example `transformSpec` is: } ``` -:::info - Conceptually, after input data records are read, Druid applies ingestion spec components in a particular order: - first [`flattenSpec`](data-formats.md#flattenspec) (if any), then [`timestampSpec`](#timestampspec), then [`transformSpec`](#transformspec), - and finally [`dimensionsSpec`](#dimensionsspec) and [`metricsSpec`](#metricsspec). Keep this in mind when writing - your ingestion spec. -::: - -#### Transforms +##### Transforms The `transforms` list allows you to specify a set of expressions to evaluate on top of input data. Each transform has a "name" which can be referred to by your `dimensionsSpec`, `metricsSpec`, etc. @@ -368,7 +373,7 @@ Transforms do have some limitations. They can only refer to fields present in th they cannot refer to other transforms. And they cannot remove fields, only add them. 
However, they can shadow a field with another field containing all nulls, which will act similarly to removing the field. -Druid includes two kinds of built-in transforms: expression transforms and [scan transforms](#scan-transform). The expression transform has the following syntax: +The expression transform has the following syntax: ``` { @@ -380,18 +385,17 @@ Druid includes two kinds of built-in transforms: expression transforms and [scan The `expression` is a [Druid query expression](../querying/math-expr.md). -:::info - Conceptually, after input data records are read, Druid applies ingestion spec components in a particular order: - first [`flattenSpec`](data-formats.md#flattenspec) (if any), then [`timestampSpec`](#timestampspec), then [`transformSpec`](#transformspec), - and finally [`dimensionsSpec`](#dimensionsspec) and [`metricsSpec`](#metricsspec). Keep this in mind when writing - your ingestion spec. -::: +##### Filter + +The `filter` conditionally filters input rows during ingestion. Only rows that pass the filter will be +ingested. Any of Druid's standard [query filters](../querying/filters.md) can be used. Note that within a +`transformSpec`, the `transforms` are applied before the `filter`, so the filter can refer to a transform. -#### Scan transform +#### Scan transform spec -The scan transform unnests array-valued columns during ingestion, producing multiple output rows from a single input row. This allows streaming ingestion (Kafka, Kinesis) to explode arrays into individual rows at ingest time, rather than at query time. +The scan transform spec (`"type": "scan"`) processes each input row through an embedded [scan query](../querying/scan-query.md). Its primary use case is unnesting array-valued columns into individual rows during streaming ingestion (Kafka, Kinesis), similar to existing UNNEST functionality with Druid SQL and the MSQ engine. -Each input row is wrapped in a temporary single-row segment and run through the configured [scan query](../querying/scan-query.md). The scan query uses an [unnest data source](../querying/datasource.md#unnest) with `"__input__"` as the base table name. +The scan query uses `"__input__"` as the base table name and can include [unnest data sources](../querying/datasource.md#unnest), [virtual columns](../querying/virtual-columns.md) (for expression-based column derivations), and [filters](../querying/filters.md). 
**Example: Unnesting a string array** @@ -399,82 +403,74 @@ Given input rows with a `tags` column containing `["sports", "news"]`, this `tra ```json "transformSpec": { - "transforms": [ - { - "type": "scan", - "name": "tag", - "query": { - "queryType": "scan", - "dataSource": { - "type": "unnest", - "base": { "type": "table", "name": "__input__" }, - "virtualColumn": { - "type": "expression", - "name": "tag", - "expression": "\"tags\"", - "outputType": "STRING" - } - }, - "intervals": { "type": "intervals", "intervals": ["-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"] }, - "resultFormat": "list" + "type": "scan", + "query": { + "queryType": "scan", + "dataSource": { + "type": "unnest", + "base": { "type": "table", "name": "__input__" }, + "virtualColumn": { + "type": "expression", + "name": "tag", + "expression": "\"tags\"", + "outputType": "STRING" } - } - ] + }, + "intervals": { "type": "intervals", "intervals": ["-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"] }, + "resultFormat": "list" + } } ``` -**Example: Unnesting an array of JSON objects with a filter** +**Example: Unnesting with virtual columns and a filter** -Given input rows with a `services` column containing `[{"type": "web", "dc": "us-east1"}, {"type": "api", "dc": "us-west2"}]`, this `transformSpec` unnests each object into its own row. The optional `unnestFilter` keeps only elements where `service.type` equals `"web"`: +This example unnests both `tags` and `services` arrays (via nested unnest data sources), computes derived columns (`upper_user`, `user_tag`) via virtual columns, and filters with `unnestFilter`: ```json "transformSpec": { - "transforms": [ - { - "type": "scan", - "name": "service", - "query": { - "queryType": "scan", - "dataSource": { - "type": "unnest", - "base": { "type": "table", "name": "__input__" }, - "virtualColumn": { - "type": "expression", - "name": "service", - "expression": "\"services\"", - "outputType": "COMPLEX" - }, - "unnestFilter": { - "type": "selector", - "dimension": "service", - "value": "web" - } - }, - "intervals": { "type": "intervals", "intervals": ["-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"] }, - "resultFormat": "list" + "type": "scan", + "query": { + "queryType": "scan", + "dataSource": { + "type": "unnest", + "base": { + "type": "unnest", + "base": { "type": "table", "name": "__input__" }, + "virtualColumn": { + "type": "expression", + "name": "tag", + "expression": "\"tags\"", + "outputType": "STRING" + } + }, + "virtualColumn": { + "type": "expression", + "name": "service", + "expression": "\"services\"", + "outputType": "COMPLEX" + }, + "unnestFilter": { + "type": "selector", + "dimension": "service", + "value": "web" } - } - ] + }, + "virtualColumns": [ + { "type": "expression", "name": "upper_user", "expression": "upper(\"user\")", "outputType": "STRING" }, + { "type": "expression", "name": "user_tag", "expression": "concat(\"user\", '_', \"tag\")", "outputType": "STRING" } + ], + "intervals": { "type": "intervals", "intervals": ["-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"] }, + "resultFormat": "list" + } } ``` |Property|Description|Required| |--------|-----------|--------| |`type`|Must be `"scan"`.|Yes| -|`name`|Output name for this transform. Should match the virtual column name in the query's unnest data source.|Yes| -|`query`|A [scan query](../querying/scan-query.md) that defines how to process each input row. 
Use an [unnest data source](../querying/datasource.md#unnest) with `"__input__"` as the base table to unnest arrays. Set `intervals` to eternity and `resultFormat` to `"list"`.|Yes| - -You can define multiple scan transforms in the `transforms` list. They are applied sequentially, producing a cross join. For example, unnesting both `tags` (2 elements) and `services` (3 elements) produces 6 rows per input row. +|`query`|A [scan query](../querying/scan-query.md) that defines how to process each input row. Use an [unnest data source](../querying/datasource.md#unnest) with `"__input__"` as the base table to unnest arrays. Nest multiple unnest data sources for cross-join unnesting. Add `virtualColumns` on the scan query for expression-based column derivations. Set `intervals` to eternity and `resultFormat` to `"list"`.|Yes| -If the unnest column is missing or the array is empty, the input row passes through with the unnest output column set to null. - -Expression transforms are applied before scan transforms. The `transformSpec` filter is also applied before any unnesting, so it operates on the original input row. - -#### Filter - -The `filter` conditionally filters input rows during ingestion. Only rows that pass the filter will be -ingested. Any of Druid's standard [query filters](../querying/filters.md) can be used. Note that within a -`transformSpec`, the `transforms` are applied before the `filter`, so the filter can refer to a transform. +If an unnest column is missing or the array is empty, the input row passes through with the unnest output columns set to null. Virtual columns are still evaluated on passthrough rows. ### Projections diff --git a/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaScanTransformTest.java b/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaScanTransformTest.java index ce9d39ea21fc..6b5ee6764991 100644 --- a/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaScanTransformTest.java +++ b/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaScanTransformTest.java @@ -20,7 +20,6 @@ package org.apache.druid.testing.embedded.indexing; import com.fasterxml.jackson.core.JsonProcessingException; -import com.google.common.collect.ImmutableList; import org.apache.druid.data.input.impl.DimensionsSpec; import org.apache.druid.data.input.impl.TimestampSpec; import org.apache.druid.indexer.granularity.UniformGranularitySpec; @@ -39,8 +38,7 @@ import org.apache.druid.segment.TestHelper; import org.apache.druid.segment.column.ColumnType; import org.apache.druid.segment.metadata.Metric; -import org.apache.druid.segment.transform.ScanTransform; -import org.apache.druid.segment.transform.TransformSpec; +import org.apache.druid.segment.transform.ScanTransformSpec; import org.apache.druid.segment.virtual.ExpressionVirtualColumn; import org.apache.druid.testing.embedded.EmbeddedBroker; import org.apache.druid.testing.embedded.EmbeddedClusterApis; @@ -63,9 +61,12 @@ import java.util.TreeSet; /** - * Verifies ScanTransform unnests array columns during Kafka ingestion. - * Uses two scan transforms to unnest both "tags" (string array) and "services" (object array) - * into a single datasource, producing a cross join of tag x service for each input row. + * Verifies ScanTransform during Kafka ingestion: + *
    + *
  • Unnests both "tags" (string array) and "services" (object array) via nested UnnestDataSources
  • + *
  • Computes derived columns via virtual columns (upper case, string concat)
  • + *
  • All in a single scan query — demonstrates unnest + expression transforms combined
  • + *
*/ public class KafkaScanTransformTest extends EmbeddedClusterTestBase { @@ -137,36 +138,35 @@ void setupAll() throws JsonProcessingException private void submitSupervisor() { - final TransformSpec transformSpec = new TransformSpec( - null, - ImmutableList.of( - new ScanTransform( - "tag", - Druids.newScanQueryBuilder() - .dataSource(UnnestDataSource.create( - new TableDataSource("__input__"), - new ExpressionVirtualColumn("tag", "\"tags\"", ColumnType.STRING, ExprMacroTable.nil()), - null - )) - .eternityInterval() - .columns((List) null) - .resultFormat(ScanQuery.ResultFormat.RESULT_FORMAT_LIST) - .build() - ), - new ScanTransform( - "svc", - Druids.newScanQueryBuilder() - .dataSource(UnnestDataSource.create( - new TableDataSource("__input__"), - new ExpressionVirtualColumn("svc", "\"services\"", ColumnType.NESTED_DATA, ExprMacroTable.nil()), - null - )) - .eternityInterval() - .columns((List) null) - .resultFormat(ScanQuery.ResultFormat.RESULT_FORMAT_LIST) - .build() - ) - ) + final ScanTransformSpec transformSpec = new ScanTransformSpec( + Druids.newScanQueryBuilder() + .dataSource(UnnestDataSource.create( + UnnestDataSource.create( + new TableDataSource("__input__"), + new ExpressionVirtualColumn("tag", "\"tags\"", ColumnType.STRING, ExprMacroTable.nil()), + null + ), + new ExpressionVirtualColumn("svc", "\"services\"", ColumnType.NESTED_DATA, ExprMacroTable.nil()), + null + )) + .virtualColumns( + new ExpressionVirtualColumn( + "upper_user", + "upper(\"user\")", + ColumnType.STRING, + ExprMacroTable.nil() + ), + new ExpressionVirtualColumn( + "user_tag", + "concat(\"user\", '_', \"tag\")", + ColumnType.STRING, + ExprMacroTable.nil() + ) + ) + .eternityInterval() + .columns((List) null) + .resultFormat(ScanQuery.ResultFormat.RESULT_FORMAT_LIST) + .build() ); final KafkaSupervisorSpec spec = new KafkaSupervisorSpecBuilder() @@ -372,6 +372,45 @@ public void test_groupByServiceDc() Assertions.assertEquals(expected, actual); } + @Test + @Timeout(60) + public void test_upperCaseVirtualColumn() + { + final String result = cluster.runSql( + StringUtils.format( + "SELECT \"upper_user\", COUNT(*) AS cnt FROM \"%s\" GROUP BY 1 ORDER BY 1", + dataSource + ) + ); + Assertions.assertEquals( + "ALICE,4\nBOB,3\nCAROL,1\nDAVE,1", + result.trim() + ); + } + + @Test + @Timeout(60) + public void test_concatVirtualColumn() + { + // user_tag = concat(user, '_', tag) — computed at ingest time via scan query virtual column + final String result = cluster.runSql( + StringUtils.format( + "SELECT \"user_tag\", COUNT(*) AS cnt" + + " FROM \"%s\"" + + " WHERE \"tag\" IS NOT NULL" + + " GROUP BY 1 ORDER BY 1", + dataSource + ) + ); + final Set actual = new TreeSet<>(List.of(result.trim().split("\n"))); + final Set expected = new TreeSet<>(List.of( + "alice_news,2", // alice_news x 2 services + "alice_sports,2", // alice_sports x 2 services + "bob_music,3" // bob_music x 3 services + )); + Assertions.assertEquals(expected, actual); + } + @Test @Timeout(60) public void test_filterByServiceType() diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/input/InputRowSchemas.java b/indexing-service/src/main/java/org/apache/druid/indexing/input/InputRowSchemas.java index a5f95c2e15f7..4ed007d4f83c 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/input/InputRowSchemas.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/input/InputRowSchemas.java @@ -25,8 +25,8 @@ import org.apache.druid.data.input.impl.TimestampSpec; import 
org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.segment.indexing.DataSchema; +import org.apache.druid.segment.transform.BaseTransformSpec; import org.apache.druid.segment.transform.Transform; -import org.apache.druid.segment.transform.TransformSpec; import java.util.Arrays; import java.util.HashSet; @@ -71,7 +71,7 @@ public static InputRowSchema fromDataSchema(final DataSchema dataSchema) public static ColumnsFilter createColumnsFilter( final TimestampSpec timestampSpec, final DimensionsSpec dimensionsSpec, - final TransformSpec transformSpec, + final BaseTransformSpec transformSpec, final AggregatorFactory[] aggregators ) { diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/SettableByteEntityReader.java b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/SettableByteEntityReader.java index 2314d7408425..80f27de75f67 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/SettableByteEntityReader.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/SettableByteEntityReader.java @@ -28,7 +28,7 @@ import org.apache.druid.data.input.impl.ByteEntity; import org.apache.druid.data.input.impl.JsonInputFormat; import org.apache.druid.java.util.common.parsers.CloseableIterator; -import org.apache.druid.segment.transform.TransformSpec; +import org.apache.druid.segment.transform.BaseTransformSpec; import org.apache.druid.segment.transform.TransformingInputEntityReader; import java.io.File; @@ -46,7 +46,7 @@ class SettableByteEntityReader implements InputEntityReade SettableByteEntityReader( InputFormat inputFormat, InputRowSchema inputRowSchema, - TransformSpec transformSpec, + BaseTransformSpec transformSpec, File indexingTmpDir ) { diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/StreamChunkReader.java b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/StreamChunkReader.java index a0ac1f01ea5a..e58609c6f084 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/StreamChunkReader.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/StreamChunkReader.java @@ -31,7 +31,7 @@ import org.apache.druid.segment.incremental.InputRowFilterResult; import org.apache.druid.segment.incremental.ParseExceptionHandler; import org.apache.druid.segment.incremental.RowIngestionMeters; -import org.apache.druid.segment.transform.TransformSpec; +import org.apache.druid.segment.transform.BaseTransformSpec; import javax.annotation.Nullable; import java.io.File; @@ -56,7 +56,7 @@ class StreamChunkReader StreamChunkReader( InputFormat inputFormat, InputRowSchema inputRowSchema, - TransformSpec transformSpec, + BaseTransformSpec transformSpec, File indexingTmpDir, InputRowFilter rowFilter, RowIngestionMeters rowIngestionMeters, diff --git a/processing/src/main/java/org/apache/druid/segment/transform/BaseTransformSpec.java b/processing/src/main/java/org/apache/druid/segment/transform/BaseTransformSpec.java new file mode 100644 index 000000000000..35d68e63a71e --- /dev/null +++ b/processing/src/main/java/org/apache/druid/segment/transform/BaseTransformSpec.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.segment.transform; + +import com.fasterxml.jackson.annotation.JsonSubTypes; +import com.fasterxml.jackson.annotation.JsonTypeInfo; +import org.apache.druid.data.input.InputSourceReader; +import org.apache.druid.query.filter.DimFilter; + +import javax.annotation.Nullable; +import java.util.List; +import java.util.Set; + +/** + * Specification for how input rows should be transformed during ingestion. This is the base interface + * for the {@code transformSpec} field in {@link org.apache.druid.segment.indexing.DataSchema}. + * + *

Two implementations are provided: + *

    + *
  • {@link TransformSpec} — the default, for expression-based transforms and filters
  • + *
  • {@link ScanTransformSpec} — for scan-query-based transforms (unnest, virtual columns, filters)
  • + *
+ * + *

When no {@code "type"} is specified in JSON, the default {@link TransformSpec} is used for backward + * compatibility. + */ +@JsonTypeInfo(use = JsonTypeInfo.Id.NAME, property = "type", defaultImpl = TransformSpec.class) +@JsonSubTypes({ + @JsonSubTypes.Type(name = "scan", value = ScanTransformSpec.class) +}) +public interface BaseTransformSpec +{ + /** + * Creates a {@link BaseTransformer} that applies this spec's transforms to input rows. + */ + BaseTransformer toTransformer(); + + /** + * Wraps an {@link InputSourceReader} with this spec's transforms applied to each row. + */ + default InputSourceReader decorate(InputSourceReader reader) + { + return new TransformingInputSourceReader(reader, toTransformer()); + } + + /** + * Returns the names of all input columns required by this spec's transforms and filters. + */ + Set getRequiredColumns(); + + /** + * Returns the list of individual {@link Transform} objects, if applicable. + * Defaults to an empty list for specs that don't use the transforms list (e.g., {@link ScanTransformSpec}). + */ + default List getTransforms() + { + return List.of(); + } + + /** + * Returns the filter applied to input rows, if applicable. + * Defaults to null for specs that handle filtering internally (e.g., {@link ScanTransformSpec}). + */ + @Nullable + default DimFilter getFilter() + { + return null; + } +} diff --git a/processing/src/main/java/org/apache/druid/segment/transform/BaseTransformer.java b/processing/src/main/java/org/apache/druid/segment/transform/BaseTransformer.java new file mode 100644 index 000000000000..335b09c4ca2d --- /dev/null +++ b/processing/src/main/java/org/apache/druid/segment/transform/BaseTransformer.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.segment.transform; + +import org.apache.druid.data.input.InputRow; +import org.apache.druid.data.input.InputRowListPlusRawValues; + +import javax.annotation.Nullable; +import java.io.Closeable; +import java.io.IOException; +import java.util.List; + +/** + * Interface for transforming input rows during ingestion. Created by {@link BaseTransformSpec#toTransformer()}. + * + * @see Transformer for expression-based transforms + * @see ScanTransformer for scan-query-based transforms + */ +public interface BaseTransformer extends Closeable +{ + /** + * Whether this transformer can produce multiple output rows from a single input row. + * When true, readers use {@link #transformToList} with flatMap iteration. + * When false, readers use {@link #transform(InputRow)} with map iteration. + */ + boolean hasMultiRowTransform(); + + /** + * Transforms a single input row, or returns null if the row should be filtered out. + * Only called when {@link #hasMultiRowTransform()} is false. 
+ */ + @Nullable + InputRow transform(@Nullable InputRow row); + + /** + * Transforms a single input row into zero or more output rows. + * Returns an empty list if the row is null or filtered out. + */ + List transformToList(@Nullable InputRow row); + + /** + * Transforms a batch of input rows with their associated raw values, used by the sampling path. + * Applies transforms and filtering while maintaining the correspondence between input rows and raw values. + */ + @Nullable + InputRowListPlusRawValues transform(@Nullable InputRowListPlusRawValues row); + + /** + * Releases any resources held by this transformer. The default implementation is a no-op. + */ + @Override + default void close() throws IOException + { + } +} diff --git a/processing/src/main/java/org/apache/druid/segment/transform/CompactionTransformSpec.java b/processing/src/main/java/org/apache/druid/segment/transform/CompactionTransformSpec.java index 0a0c2243875d..bd0fafe016e7 100644 --- a/processing/src/main/java/org/apache/druid/segment/transform/CompactionTransformSpec.java +++ b/processing/src/main/java/org/apache/druid/segment/transform/CompactionTransformSpec.java @@ -39,7 +39,7 @@ public class CompactionTransformSpec { @Nullable - public static CompactionTransformSpec of(@Nullable TransformSpec transformSpec) + public static CompactionTransformSpec of(@Nullable BaseTransformSpec transformSpec) { if (transformSpec == null) { return null; @@ -47,6 +47,9 @@ public static CompactionTransformSpec of(@Nullable TransformSpec transformSpec) if (TransformSpec.NONE.equals(transformSpec)) { return null; } + if (!(transformSpec instanceof TransformSpec)) { + return null; + } return new CompactionTransformSpec(transformSpec.getFilter(), VirtualColumns.EMPTY); } diff --git a/processing/src/main/java/org/apache/druid/segment/transform/ScanTransform.java b/processing/src/main/java/org/apache/druid/segment/transform/ScanTransform.java deleted file mode 100644 index 3ffa1ca8ccfa..000000000000 --- a/processing/src/main/java/org/apache/druid/segment/transform/ScanTransform.java +++ /dev/null @@ -1,223 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.druid.segment.transform; - -import com.fasterxml.jackson.annotation.JsonCreator; -import com.fasterxml.jackson.annotation.JsonProperty; -import org.apache.druid.data.input.InputRow; -import org.apache.druid.data.input.MapBasedInputRow; -import org.apache.druid.java.util.common.guava.Sequences; -import org.apache.druid.query.QueryContexts; -import org.apache.druid.query.context.ResponseContext; -import org.apache.druid.query.scan.ScanQuery; -import org.apache.druid.query.scan.ScanQueryEngine; -import org.apache.druid.query.scan.ScanResultValue; -import org.apache.druid.segment.RowAdapters; -import org.apache.druid.segment.RowBasedSegment; -import org.apache.druid.segment.Segment; -import org.apache.druid.segment.SegmentMapFunction; -import org.apache.druid.segment.column.ColumnHolder; -import org.apache.druid.segment.column.ColumnType; -import org.apache.druid.segment.column.RowSignature; - -import javax.annotation.Nullable; -import java.util.ArrayList; -import java.util.LinkedHashMap; -import java.util.LinkedHashSet; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import java.util.Optional; -import java.util.Set; - -/** - * A multi-row transform that processes each input row through the scan query engine during ingestion. - * Each input row is wrapped in a single-row segment and run through the configured {@link ScanQuery}, - * which can include UNNEST (via {@link org.apache.druid.query.UnnestDataSource}), filters, virtual columns, etc. - * - * If the query produces no output rows (e.g., empty/missing array), the input row passes through - * with null values for any new columns. - */ -public class ScanTransform implements Transform -{ - private static final ScanQueryEngine ENGINE = new ScanQueryEngine(); - - private final String name; - private final ScanQuery query; - - @JsonCreator - public ScanTransform( - @JsonProperty("name") final String name, - @JsonProperty("query") final ScanQuery query - ) - { - this.name = name; - this.query = query; - } - - @Override - @JsonProperty - public String getName() - { - return name; - } - - @Override - @Nullable - public RowFunction getRowFunction() - { - return null; - } - - @JsonProperty - public ScanQuery getQuery() - { - return query; - } - - @Override - public Set getRequiredColumns() - { - return Set.copyOf(query.getDataSource().getTableNames()); - } - - @Override - public boolean isMultiRow() - { - return true; - } - - @Override - public List applyMultiRow(final InputRow inputRow) - { - final RowSignature inputSignature = buildSignature(inputRow); - - final RowBasedSegment segment = new RowBasedSegment<>( - Sequences.simple(List.of(inputRow)), - RowAdapters.standardRow(), - inputSignature - ); - - final Segment mappedSegment = applySegmentMapFunction(segment); - - final ScanQuery queryWithoutTimeout = query.withOverriddenContext( - Map.of(QueryContexts.TIMEOUT_KEY, 0) - ); - - final List scanResults = ENGINE.process( - queryWithoutTimeout, - mappedSegment, - ResponseContext.createEmpty(), - null - ).toList(); - - final List result = new ArrayList<>(); - - for (final ScanResultValue scanResult : scanResults) { - final List dimensionColumns = resolveDimensionColumns(inputRow, scanResult.getColumns()); - @SuppressWarnings("unchecked") - final List> events = (List>) scanResult.getEvents(); - for (final Map event : events) { - result.add(new MapBasedInputRow(inputRow.getTimestampFromEpoch(), dimensionColumns, event)); - } - } - - if (result.isEmpty()) { - final List dimensionColumns = 
resolveDimensionColumns(inputRow, null); - final Map passthroughEvent = new LinkedHashMap<>(); - for (final String dim : inputRow.getDimensions()) { - passthroughEvent.put(dim, inputRow.getRaw(dim)); - } - result.add(new MapBasedInputRow(inputRow.getTimestampFromEpoch(), dimensionColumns, passthroughEvent)); - } - - return result; - } - - private Segment applySegmentMapFunction(final Segment segment) - { - final SegmentMapFunction mapFunction = query.getDataSource().createSegmentMapFunction(query); - final Optional mapped = mapFunction.apply(Optional.of(segment)); - return mapped.orElse(segment); - } - - private static RowSignature buildSignature(final InputRow inputRow) - { - final RowSignature.Builder signatureBuilder = RowSignature.builder(); - signatureBuilder.add(ColumnHolder.TIME_COLUMN_NAME, ColumnType.LONG); - for (final String dim : inputRow.getDimensions()) { - signatureBuilder.add(dim, ColumnType.NESTED_DATA); - } - return signatureBuilder.build(); - } - - private List resolveDimensionColumns(final InputRow inputRow, @Nullable final List scanResultColumns) - { - final LinkedHashSet dims = new LinkedHashSet<>(inputRow.getDimensions()); - - if (scanResultColumns != null) { - for (final String col : scanResultColumns) { - if (!ColumnHolder.TIME_COLUMN_NAME.equals(col)) { - dims.add(col); - } - } - } - - final List queryColumns = query.getColumns(); - if (queryColumns != null) { - for (final String col : queryColumns) { - if (!ColumnHolder.TIME_COLUMN_NAME.equals(col)) { - dims.add(col); - } - } - } - - return new ArrayList<>(dims); - } - - @Override - public boolean equals(final Object o) - { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - final ScanTransform that = (ScanTransform) o; - return Objects.equals(name, that.name) - && Objects.equals(query, that.query); - } - - @Override - public int hashCode() - { - return Objects.hash(name, query); - } - - @Override - public String toString() - { - return "ScanTransform{" + - "name=" + name + - ", query=" + query + - '}'; - } -} diff --git a/processing/src/main/java/org/apache/druid/segment/transform/ScanTransformSpec.java b/processing/src/main/java/org/apache/druid/segment/transform/ScanTransformSpec.java new file mode 100644 index 000000000000..c2bac7e4e8cc --- /dev/null +++ b/processing/src/main/java/org/apache/druid/segment/transform/ScanTransformSpec.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.segment.transform; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.annotation.JsonTypeName; +import org.apache.druid.query.DataSource; +import org.apache.druid.query.UnnestDataSource; +import org.apache.druid.query.scan.ScanQuery; +import org.apache.druid.segment.VirtualColumn; + +import java.util.HashSet; +import java.util.Objects; +import java.util.Set; + +/** + * A {@link BaseTransformSpec} that processes input rows through an embedded {@link ScanQuery}. + * The scan query can include unnest data sources, virtual columns, and filters. + * + *

+ * Example JSON:
+ * <pre>{@code
+ * "transformSpec": {
+ *   "type": "scan",
+ *   "query": {
+ *     "queryType": "scan",
+ *     "dataSource": {
+ *       "type": "unnest",
+ *       "base": { "type": "table", "name": "__input__" },
+ *       "virtualColumn": { "type": "expression", "name": "tag", "expression": "\"tags\"", "outputType": "STRING" }
+ *     },
+ *     "intervals": { "type": "intervals", "intervals": ["-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"] },
+ *     "resultFormat": "list"
+ *   }
+ * }
+ * }</pre>
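+ * <p>
+ * A sketch of the equivalent programmatic construction, mirroring the test helpers in
+ * {@code ScanTransformTest} (names as used elsewhere in this patch):
+ * <pre>{@code
+ * ScanQuery query = Druids.newScanQueryBuilder()
+ *     .dataSource(UnnestDataSource.create(
+ *         new TableDataSource("__input__"),
+ *         new ExpressionVirtualColumn("tag", "\"tags\"", ColumnType.STRING, ExprMacroTable.nil()),
+ *         null))
+ *     .eternityInterval()
+ *     .resultFormat(ScanQuery.ResultFormat.RESULT_FORMAT_LIST)
+ *     .build();
+ * BaseTransformSpec spec = new ScanTransformSpec(query);
+ * }</pre>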
+ */ +@JsonTypeName("scan") +public class ScanTransformSpec implements BaseTransformSpec +{ + private final ScanQuery query; + + @JsonCreator + public ScanTransformSpec(@JsonProperty("query") final ScanQuery query) + { + this.query = query; + } + + @JsonProperty + public ScanQuery getQuery() + { + return query; + } + + @Override + public BaseTransformer toTransformer() + { + return new ScanTransformer(query); + } + + @Override + public Set getRequiredColumns() + { + final Set columns = new HashSet<>(); + collectRequiredColumns(query.getDataSource(), columns); + for (final VirtualColumn vc : query.getVirtualColumns().getVirtualColumns()) { + columns.addAll(vc.requiredColumns()); + } + if (query.getFilter() != null) { + columns.addAll(query.getFilter().getRequiredColumns()); + } + return columns; + } + + private static void collectRequiredColumns(final DataSource dataSource, final Set columns) + { + if (dataSource instanceof UnnestDataSource) { + final UnnestDataSource unnest = (UnnestDataSource) dataSource; + columns.addAll(unnest.getVirtualColumn().requiredColumns()); + collectRequiredColumns(unnest.getBase(), columns); + } + } + + @Override + public boolean equals(final Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + final ScanTransformSpec that = (ScanTransformSpec) o; + return Objects.equals(query, that.query); + } + + @Override + public int hashCode() + { + return Objects.hash(query); + } + + @Override + public String toString() + { + return "ScanTransformSpec{query=" + query + '}'; + } +} diff --git a/processing/src/main/java/org/apache/druid/segment/transform/ScanTransformer.java b/processing/src/main/java/org/apache/druid/segment/transform/ScanTransformer.java new file mode 100644 index 000000000000..1dd8b3b401f6 --- /dev/null +++ b/processing/src/main/java/org/apache/druid/segment/transform/ScanTransformer.java @@ -0,0 +1,285 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.segment.transform; + +import org.apache.druid.data.input.InputRow; +import org.apache.druid.data.input.InputRowListPlusRawValues; +import org.apache.druid.data.input.MapBasedInputRow; +import org.apache.druid.java.util.common.ISE; +import org.apache.druid.java.util.common.Intervals; +import org.apache.druid.query.DataSource; +import org.apache.druid.query.QueryContexts; +import org.apache.druid.query.UnnestDataSource; +import org.apache.druid.query.scan.ScanQuery; +import org.apache.druid.segment.ColumnSelectorFactory; +import org.apache.druid.segment.Cursor; +import org.apache.druid.segment.CursorBuildSpec; +import org.apache.druid.segment.CursorFactory; +import org.apache.druid.segment.CursorHolder; +import org.apache.druid.segment.Segment; +import org.apache.druid.segment.SegmentMapFunction; +import org.apache.druid.segment.VirtualColumn; +import org.apache.druid.segment.column.ColumnHolder; +import org.apache.druid.segment.column.ColumnType; +import org.apache.druid.segment.column.RowSignature; +import org.apache.druid.segment.filter.Filters; +import org.apache.druid.timeline.SegmentId; +import org.joda.time.Interval; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; + +/** + * A {@link BaseTransformer} that processes input rows through a reusable scan query cursor pipeline. + * + *

The pipeline is built once at construction: a {@link SettableRowCursorFactory} is wrapped by the + * scan query's {@link SegmentMapFunction} (e.g., unnest, filter). For each input row, the row is set + * on the factory and the cursor is {@link Cursor#reset reset} — no per-row segment or cursor allocation. + * + *

When the scan query produces zero output rows (e.g., null/missing arrays, or filter rejection), + * the input row passes through with unnest output columns set to null and virtual columns still evaluated. + * This differs from {@link TransformSpec}'s filter behavior which drops the row entirely. + * + *
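+ * <p>
+ * Minimal usage sketch (assumes {@code query} and {@code rows} are supplied by the caller; the
+ * transformer must be closed to release its cursor holder):
+ * <pre>{@code
+ * try (BaseTransformer transformer = new ScanTransformSpec(query).toTransformer()) {
+ *   for (InputRow row : rows) {
+ *     for (InputRow out : transformer.transformToList(row)) {
+ *       // each output row keeps the input row's timestamp
+ *     }
+ *   }
+ * }
+ * }</pre>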

This class is not thread-safe. Each reader thread should have its own instance. + */ +public class ScanTransformer implements BaseTransformer +{ + private final ScanQuery query; + private final SettableRowCursorFactory baseCursorFactory; + private final CursorHolder cursorHolder; + private Cursor cursor; + + ScanTransformer(final ScanQuery scanQuery) + { + this.query = scanQuery.withOverriddenContext( + Map.of(QueryContexts.TIMEOUT_KEY, 0) + ); + + final RowSignature broadSignature = RowSignature.builder() + .add(ColumnHolder.TIME_COLUMN_NAME, ColumnType.LONG) + .build(); + + final CursorBuildSpec cursorBuildSpec = CursorBuildSpec.builder() + .setInterval(query.getSingleInterval()) + .setFilter(Filters.toFilter(query.getFilter())) + .setVirtualColumns(query.getVirtualColumns()) + .build(); + + this.baseCursorFactory = new SettableRowCursorFactory(broadSignature); + final SegmentMapFunction segmentMapFunction = query.getDataSource().createSegmentMapFunction(query); + final Segment mappedSegment = segmentMapFunction.apply(Optional.of(new CursorFactorySegment(baseCursorFactory))) + .orElseThrow(() -> new ISE("SegmentMapFunction returned empty")); + final CursorFactory mappedCursorFactory = mappedSegment.as(CursorFactory.class); + this.cursorHolder = mappedCursorFactory.makeCursorHolder(cursorBuildSpec); + } + + @Override + public boolean hasMultiRowTransform() + { + return true; + } + + @Override + @Nullable + public InputRow transform(@Nullable final InputRow row) + { + throw new UnsupportedOperationException( + "ScanTransformer does not support single-row transform; use transformToList()" + ); + } + + @Override + public List transformToList(@Nullable final InputRow row) + { + if (row == null) { + return List.of(); + } + + return process(row); + } + + @Override + @Nullable + public InputRowListPlusRawValues transform(@Nullable final InputRowListPlusRawValues row) + { + if (row == null || row.getInputRows() == null) { + return row; + } + + final List inputRows = row.getInputRows(); + final List> inputRawValues = row.getRawValuesList(); + final List outputRows = new ArrayList<>(); + final List> outputRawValues = inputRawValues == null ? null : new ArrayList<>(); + + for (int i = 0; i < inputRows.size(); i++) { + final List expandedRows = transformToList(inputRows.get(i)); + outputRows.addAll(expandedRows); + if (outputRawValues != null) { + for (int j = 0; j < expandedRows.size(); j++) { + outputRawValues.add(inputRawValues.get(i)); + } + } + } + + return InputRowListPlusRawValues.ofList(outputRawValues, outputRows, row.getParseException()); + } + + @Override + public void close() throws IOException + { + cursorHolder.close(); + } + + private List process(final InputRow inputRow) + { + baseCursorFactory.set(inputRow); + + if (cursor == null) { + cursor = cursorHolder.asCursor(); + } else { + cursor.reset(); + } + + if (cursor == null || cursor.isDone()) { + return List.of(buildPassthroughRow(inputRow)); + } + + final List columns = resolveColumnsForRow(inputRow); + final List dimensionColumns = resolveDimensionColumns(inputRow, columns); + final ColumnSelectorFactory selectorFactory = cursor.getColumnSelectorFactory(); + + final List result = new ArrayList<>(); + while (!cursor.isDone()) { + final Map event = new LinkedHashMap<>(); + for (final String col : columns) { + event.put(col, selectorFactory.makeColumnValueSelector(col).getObject()); + } + result.add(new MapBasedInputRow(inputRow.getTimestampFromEpoch(), dimensionColumns, event)); + cursor.advance(); + } + + return result.isEmpty() ? 
List.of(buildPassthroughRow(inputRow)) : result; + } + + private InputRow buildPassthroughRow(final InputRow inputRow) + { + final Set unnestOutputColumns = new LinkedHashSet<>(); + collectOutputColumnNames(query.getDataSource(), unnestOutputColumns); + + final List columns = resolveColumnsForRow(inputRow); + final List dimensionColumns = resolveDimensionColumns(inputRow, columns); + final ColumnSelectorFactory factory = baseCursorFactory.getColumnSelectorFactory(query.getVirtualColumns()); + final Map event = new LinkedHashMap<>(); + for (final String col : columns) { + if (unnestOutputColumns.contains(col)) { + event.put(col, null); + } else { + event.put(col, factory.makeColumnValueSelector(col).getObject()); + } + } + return new MapBasedInputRow(inputRow.getTimestampFromEpoch(), dimensionColumns, event); + } + + private List resolveColumnsForRow(final InputRow inputRow) + { + final Set columns = new LinkedHashSet<>(); + columns.add(ColumnHolder.TIME_COLUMN_NAME); + columns.addAll(inputRow.getDimensions()); + for (final VirtualColumn vc : query.getVirtualColumns().getVirtualColumns()) { + columns.add(vc.getOutputName()); + } + collectOutputColumnNames(query.getDataSource(), columns); + return new ArrayList<>(columns); + } + + private static void collectOutputColumnNames(final DataSource dataSource, final Set columns) + { + if (dataSource instanceof UnnestDataSource) { + final UnnestDataSource unnest = (UnnestDataSource) dataSource; + columns.add(unnest.getVirtualColumn().getOutputName()); + } + for (final DataSource child : dataSource.getChildren()) { + collectOutputColumnNames(child, columns); + } + } + + private static List resolveDimensionColumns( + final InputRow inputRow, + @Nullable final List resultColumns + ) + { + final LinkedHashSet dims = new LinkedHashSet<>(inputRow.getDimensions()); + if (resultColumns != null) { + for (final String col : resultColumns) { + if (!ColumnHolder.TIME_COLUMN_NAME.equals(col)) { + dims.add(col); + } + } + } + return new ArrayList<>(dims); + } + + private static class CursorFactorySegment implements Segment + { + private final CursorFactory cursorFactory; + + CursorFactorySegment(final CursorFactory cursorFactory) + { + this.cursorFactory = cursorFactory; + } + + @Nullable + @Override + public SegmentId getId() + { + return null; + } + + @Nonnull + @Override + public Interval getDataInterval() + { + return Intervals.ETERNITY; + } + + @Nullable + @Override + public T as(final Class clazz) + { + if (CursorFactory.class.equals(clazz)) { + return (T) cursorFactory; + } + return null; + } + + @Override + public void close() + { + } + } +} diff --git a/processing/src/main/java/org/apache/druid/segment/transform/SettableRowCursorFactory.java b/processing/src/main/java/org/apache/druid/segment/transform/SettableRowCursorFactory.java new file mode 100644 index 000000000000..927c3d68ce51 --- /dev/null +++ b/processing/src/main/java/org/apache/druid/segment/transform/SettableRowCursorFactory.java @@ -0,0 +1,161 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.segment.transform; + +import org.apache.druid.data.input.InputRow; +import org.apache.druid.query.filter.Filter; +import org.apache.druid.query.filter.ValueMatcher; +import org.apache.druid.segment.ColumnSelectorFactory; +import org.apache.druid.segment.Cursor; +import org.apache.druid.segment.CursorBuildSpec; +import org.apache.druid.segment.CursorFactory; +import org.apache.druid.segment.CursorHolder; +import org.apache.druid.segment.RowAdapters; +import org.apache.druid.segment.RowBasedColumnSelectorFactory; +import org.apache.druid.segment.RowIdSupplier; +import org.apache.druid.segment.VirtualColumns; +import org.apache.druid.segment.column.ColumnCapabilities; +import org.apache.druid.segment.column.RowSignature; +import org.apache.druid.segment.filter.ValueMatchers; + +import javax.annotation.Nullable; + +/** + * A {@link CursorFactory} backed by a mutable {@link InputRow} holder. Each call to + * {@link #makeCursorHolder(CursorBuildSpec)} returns a cursor that applies the spec's virtual columns + * on top of the current row. The underlying row holder is shared — call {@link #set(InputRow)} to + * swap the current row, then {@link Cursor#reset()} on the returned cursor. + */ +class SettableRowCursorFactory implements CursorFactory +{ + private final RowSignature rowSignature; + private final ColumnSelectorFactory baseSelectorFactory; + private InputRow currentRow; + private long rowId = RowIdSupplier.INIT; + + SettableRowCursorFactory(final RowSignature rowSignature) + { + this.rowSignature = rowSignature; + this.baseSelectorFactory = new RowBasedColumnSelectorFactory<>( + this::getCurrentRow, + this::getRowId, + RowAdapters.standardRow(), + rowSignature, + false + ); + } + + void set(final InputRow row) + { + this.currentRow = row; + this.rowId++; + } + + ColumnSelectorFactory getColumnSelectorFactory(final VirtualColumns virtualColumns) + { + return virtualColumns.wrap(baseSelectorFactory); + } + + @Override + public CursorHolder makeCursorHolder(final CursorBuildSpec spec) + { + final ColumnSelectorFactory selectorFactory = spec.getVirtualColumns().wrap(baseSelectorFactory); + final Filter filter = spec.getFilter(); + final ValueMatcher filterMatcher = filter == null + ? 
ValueMatchers.allTrue() + : filter.makeMatcher(selectorFactory); + + return new CursorHolder() + { + @Override + public Cursor asCursor() + { + return new Cursor() + { + private boolean done = currentRow == null || !filterMatcher.matches(false); + + @Override + public ColumnSelectorFactory getColumnSelectorFactory() + { + return selectorFactory; + } + + @Override + public void advance() + { + done = true; + } + + @Override + public void advanceUninterruptibly() + { + done = true; + } + + @Override + public boolean isDone() + { + return done; + } + + @Override + public boolean isDoneOrInterrupted() + { + return done || Thread.currentThread().isInterrupted(); + } + + @Override + public void reset() + { + done = currentRow == null || !filterMatcher.matches(false); + } + }; + } + + @Override + public void close() + { + } + }; + } + + private InputRow getCurrentRow() + { + return currentRow; + } + + private long getRowId() + { + return rowId; + } + + @Override + public RowSignature getRowSignature() + { + return rowSignature; + } + + @Nullable + @Override + public ColumnCapabilities getColumnCapabilities(final String column) + { + return rowSignature.getColumnCapabilities(column); + } +} diff --git a/processing/src/main/java/org/apache/druid/segment/transform/Transform.java b/processing/src/main/java/org/apache/druid/segment/transform/Transform.java index 79a9d7f27c66..098f1b3a41e8 100644 --- a/processing/src/main/java/org/apache/druid/segment/transform/Transform.java +++ b/processing/src/main/java/org/apache/druid/segment/transform/Transform.java @@ -21,11 +21,9 @@ import com.fasterxml.jackson.annotation.JsonSubTypes; import com.fasterxml.jackson.annotation.JsonTypeInfo; -import org.apache.druid.data.input.InputRow; import org.apache.druid.guice.annotations.ExtensionPoint; import javax.annotation.Nullable; -import java.util.List; import java.util.Set; /** @@ -39,15 +37,11 @@ * Transforms do have some limitations. They can only refer to fields present in the actual input rows; in particular, * they cannot refer to other transforms. And they cannot remove fields, only add them. However, they can shadow a * field with another field containing all nulls, which will act similarly to removing the field. - * - * Multi-row transforms (like {@link ScanTransform}) can produce multiple output rows from a single input row. - * These are applied after all single-row transforms. */ @ExtensionPoint @JsonTypeInfo(use = JsonTypeInfo.Id.NAME, property = "type") @JsonSubTypes(value = { - @JsonSubTypes.Type(name = "expression", value = ExpressionTransform.class), - @JsonSubTypes.Type(name = "scan", value = ScanTransform.class) + @JsonSubTypes.Type(name = "expression", value = ExpressionTransform.class) }) public interface Transform { @@ -58,7 +52,7 @@ public interface Transform /** * Returns the function for this transform. The RowFunction takes an entire row as input and returns a column value - * as output. Multi-row transforms may return null here. + * as output. */ @Nullable RowFunction getRowFunction(); @@ -67,21 +61,4 @@ public interface Transform * Returns the names of all columns that this transform is going to read. */ Set getRequiredColumns(); - - /** - * Whether this transform can produce multiple output rows from a single input row. - */ - default boolean isMultiRow() - { - return false; - } - - /** - * For multi-row transforms, applies this transform to a single input row and returns zero or more output rows. - * Single-row transforms should not override this when {@link #isMultiRow()} is false. 
- */ - default List applyMultiRow(InputRow inputRow) - { - return List.of(inputRow); - } } diff --git a/processing/src/main/java/org/apache/druid/segment/transform/TransformSpec.java b/processing/src/main/java/org/apache/druid/segment/transform/TransformSpec.java index 30324524d6ca..8fcfd10a474b 100644 --- a/processing/src/main/java/org/apache/druid/segment/transform/TransformSpec.java +++ b/processing/src/main/java/org/apache/druid/segment/transform/TransformSpec.java @@ -37,9 +37,10 @@ * input rows) and "transforms" (which can add fields to input rows). Filters may refer to fields generated by * a transform. * - * See {@link Transform} for details on how each transform works. + * See {@link Transform} for details on how each transform works and only works for {@link Transform} like + * {@link ExpressionTransform}. For scan query transform, see {@link ScanTransformSpec}. */ -public class TransformSpec +public class TransformSpec implements BaseTransformSpec { public static final TransformSpec NONE = new TransformSpec(null, null); @@ -64,6 +65,7 @@ public TransformSpec( } } + @Override @JsonProperty @Nullable public DimFilter getFilter() @@ -71,26 +73,26 @@ public DimFilter getFilter() return filter; } + @Override @JsonProperty public List getTransforms() { return transforms; } + @Override public InputSourceReader decorate(InputSourceReader reader) { return new TransformingInputSourceReader(reader, toTransformer()); } - /** - * Create a {@link Transformer} from this TransformSpec, when the rows to be transformed do not have a known - * signature. - */ - public Transformer toTransformer() + @Override + public BaseTransformer toTransformer() { return new Transformer(this); } + @Override public Set getRequiredColumns() { final Set requiredColumns = new HashSet<>(); diff --git a/processing/src/main/java/org/apache/druid/segment/transform/TransformedInputRow.java b/processing/src/main/java/org/apache/druid/segment/transform/TransformedInputRow.java index ab96b3c75b05..7d1db5ca479f 100644 --- a/processing/src/main/java/org/apache/druid/segment/transform/TransformedInputRow.java +++ b/processing/src/main/java/org/apache/druid/segment/transform/TransformedInputRow.java @@ -30,7 +30,6 @@ import java.util.List; import java.util.Map; import java.util.Objects; -import java.util.Set; public class TransformedInputRow implements InputRow { @@ -121,11 +120,6 @@ public InputRow getBaseRow() return row; } - public Set getTransformedColumns() - { - return Set.copyOf(transforms.keySet()); - } - @Override public boolean equals(final Object o) { diff --git a/processing/src/main/java/org/apache/druid/segment/transform/Transformer.java b/processing/src/main/java/org/apache/druid/segment/transform/Transformer.java index bfc167d259a1..efa8ace6e699 100644 --- a/processing/src/main/java/org/apache/druid/segment/transform/Transformer.java +++ b/processing/src/main/java/org/apache/druid/segment/transform/Transformer.java @@ -35,23 +35,18 @@ import java.util.Map; /** - * + * Expression-based transformer {@link ExpressionTransform} that accepts {@link TransformSpec}. 
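+ * Produces at most one output row per input row: {@link #transformToList} returns an empty list
+ * when the filter rejects the row and a singleton list otherwise. For multi-row (unnest-style)
+ * behavior, see {@link ScanTransformer}.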
*/ -public class Transformer +public class Transformer implements BaseTransformer { private final Map transforms = new HashMap<>(); - private final List multiRowTransforms = new ArrayList<>(); private final ThreadLocal rowSupplierForValueMatcher = new ThreadLocal<>(); private final ValueMatcher valueMatcher; Transformer(final TransformSpec transformSpec) { for (final Transform transform : transformSpec.getTransforms()) { - if (transform.isMultiRow()) { - multiRowTransforms.add(transform); - } else { - transforms.put(transform.getName(), transform.getRowFunction()); - } + transforms.put(transform.getName(), transform.getRowFunction()); } if (transformSpec.getFilter() != null) { @@ -69,12 +64,10 @@ public class Transformer } } - /** - * Whether any multi-row transforms are configured. - */ + @Override public boolean hasMultiRowTransform() { - return !multiRowTransforms.isEmpty(); + return false; } /** @@ -82,6 +75,7 @@ public boolean hasMultiRowTransform() * * @param row the input row */ + @Override @Nullable public InputRow transform(@Nullable final InputRow row) { @@ -107,38 +101,14 @@ public InputRow transform(@Nullable final InputRow row) return transformedRow; } - /** - * Transforms an input row, returning zero or more output rows. - * Applies single-row transforms and filtering first, then chains multi-row transforms sequentially. - */ + @Override public List transformToList(@Nullable final InputRow row) { - final InputRow singleRowResult = transform(row); - if (singleRowResult == null) { - return List.of(); - } - - return applyMultiRowTransforms(singleRowResult); - } - - private List applyMultiRowTransforms(final InputRow inputRow) - { - if (multiRowTransforms.isEmpty()) { - return List.of(inputRow); - } - - List current = List.of(inputRow); - for (final Transform multiRowTransform : multiRowTransforms) { - final List next = new ArrayList<>(); - for (final InputRow currentRow : current) { - next.addAll(multiRowTransform.applyMultiRow(currentRow)); - } - current = next; - } - - return current; + final InputRow result = transform(row); + return result == null ? List.of() : List.of(result); } + @Override @Nullable public InputRowListPlusRawValues transform(@Nullable final InputRowListPlusRawValues row) { @@ -184,30 +154,6 @@ public InputRowListPlusRawValues transform(@Nullable final InputRowListPlusRawVa } } - return applyMultiRowTransforms(inputRowListPlusRawValues); - } - - private InputRowListPlusRawValues applyMultiRowTransforms(final InputRowListPlusRawValues row) - { - if (multiRowTransforms.isEmpty() || row.getInputRows() == null) { - return row; - } - - final List inputRows = row.getInputRows(); - final List> inputRawValues = row.getRawValuesList(); - final List outputRows = new ArrayList<>(); - final List> outputRawValues = inputRawValues == null ? 
null : new ArrayList<>(); - - for (int i = 0; i < inputRows.size(); i++) { - final List expandedRows = applyMultiRowTransforms(inputRows.get(i)); - outputRows.addAll(expandedRows); - if (outputRawValues != null) { - for (int j = 0; j < expandedRows.size(); j++) { - outputRawValues.add(inputRawValues.get(i)); - } - } - } - - return InputRowListPlusRawValues.ofList(outputRawValues, outputRows, row.getParseException()); + return inputRowListPlusRawValues; } } diff --git a/processing/src/main/java/org/apache/druid/segment/transform/TransformingInputEntityReader.java b/processing/src/main/java/org/apache/druid/segment/transform/TransformingInputEntityReader.java index 0f01d5c89d29..28845f473273 100644 --- a/processing/src/main/java/org/apache/druid/segment/transform/TransformingInputEntityReader.java +++ b/processing/src/main/java/org/apache/druid/segment/transform/TransformingInputEntityReader.java @@ -31,9 +31,9 @@ public class TransformingInputEntityReader implements InputEntityReader { private final InputEntityReader delegate; - private final Transformer transformer; + private final BaseTransformer transformer; - public TransformingInputEntityReader(InputEntityReader delegate, Transformer transformer) + public TransformingInputEntityReader(InputEntityReader delegate, BaseTransformer transformer) { this.delegate = delegate; this.transformer = transformer; @@ -51,6 +51,7 @@ public CloseableIterator read() throws IOException return delegate.read().map(transformer::transform); } + @Override public CloseableIterator sample() throws IOException { diff --git a/processing/src/main/java/org/apache/druid/segment/transform/TransformingInputSourceReader.java b/processing/src/main/java/org/apache/druid/segment/transform/TransformingInputSourceReader.java index bad106755975..ef557a207656 100644 --- a/processing/src/main/java/org/apache/druid/segment/transform/TransformingInputSourceReader.java +++ b/processing/src/main/java/org/apache/druid/segment/transform/TransformingInputSourceReader.java @@ -32,9 +32,9 @@ public class TransformingInputSourceReader implements InputSourceReader { private final InputSourceReader delegate; - private final Transformer transformer; + private final BaseTransformer transformer; - TransformingInputSourceReader(InputSourceReader delegate, Transformer transformer) + TransformingInputSourceReader(InputSourceReader delegate, BaseTransformer transformer) { this.delegate = delegate; this.transformer = transformer; diff --git a/processing/src/test/java/org/apache/druid/segment/IndexBuilder.java b/processing/src/test/java/org/apache/druid/segment/IndexBuilder.java index f30b04c2c717..6d6ac6b7a1b6 100644 --- a/processing/src/test/java/org/apache/druid/segment/IndexBuilder.java +++ b/processing/src/test/java/org/apache/druid/segment/IndexBuilder.java @@ -41,6 +41,7 @@ import org.apache.druid.segment.incremental.IncrementalIndex; import org.apache.druid.segment.incremental.IncrementalIndexSchema; import org.apache.druid.segment.incremental.OnheapIncrementalIndex; +import org.apache.druid.segment.transform.BaseTransformSpec; import org.apache.druid.segment.transform.TransformSpec; import org.apache.druid.segment.writeout.OffHeapMemorySegmentWriteOutMediumFactory; import org.apache.druid.segment.writeout.SegmentWriteOutMediumFactory; @@ -106,7 +107,7 @@ public static IndexBuilder create(ObjectMapper jsonMapper, ColumnConfig columnCo @Nullable private InputFormat inputFormat = null; @Nullable - private TransformSpec transformSpec = null; + private BaseTransformSpec transformSpec = null; 
@Nullable private File inputSourceTmpDir = null; @@ -185,7 +186,7 @@ public IndexBuilder inputFormat(InputFormat inputFormat) return this; } - public IndexBuilder transform(TransformSpec transformSpec) + public IndexBuilder transform(BaseTransformSpec transformSpec) { this.transformSpec = transformSpec; return this; @@ -201,7 +202,7 @@ public IndexBuilder rows( InputSource inputSource, InputFormat inputFormat, InputRowSchema rowSchema, - TransformSpec transformSpec, + BaseTransformSpec transformSpec, File tmp ) throws IOException @@ -330,7 +331,7 @@ public QueryableIndex buildMMappedMergedIndex() Preconditions.checkNotNull(inputFormat, "inputFormat"); Preconditions.checkNotNull(inputSourceTmpDir, "inputSourceTmpDir"); - TransformSpec transformer = transformSpec != null ? transformSpec : TransformSpec.NONE; + BaseTransformSpec transformer = transformSpec != null ? transformSpec : TransformSpec.NONE; InputRowSchema rowSchema = new InputRowSchema(schema.getTimestampSpec(), schema.getDimensionsSpec(), null); InputSourceReader reader = inputSource.reader(rowSchema, inputFormat, inputSourceTmpDir); InputSourceReader transformingReader = transformer.decorate(reader); @@ -475,14 +476,14 @@ public static InputSourceReader buildIncrementalIndexWithInputSource( IncrementalIndexSchema schema, InputSource inputSource, InputFormat inputFormat, - @Nullable TransformSpec transformSpec, + @Nullable BaseTransformSpec transformSpec, File inputSourceTmpDir) { Preconditions.checkNotNull(schema, "schema"); Preconditions.checkNotNull(inputSource, "inputSource"); Preconditions.checkNotNull(inputFormat, "inputFormat"); Preconditions.checkNotNull(inputSourceTmpDir, "inputSourceTmpDir"); - TransformSpec transformer = transformSpec != null ? transformSpec : TransformSpec.NONE; + BaseTransformSpec transformer = transformSpec != null ? 
transformSpec : TransformSpec.NONE; InputRowSchema rowSchema = new InputRowSchema(schema.getTimestampSpec(), schema.getDimensionsSpec(), null); InputSourceReader reader = inputSource.reader(rowSchema, inputFormat, inputSourceTmpDir); InputSourceReader transformingReader = transformer.decorate(reader); diff --git a/processing/src/test/java/org/apache/druid/segment/generator/SegmentGenerator.java b/processing/src/test/java/org/apache/druid/segment/generator/SegmentGenerator.java index 97a9d25afd26..ffc4bd4e58d9 100644 --- a/processing/src/test/java/org/apache/druid/segment/generator/SegmentGenerator.java +++ b/processing/src/test/java/org/apache/druid/segment/generator/SegmentGenerator.java @@ -44,8 +44,8 @@ import org.apache.druid.segment.incremental.IncrementalIndex; import org.apache.druid.segment.incremental.IncrementalIndexSchema; import org.apache.druid.segment.serde.ComplexMetrics; +import org.apache.druid.segment.transform.BaseTransformer; import org.apache.druid.segment.transform.TransformSpec; -import org.apache.druid.segment.transform.Transformer; import org.apache.druid.segment.writeout.OffHeapMemorySegmentWriteOutMediumFactory; import org.apache.druid.timeline.DataSegment; import org.apache.druid.timeline.SegmentId; @@ -250,7 +250,7 @@ public QueryableIndex generate( final List rows = new ArrayList<>(); final List indexes = new ArrayList<>(); - final Transformer transformer = transformSpec.toTransformer(); + final BaseTransformer transformer = transformSpec.toTransformer(); final InputRowSchema rowSchema = new InputRowSchema( TimestampSpec.DEFAULT, dimensionsSpec, @@ -368,7 +368,7 @@ public IncrementalIndex generateIncrementalIndex( final List rows = new ArrayList<>(); - final Transformer transformer = transformSpec.toTransformer(); + final BaseTransformer transformer = transformSpec.toTransformer(); final InputRowSchema rowSchema = new InputRowSchema( TimestampSpec.DEFAULT, dimensionsSpec, diff --git a/processing/src/test/java/org/apache/druid/segment/transform/ScanTransformTest.java b/processing/src/test/java/org/apache/druid/segment/transform/ScanTransformTest.java index ffa91cbfe39b..df6d9cb8c6dc 100644 --- a/processing/src/test/java/org/apache/druid/segment/transform/ScanTransformTest.java +++ b/processing/src/test/java/org/apache/druid/segment/transform/ScanTransformTest.java @@ -28,7 +28,6 @@ import org.apache.druid.query.Druids; import org.apache.druid.query.TableDataSource; import org.apache.druid.query.UnnestDataSource; -import org.apache.druid.query.expression.TestExprMacroTable; import org.apache.druid.query.filter.SelectorDimFilter; import org.apache.druid.query.scan.ScanQuery; import org.apache.druid.segment.TestHelper; @@ -63,40 +62,42 @@ private static InputRow makeRow(Object... 
kvPairs) return new MapBasedInputRow(TIMESTAMP, dimensions, event); } - private static ScanTransform makeUnnestTransform(String inputColumn, String outputName) + private static ScanQuery makeUnnestQuery(String inputColumn, String outputName) { - return makeUnnestTransform(inputColumn, outputName, ColumnType.STRING, null); + return makeUnnestQuery(inputColumn, outputName, ColumnType.STRING, null); } - private static ScanTransform makeUnnestTransform( + private static ScanQuery makeUnnestQuery( String inputColumn, String outputName, ColumnType outputType, SelectorDimFilter unnestFilter ) { - return new ScanTransform( - outputName, - Druids.newScanQueryBuilder() - .dataSource(UnnestDataSource.create( - new TableDataSource("__input__"), - new ExpressionVirtualColumn(outputName, "\"" + inputColumn + "\"", outputType, ExprMacroTable.nil()), - unnestFilter - )) - .eternityInterval() - .columns((List) null) - .resultFormat(ScanQuery.ResultFormat.RESULT_FORMAT_LIST) - .build() - ); + return Druids.newScanQueryBuilder() + .dataSource(UnnestDataSource.create( + new TableDataSource("__input__"), + new ExpressionVirtualColumn(outputName, "\"" + inputColumn + "\"", outputType, ExprMacroTable.nil()), + unnestFilter + )) + .eternityInterval() + .columns((List) null) + .resultFormat(ScanQuery.ResultFormat.RESULT_FORMAT_LIST) + .build(); + } + + private static BaseTransformer makeTransformer(ScanQuery query) + { + return new ScanTransformSpec(query).toTransformer(); } @Test public void testBasicUnnest() { - final ScanTransform transform = makeUnnestTransform("tags", "tag"); + final BaseTransformer transformer = makeTransformer(makeUnnestQuery("tags", "tag")); final InputRow input = makeRow("user", "alice", "tags", List.of("a", "b", "c")); - final List result = transform.applyMultiRow(input); + final List result = transformer.transformToList(input); Assert.assertEquals(3, result.size()); Assert.assertEquals("a", result.get(0).getRaw("tag")); @@ -110,10 +111,10 @@ public void testBasicUnnest() @Test public void testUnnestEmptyArray() { - final ScanTransform transform = makeUnnestTransform("tags", "tag"); + final BaseTransformer transformer = makeTransformer(makeUnnestQuery("tags", "tag")); final InputRow input = makeRow("user", "alice", "tags", List.of()); - final List result = transform.applyMultiRow(input); + final List result = transformer.transformToList(input); Assert.assertEquals(1, result.size()); Assert.assertEquals("alice", result.get(0).getRaw("user")); Assert.assertNull(result.get(0).getRaw("tag")); @@ -122,10 +123,10 @@ public void testUnnestEmptyArray() @Test public void testUnnestMissingColumn() { - final ScanTransform transform = makeUnnestTransform("services", "svc"); + final BaseTransformer transformer = makeTransformer(makeUnnestQuery("services", "svc")); final InputRow input = makeRow("user", "alice", "host", "web-01"); - final List result = transform.applyMultiRow(input); + final List result = transformer.transformToList(input); Assert.assertEquals(1, result.size()); Assert.assertEquals("alice", result.get(0).getRaw("user")); Assert.assertEquals("web-01", result.get(0).getRaw("host")); @@ -135,10 +136,10 @@ public void testUnnestMissingColumn() @Test public void testUnnestSingleElement() { - final ScanTransform transform = makeUnnestTransform("tags", "tag"); + final BaseTransformer transformer = makeTransformer(makeUnnestQuery("tags", "tag")); final InputRow input = makeRow("user", "alice", "tags", List.of("only")); - final List result = transform.applyMultiRow(input); + final List result 
= transformer.transformToList(input); Assert.assertEquals(1, result.size()); Assert.assertEquals("only", result.get(0).getRaw("tag")); } @@ -146,10 +147,10 @@ public void testUnnestSingleElement() @Test public void testUnnestScalarValue() { - final ScanTransform transform = makeUnnestTransform("tags", "tag"); + final BaseTransformer transformer = makeTransformer(makeUnnestQuery("tags", "tag")); final InputRow input = makeRow("user", "alice", "tags", "scalar"); - final List result = transform.applyMultiRow(input); + final List result = transformer.transformToList(input); Assert.assertEquals(1, result.size()); Assert.assertEquals("scalar", result.get(0).getRaw("tag")); } @@ -157,14 +158,16 @@ public void testUnnestScalarValue() @Test public void testUnnestArrayOfJsonObjects() { - final ScanTransform transform = makeUnnestTransform("items", "item", ColumnType.NESTED_DATA, null); + final BaseTransformer transformer = makeTransformer( + makeUnnestQuery("items", "item", ColumnType.NESTED_DATA, null) + ); final InputRow input = makeRow("user", "alice", "items", List.of( Map.of("product", "shirt", "price", 25), Map.of("product", "pants", "price", 40), Map.of("product", "hat", "price", 15) )); - final List result = transform.applyMultiRow(input); + final List result = transformer.transformToList(input); Assert.assertEquals(3, result.size()); final Object item0 = result.get(0).getRaw("item"); @@ -180,25 +183,24 @@ public void testUnnestArrayOfJsonObjects() @Test public void testUnnestNestedArrays() { - final ScanTransform transform = makeUnnestTransform("data", "element", ColumnType.NESTED_DATA, null); + final BaseTransformer transformer = makeTransformer( + makeUnnestQuery("data", "element", ColumnType.NESTED_DATA, null) + ); final InputRow input = makeRow( "user", "alice", "data", List.of(List.of(1, 2), List.of(3)) ); - final List result = transform.applyMultiRow(input); - - // One level of unnest only: [[1,2], [3]] -> [1,2] and [3] + final List result = transformer.transformToList(input); Assert.assertEquals(2, result.size()); final Object elem0 = result.get(0).getRaw("element"); Assert.assertNotNull(elem0); - Assert.assertTrue("Expected a List, got " + elem0.getClass(), elem0 instanceof List); - Assert.assertEquals(List.of(1, 2), elem0); + Assert.assertArrayEquals(new Object[]{1L, 2L}, (Object[]) elem0); final Object elem1 = result.get(1).getRaw("element"); - Assert.assertTrue("Expected a List, got " + elem1.getClass(), elem1 instanceof List); - Assert.assertEquals(List.of(3), elem1); + Assert.assertNotNull(elem1); + Assert.assertArrayEquals(new Object[]{3L}, (Object[]) elem1); Assert.assertEquals("alice", result.get(0).getRaw("user")); Assert.assertEquals("alice", result.get(1).getRaw("user")); @@ -207,10 +209,10 @@ public void testUnnestNestedArrays() @Test public void testTimestampPreservation() { - final ScanTransform transform = makeUnnestTransform("tags", "tag"); + final BaseTransformer transformer = makeTransformer(makeUnnestQuery("tags", "tag")); final InputRow input = makeRow("tags", List.of("a", "b")); - final List result = transform.applyMultiRow(input); + final List result = transformer.transformToList(input); for (final InputRow row : result) { Assert.assertEquals(TIMESTAMP, row.getTimestampFromEpoch()); } @@ -219,34 +221,28 @@ public void testTimestampPreservation() @Test public void testWithUnnestFilter() { - final ScanTransform transform = makeUnnestTransform("tags", "tag", ColumnType.STRING, new SelectorDimFilter("tag", "b", null)); + final BaseTransformer transformer = 
makeTransformer( + makeUnnestQuery("tags", "tag", ColumnType.STRING, new SelectorDimFilter("tag", "b", null)) + ); final InputRow input = makeRow("user", "alice", "tags", List.of("a", "b", "c")); - final List result = transform.applyMultiRow(input); + final List result = transformer.transformToList(input); Assert.assertEquals(1, result.size()); Assert.assertEquals("b", result.get(0).getRaw("tag")); } - @Test - public void testIsMultiRow() - { - final ScanTransform transform = makeUnnestTransform("tags", "tag"); - Assert.assertTrue(transform.isMultiRow()); - Assert.assertNull(transform.getRowFunction()); - } - // --- Transformer integration tests --- @Test public void testTransformerWithSingleScanTransform() { - final TransformSpec spec = new TransformSpec( - null, - List.of(makeUnnestTransform("tags", "tag")) + final ScanTransformSpec spec = new ScanTransformSpec( + makeUnnestQuery("tags", "tag") ); - final Transformer transformer = spec.toTransformer(); + final BaseTransformer transformer = spec.toTransformer(); Assert.assertTrue(transformer.hasMultiRowTransform()); + Assert.assertTrue(transformer instanceof ScanTransformer); final InputRow input = makeRow("user", "alice", "tags", List.of("x", "y")); final List result = transformer.transformToList(input); @@ -257,17 +253,24 @@ public void testTransformerWithSingleScanTransform() } @Test - public void testTransformerWithMultipleScanTransforms() + public void testNestedUnnestCrossJoin() { - final TransformSpec spec = new TransformSpec( - null, - List.of( - makeUnnestTransform("tags", "tag"), - makeUnnestTransform("colors", "color") - ) - ); - - final Transformer transformer = spec.toTransformer(); + final BaseTransformer transformer = new ScanTransformSpec( + Druids.newScanQueryBuilder() + .dataSource(UnnestDataSource.create( + UnnestDataSource.create( + new TableDataSource("__input__"), + new ExpressionVirtualColumn("tag", "\"tags\"", ColumnType.STRING, ExprMacroTable.nil()), + null + ), + new ExpressionVirtualColumn("color", "\"colors\"", ColumnType.STRING, ExprMacroTable.nil()), + null + )) + .eternityInterval() + .columns((List) null) + .resultFormat(ScanQuery.ResultFormat.RESULT_FORMAT_LIST) + .build() + ).toTransformer(); Assert.assertTrue(transformer.hasMultiRowTransform()); final InputRow input = makeRow( @@ -282,17 +285,24 @@ public void testTransformerWithMultipleScanTransforms() } @Test - public void testTransformerWithChainedScanTransformsFlattensNestedArrays() + public void testNestedUnnestFlattensNestedArrays() { - final TransformSpec spec = new TransformSpec( - null, - List.of( - makeUnnestTransform("data", "inner", ColumnType.NESTED_DATA, null), - makeUnnestTransform("inner", "val", ColumnType.LONG, null) - ) - ); - - final Transformer transformer = spec.toTransformer(); + final BaseTransformer transformer = new ScanTransformSpec( + Druids.newScanQueryBuilder() + .dataSource(UnnestDataSource.create( + UnnestDataSource.create( + new TableDataSource("__input__"), + new ExpressionVirtualColumn("inner", "\"data\"", ColumnType.NESTED_DATA, ExprMacroTable.nil()), + null + ), + new ExpressionVirtualColumn("val", "\"inner\"", ColumnType.LONG, ExprMacroTable.nil()), + null + )) + .eternityInterval() + .columns((List) null) + .resultFormat(ScanQuery.ResultFormat.RESULT_FORMAT_LIST) + .build() + ).toTransformer(); final InputRow input = makeRow( "user", "alice", @@ -316,47 +326,35 @@ public void testTransformerWithChainedScanTransformsFlattensNestedArrays() } @Test - public void testTransformerWithExpressionAndScanTransforms() + public 
void testScanTransformWithQueryFilter() { - final TransformSpec spec = new TransformSpec( - null, - List.of( - new ExpressionTransform("upper_user", "upper(\"user\")", TestExprMacroTable.INSTANCE), - makeUnnestTransform("tags", "tag") - ) - ); - - final Transformer transformer = spec.toTransformer(); - Assert.assertTrue(transformer.hasMultiRowTransform()); - - final InputRow input = makeRow("user", "alice", "tags", List.of("a", "b")); - final List result = transformer.transformToList(input); - - Assert.assertEquals(2, result.size()); - Assert.assertEquals("a", result.get(0).getRaw("tag")); - Assert.assertEquals("b", result.get(1).getRaw("tag")); - } - - @Test - public void testTransformerWithFilterAndScanTransform() - { - final TransformSpec spec = new TransformSpec( - new SelectorDimFilter("user", "not_alice", null), - List.of(makeUnnestTransform("tags", "tag")) - ); - - final Transformer transformer = spec.toTransformer(); + final BaseTransformer transformer = new ScanTransformSpec( + Druids.newScanQueryBuilder() + .dataSource(UnnestDataSource.create( + new TableDataSource("__input__"), + new ExpressionVirtualColumn("tag", "\"tags\"", ColumnType.STRING, ExprMacroTable.nil()), + null + )) + .eternityInterval() + .filters(new SelectorDimFilter("user", "not_alice", null)) + .columns((List) null) + .resultFormat(ScanQuery.ResultFormat.RESULT_FORMAT_LIST) + .build() + ).toTransformer(); final InputRow input = makeRow("user", "alice", "tags", List.of("a", "b")); final List result = transformer.transformToList(input); - Assert.assertTrue(result.isEmpty()); + // Filter rejects the row (user != "not_alice"), so passthrough with no unnest + Assert.assertEquals(1, result.size()); + Assert.assertNull(result.get(0).getRaw("tag")); } @Test public void testTransformerWithoutScanTransform() { final TransformSpec spec = new TransformSpec(null, null); - final Transformer transformer = spec.toTransformer(); + final BaseTransformer transformer = spec.toTransformer(); Assert.assertFalse(transformer.hasMultiRowTransform()); + Assert.assertTrue(transformer instanceof Transformer); final InputRow input = makeRow("user", "alice"); final List result = transformer.transformToList(input); @@ -367,46 +365,23 @@ public void testTransformerWithoutScanTransform() public void testTransformerTransformToListWithNull() { final TransformSpec spec = new TransformSpec(null, null); - final Transformer transformer = spec.toTransformer(); + final BaseTransformer transformer = spec.toTransformer(); Assert.assertTrue(transformer.transformToList(null).isEmpty()); } // --- Serde tests --- @Test - public void testSerde() throws Exception + public void testScanTransformSpecSerde() throws Exception { - final TransformSpec spec = new TransformSpec( - null, - List.of( - makeUnnestTransform("tags", "tag", ColumnType.STRING, new SelectorDimFilter("tag", "a", null)) - ) + final ScanTransformSpec spec = new ScanTransformSpec( + makeUnnestQuery("tags", "tag", ColumnType.STRING, new SelectorDimFilter("tag", "a", null)) ); final ObjectMapper jsonMapper = TestHelper.makeJsonMapper(); final String json = jsonMapper.writeValueAsString(spec); - final TransformSpec deserialized = jsonMapper.readValue(json, TransformSpec.class); + final BaseTransformSpec deserialized = jsonMapper.readValue(json, BaseTransformSpec.class); + Assert.assertTrue(deserialized instanceof ScanTransformSpec); Assert.assertEquals(spec, deserialized); } - - @Test - public void testSerdeWithMixedTransforms() throws Exception - { - final TransformSpec spec = new TransformSpec( - 
diff --git a/processing/src/test/java/org/apache/druid/segment/transform/TransformerTest.java b/processing/src/test/java/org/apache/druid/segment/transform/TransformerTest.java
index df499432c12e..9d032cfd5701 100644
--- a/processing/src/test/java/org/apache/druid/segment/transform/TransformerTest.java
+++ b/processing/src/test/java/org/apache/druid/segment/transform/TransformerTest.java
@@ -324,25 +324,17 @@ public void testInputRowListPlusRawValuesTransformWithFilter()
   @Test
   public void testInputRowListPlusRawValuesTransformWithScanTransformExpandsRowsAndRawValues()
   {
-    final Transformer transformer = new Transformer(
-        new TransformSpec(
-            null,
-            ImmutableList.of(
-                new ScanTransform(
-                    "tag",
-                    Druids.newScanQueryBuilder()
-                          .dataSource(UnnestDataSource.create(
-                              new TableDataSource("__input__"),
-                              new ExpressionVirtualColumn("tag", "\"tags\"", ColumnType.STRING, TestExprMacroTable.INSTANCE),
-                              null
-                          ))
-                          .eternityInterval()
-                          .resultFormat(ScanQuery.ResultFormat.RESULT_FORMAT_LIST)
-                          .build()
-                )
-            )
-        )
-    );
+    final BaseTransformer transformer = new ScanTransformSpec(
+        Druids.newScanQueryBuilder()
+              .dataSource(UnnestDataSource.create(
+                  new TableDataSource("__input__"),
+                  new ExpressionVirtualColumn("tag", "\"tags\"", ColumnType.STRING, TestExprMacroTable.INSTANCE),
+                  null
+              ))
+              .eternityInterval()
+              .resultFormat(ScanQuery.ResultFormat.RESULT_FORMAT_LIST)
+              .build()
+    ).toTransformer();
 
     final InputRow inputRow = new MapBasedInputRow(
         DateTimes.nowUtc(),
diff --git a/server/src/main/java/org/apache/druid/segment/indexing/DataSchema.java b/server/src/main/java/org/apache/druid/segment/indexing/DataSchema.java
index dbe0836c4dd8..3cf470e3270c 100644
--- a/server/src/main/java/org/apache/druid/segment/indexing/DataSchema.java
+++ b/server/src/main/java/org/apache/druid/segment/indexing/DataSchema.java
@@ -43,6 +43,7 @@
 import org.apache.druid.segment.column.ColumnHolder;
 import org.apache.druid.segment.column.ValueType;
 import org.apache.druid.segment.projections.AggregateProjectionSchema;
+import org.apache.druid.segment.transform.BaseTransformSpec;
 import org.apache.druid.segment.transform.TransformSpec;
 
 import javax.annotation.Nullable;
@@ -77,7 +78,7 @@ public static Builder builder(DataSchema schema)
   private final String dataSource;
   private final AggregatorFactory[] aggregators;
   private final GranularitySpec granularitySpec;
-  private final TransformSpec transformSpec;
+  private final BaseTransformSpec transformSpec;
   @Nullable
   private final TimestampSpec timestampSpec;
   @Nullable
@@ -92,7 +93,7 @@ public DataSchema(
       @JsonProperty("dimensionsSpec") @Nullable DimensionsSpec dimensionsSpec,
       @JsonProperty("metricsSpec") @Nullable AggregatorFactory[] aggregators,
       @JsonProperty("granularitySpec") @Nullable GranularitySpec granularitySpec,
-      @JsonProperty("transformSpec") TransformSpec transformSpec,
+      @JsonProperty("transformSpec") BaseTransformSpec transformSpec,
@JsonProperty("projections") @Nullable List projections, @Deprecated @JsonProperty("parser") @Nullable Map parserMap ) @@ -168,7 +169,7 @@ public GranularitySpec getGranularitySpec() } @JsonProperty - public TransformSpec getTransformSpec() + public BaseTransformSpec getTransformSpec() { return transformSpec; } @@ -195,7 +196,7 @@ public DataSchema withGranularitySpec(GranularitySpec granularitySpec) return builder(this).withGranularity(granularitySpec).build(); } - public DataSchema withTransformSpec(TransformSpec transformSpec) + public DataSchema withTransformSpec(BaseTransformSpec transformSpec) { return builder(this).withTransform(transformSpec).build(); } @@ -492,7 +493,7 @@ public static class Builder private String dataSource; private AggregatorFactory[] aggregators; private GranularitySpec granularitySpec; - private TransformSpec transformSpec; + private BaseTransformSpec transformSpec; private TimestampSpec timestampSpec; private DimensionsSpec dimensionsSpec; private List projections; @@ -554,7 +555,7 @@ public Builder withGranularity(GranularitySpec granularitySpec) return this; } - public Builder withTransform(TransformSpec transformSpec) + public Builder withTransform(BaseTransformSpec transformSpec) { this.transformSpec = transformSpec; return this; From dd287db9e02cdbf8911a98904fda55896a7073e7 Mon Sep 17 00:00:00 2001 From: Abhishek Balaji Radhakrishnan Date: Mon, 27 Apr 2026 23:44:33 -0700 Subject: [PATCH 6/7] Spelling --- website/.spelling | 1 + 1 file changed, 1 insertion(+) diff --git a/website/.spelling b/website/.spelling index 28701817f362..8ed5ac2da309 100644 --- a/website/.spelling +++ b/website/.spelling @@ -2646,3 +2646,4 @@ nginx - ../docs/development/extensions-core/s3.md NIO +passthrough From 929c3edaa7061ff2363b2e13a32c26583820f591 Mon Sep 17 00:00:00 2001 From: Abhishek Balaji Radhakrishnan Date: Tue, 28 Apr 2026 08:56:25 -0700 Subject: [PATCH 7/7] Remove passthrough logic to align with native query / unnest behavior --- .../segment/transform/ScanTransformer.java | 27 ++--------- .../transform/SettableRowCursorFactory.java | 6 --- .../segment/transform/ScanTransformTest.java | 45 ++++++++++++++----- 3 files changed, 39 insertions(+), 39 deletions(-) diff --git a/processing/src/main/java/org/apache/druid/segment/transform/ScanTransformer.java b/processing/src/main/java/org/apache/druid/segment/transform/ScanTransformer.java index 1dd8b3b401f6..624a206ac30b 100644 --- a/processing/src/main/java/org/apache/druid/segment/transform/ScanTransformer.java +++ b/processing/src/main/java/org/apache/druid/segment/transform/ScanTransformer.java @@ -62,8 +62,8 @@ * on the factory and the cursor is {@link Cursor#reset reset} — no per-row segment or cursor allocation. * *
diff --git a/processing/src/main/java/org/apache/druid/segment/transform/ScanTransformer.java b/processing/src/main/java/org/apache/druid/segment/transform/ScanTransformer.java
index 1dd8b3b401f6..624a206ac30b 100644
--- a/processing/src/main/java/org/apache/druid/segment/transform/ScanTransformer.java
+++ b/processing/src/main/java/org/apache/druid/segment/transform/ScanTransformer.java
@@ -62,8 +62,8 @@
  * on the factory and the cursor is {@link Cursor#reset reset} — no per-row segment or cursor allocation.
  *
  * <p>When the scan query produces zero output rows (e.g., null/missing arrays, or filter rejection),
- * the input row passes through with unnest output columns set to null and virtual columns still evaluated.
- * This differs from {@link TransformSpec}'s filter behavior which drops the row entirely.
+ * the input row is dropped. This matches native Druid UNNEST / CROSS JOIN semantics where
+ * null or empty arrays produce zero rows.
  *
  * <p>This class is not thread-safe. Each reader thread should have its own instance.
  */
@@ -166,7 +166,7 @@ private List<InputRow> process(final InputRow inputRow)
     }
 
     if (cursor == null || cursor.isDone()) {
-      return List.of(buildPassthroughRow(inputRow));
+      return List.of();
     }
 
     final List<String> columns = resolveColumnsForRow(inputRow);
@@ -183,26 +183,7 @@
       cursor.advance();
     }
 
-    return result.isEmpty() ? List.of(buildPassthroughRow(inputRow)) : result;
+    return result;
   }
 
-  private InputRow buildPassthroughRow(final InputRow inputRow)
-  {
-    final Set<String> unnestOutputColumns = new LinkedHashSet<>();
-    collectOutputColumnNames(query.getDataSource(), unnestOutputColumns);
-
-    final List<String> columns = resolveColumnsForRow(inputRow);
-    final List<String> dimensionColumns = resolveDimensionColumns(inputRow, columns);
-    final ColumnSelectorFactory factory = baseCursorFactory.getColumnSelectorFactory(query.getVirtualColumns());
-    final Map<String, Object> event = new LinkedHashMap<>();
-    for (final String col : columns) {
-      if (unnestOutputColumns.contains(col)) {
-        event.put(col, null);
-      } else {
-        event.put(col, factory.makeColumnValueSelector(col).getObject());
-      }
-    }
-    return new MapBasedInputRow(inputRow.getTimestampFromEpoch(), dimensionColumns, event);
-  }
-
   private List<String> resolveColumnsForRow(final InputRow inputRow)
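Since nested `UnnestDataSource`s compose as a cross join, dropping the passthrough makes per-row output counts strictly multiplicative. A hypothetical helper spelling out that arithmetic (illustrative only, not part of this patch):

```java
// Expected output rows per input row for a chain of unnest levels: counts
// multiply, and any empty or missing level (count 0) now drops the input row
// entirely instead of emitting a passthrough row.
static int expectedOutputRows(final int... levelElementCounts)
{
  int rows = 1;
  for (final int count : levelElementCounts) {
    rows *= count;
  }
  return rows;
}

// expectedOutputRows(2, 3) == 6   (testNestedUnnestCrossJoin)
// expectedOutputRows(2, 0) == 0   (testNestedUnnestWithMissingOuterColumn)
// expectedOutputRows(0)    == 0   (testUnnestEmptyArray)
```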
diff --git a/processing/src/main/java/org/apache/druid/segment/transform/SettableRowCursorFactory.java b/processing/src/main/java/org/apache/druid/segment/transform/SettableRowCursorFactory.java
index 927c3d68ce51..f71da27e31e0 100644
--- a/processing/src/main/java/org/apache/druid/segment/transform/SettableRowCursorFactory.java
+++ b/processing/src/main/java/org/apache/druid/segment/transform/SettableRowCursorFactory.java
@@ -30,7 +30,6 @@
 import org.apache.druid.segment.RowAdapters;
 import org.apache.druid.segment.RowBasedColumnSelectorFactory;
 import org.apache.druid.segment.RowIdSupplier;
-import org.apache.druid.segment.VirtualColumns;
 import org.apache.druid.segment.column.ColumnCapabilities;
 import org.apache.druid.segment.column.RowSignature;
 import org.apache.druid.segment.filter.ValueMatchers;
@@ -68,11 +67,6 @@ void set(final InputRow row)
   {
     this.rowId++;
   }
 
-  ColumnSelectorFactory getColumnSelectorFactory(final VirtualColumns virtualColumns)
-  {
-    return virtualColumns.wrap(baseSelectorFactory);
-  }
-
   @Override
   public CursorHolder makeCursorHolder(final CursorBuildSpec spec)
   {
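For orientation on the `set()` method that survives above: row-based selectors cache values against a `RowIdSupplier`, so bumping `rowId` is what marks those caches stale once the next input row is swapped in. A minimal sketch of the pattern (field names are illustrative; only the increment and the `RowIdSupplier` import come from this diff):

```java
// One reusable selector factory, many input rows: the monotonically increasing
// row id invalidates anything a selector cached for the previous row.
private InputRow row;
private long rowId = 0;

void set(final InputRow row)
{
  this.row = row;
  this.rowId++;
}
```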
diff --git a/processing/src/test/java/org/apache/druid/segment/transform/ScanTransformTest.java b/processing/src/test/java/org/apache/druid/segment/transform/ScanTransformTest.java
index df6d9cb8c6dc..003f9f53bd57 100644
--- a/processing/src/test/java/org/apache/druid/segment/transform/ScanTransformTest.java
+++ b/processing/src/test/java/org/apache/druid/segment/transform/ScanTransformTest.java
@@ -115,9 +115,8 @@ public void testUnnestEmptyArray()
     final InputRow input = makeRow("user", "alice", "tags", List.of());
     final List<InputRow> result = transformer.transformToList(input);
 
-    Assert.assertEquals(1, result.size());
-    Assert.assertEquals("alice", result.get(0).getRaw("user"));
-    Assert.assertNull(result.get(0).getRaw("tag"));
+    // Empty array produces 0 rows, matching native CROSS JOIN UNNEST semantics
+    Assert.assertEquals(0, result.size());
   }
 
   @Test
@@ -127,10 +126,8 @@ public void testUnnestMissingColumn()
     final InputRow input = makeRow("user", "alice", "host", "web-01");
     final List<InputRow> result = transformer.transformToList(input);
 
-    Assert.assertEquals(1, result.size());
-    Assert.assertEquals("alice", result.get(0).getRaw("user"));
-    Assert.assertEquals("web-01", result.get(0).getRaw("host"));
-    Assert.assertNull(result.get(0).getRaw("svc"));
+    // Missing column produces 0 rows, matching native CROSS JOIN UNNEST semantics
+    Assert.assertEquals(0, result.size());
   }
 
   @Test
@@ -284,6 +281,35 @@ public void testNestedUnnestCrossJoin()
     Assert.assertEquals(6, result.size());
   }
 
+  @Test
+  public void testNestedUnnestWithMissingOuterColumn()
+  {
+    final BaseTransformer transformer = new ScanTransformSpec(
+        Druids.newScanQueryBuilder()
+              .dataSource(UnnestDataSource.create(
+                  UnnestDataSource.create(
+                      new TableDataSource("__input__"),
+                      new ExpressionVirtualColumn("tag", "\"tags\"", ColumnType.STRING, ExprMacroTable.nil()),
+                      null
+                  ),
+                  new ExpressionVirtualColumn("svc", "\"services\"", ColumnType.NESTED_DATA, ExprMacroTable.nil()),
+                  null
+              ))
+              .eternityInterval()
+              .columns((List<String>) null)
+              .resultFormat(ScanQuery.ResultFormat.RESULT_FORMAT_LIST)
+              .build()
+    ).toTransformer();
+
+    // tags present (2 elements), services missing
+    final InputRow input = makeRow("trace_id", "abc", "tags", List.of("music", "blll"));
+    final List<InputRow> result = transformer.transformToList(input);
+
+    // Nested unnest is a cross join: tags x services. With services missing, the cross join
+    // produces 0 rows — matching native CROSS JOIN UNNEST semantics.
+    Assert.assertEquals(0, result.size());
+  }
+
   @Test
   public void testNestedUnnestFlattensNestedArrays()
   {
@@ -343,9 +369,8 @@ public void testScanTransformWithQueryFilter()
     ).toTransformer();
     final InputRow input = makeRow("user", "alice", "tags", List.of("a", "b"));
     final List<InputRow> result = transformer.transformToList(input);
-    // Filter rejects the row (user != "not_alice"), so passthrough with no unnest
-    Assert.assertEquals(1, result.size());
-    Assert.assertNull(result.get(0).getRaw("tag"));
+    // Filter rejects the row (user != "not_alice"), so 0 rows — matching native scan query semantics
+    Assert.assertEquals(0, result.size());
   }
 
   @Test