Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions common/utils/src/main/resources/error/error-conditions.json
Original file line number Diff line number Diff line change
Expand Up @@ -8192,6 +8192,23 @@
],
"sqlState" : "0A000"
},
"UNSUPPORTED_STREAMING_SCHEMA_EVOLUTION" : {
"message" : [
"Schema evolution is not supported for this streaming write:"
],
"subClass" : {
"CONTINUOUS_TRIGGER" : {
"message" : [
"Continuous triggers are not supported. Use a micro-batch trigger instead."
]
},
"NOT_V2_TABLE" : {
"message" : [
"The sink is not a V2 table. Schema evolution requires a V2 table that supports the AUTOMATIC_SCHEMA_EVOLUTION capability."
]
}
}
},
"UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY" : {
"message" : [
"Unsupported subquery expression:"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,19 @@ abstract class DataStreamWriter[T] extends WriteConfigMethods[DataStreamWriter[T
foreachBatch((batchDs: Dataset[T], batchId: Long) => function.call(batchDs, batchId))
}

/**
 * Enables automatic schema evolution for the streaming write. When enabled, if the source
 * schema has columns not present in the sink table (or type changes), the sink table schema
 * will be evolved to accommodate the new schema before data is written. The sink table must
 * support the `AUTOMATIC_SCHEMA_EVOLUTION` capability.
 *
 * Schema evolution is applied at query analysis time: when the streaming query is started
 * (or restarted after failure), the table schema is evolved if needed.
 *
 * Note: starting the query fails with `UNSUPPORTED_STREAMING_SCHEMA_EVOLUTION` when the
 * trigger is continuous (`CONTINUOUS_TRIGGER`) or when the sink is not a V2 table
 * (`NOT_V2_TABLE`).
 *
 * @since 4.2.0
 */
def withSchemaEvolution(): this.type

/**
* Starts the execution of the streaming query, which will continually output results to the
* given path as new data arrives. The returned
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ import org.apache.spark.sql.streaming.OutputMode

/**
* Used to create a [[StreamExecution]].
*
* @param withSchemaEvolution Whether to evolve the sink table schema to match the source.
*/
case class WriteToStream(
name: String,
Expand All @@ -34,7 +36,8 @@ case class WriteToStream(
deleteCheckpointOnStop: Boolean,
inputQuery: LogicalPlan,
catalogAndIdent: Option[(TableCatalog, Identifier)] = None,
catalogTable: Option[CatalogTable]) extends UnaryNode {
catalogTable: Option[CatalogTable],
withSchemaEvolution: Boolean) extends UnaryNode {

override def isStreaming: Boolean = true

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ import org.apache.spark.sql.streaming.{OutputMode, Trigger}
* for unsupported operations, which happens during resolution.
* @param inputQuery The analyzed query plan from the streaming DataFrame.
* @param catalogAndIdent Catalog and identifier for the sink, set when it is a V2 catalog table
* @param withSchemaEvolution Whether to evolve the sink table schema to match the source.
*/
case class WriteToStreamStatement(
userSpecifiedName: Option[String],
Expand All @@ -55,8 +56,9 @@ case class WriteToStreamStatement(
hadoopConf: Configuration,
trigger: Trigger,
inputQuery: LogicalPlan,
catalogAndIdent: Option[(TableCatalog, Identifier)] = None,
catalogTable: Option[CatalogTable] = None) extends UnaryNode {
catalogAndIdent: Option[(TableCatalog, Identifier)],
catalogTable: Option[CatalogTable],
withSchemaEvolution: Boolean) extends UnaryNode {

override def isStreaming: Boolean = true

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,9 @@ message WriteStreamOperationStart {

// (Optional) Columns used for clustering the table.
repeated string clustering_column_names = 15;

// (Optional) Enable automatic schema evolution for the streaming write.
bool with_schema_evolution = 16;
}

message StreamingForeachFunction {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,12 @@ final class DataStreamWriter[T] private[sql] (ds: Dataset[T])
this
}

/**
 * @inheritdoc
 *
 * Connect implementation: records the flag on the `WriteStreamOperationStart` builder;
 * the server reads it via `getWithSchemaEvolution` and applies evolution when the query
 * is started.
 */
def withSchemaEvolution(): this.type = {
// Serialized as the `with_schema_evolution` proto field of WriteStreamOperationStart.
sinkBuilder.setWithSchemaEvolution(true)
this
}

/** @inheritdoc */
def format(source: String): this.type = {
sinkBuilder.setFormat(source)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3491,6 +3491,10 @@ class SparkConnectPlanner(
writer.queryName(writeOp.getQueryName)
}

if (writeOp.getWithSchemaEvolution) {
writer.withSchemaEvolution()
}

if (writeOp.hasForeachWriter) {
if (writeOp.getForeachWriter.hasPythonFunction) {
val foreach = writeOp.getForeachWriter.getPythonFunction
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,12 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) extends streaming.D
this
}

/**
 * @inheritdoc
 *
 * Classic implementation: stores the flag locally; it is forwarded to `startQuery`
 * (and from there into `WriteToStreamStatement`) when the query is started.
 */
def withSchemaEvolution(): this.type = {
this.schemaEvolution = true
this
}

/** @inheritdoc */
def format(source: String): this.type = {
this.source = source
Expand Down Expand Up @@ -205,7 +211,8 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) extends streaming.D
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Implicits._
tableInstance match {
case t: SupportsWrite if t.supports(STREAMING_WRITE) =>
startQuery(t, extraOptions, catalogAndIdent = Some(catalog.asTableCatalog, identifier))
startQuery(t, extraOptions, catalogAndIdent = Some(catalog.asTableCatalog, identifier),
withSchemaEvolution = schemaEvolution)
case t: V2TableWithV1Fallback =>
writeToV1Table(t.v1Table)
case t: V1Table =>
Expand Down Expand Up @@ -244,7 +251,8 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) extends streaming.D
throw QueryCompilationErrors.sourceNotSupportedWithContinuousTriggerError(source)
}
val sink = new ForeachBatchSink[T](foreachBatchWriter, ds.exprEnc)
startQuery(sink, extraOptions, catalogTable = catalogTable)
startQuery(sink, extraOptions, catalogTable = catalogTable,
withSchemaEvolution = schemaEvolution)
} else {
val cls = DataSource.lookupDataSource(source, ds.sparkSession.sessionState.conf)
val disabledSources =
Expand Down Expand Up @@ -290,7 +298,8 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) extends streaming.D
createV1Sink(optionsWithPath)
}

startQuery(sink, optionsWithPath, catalogTable = catalogTable)
startQuery(sink, optionsWithPath, catalogTable = catalogTable,
withSchemaEvolution = schemaEvolution)
}
}

Expand All @@ -299,7 +308,8 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) extends streaming.D
newOptions: CaseInsensitiveMap[String],
recoverFromCheckpoint: Boolean = true,
catalogAndIdent: Option[(TableCatalog, Identifier)] = None,
catalogTable: Option[CatalogTable] = None): StreamingQuery = {
catalogTable: Option[CatalogTable] = None,
withSchemaEvolution: Boolean = false): StreamingQuery = {
if (trigger.isInstanceOf[RealTimeTrigger]) {
RealTimeModeAllowlist.checkAllowedSink(
sink,
Expand All @@ -321,7 +331,8 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) extends streaming.D
recoverFromCheckpointLocation = recoverFromCheckpoint,
trigger = trigger,
catalogAndIdent = catalogAndIdent,
catalogTable = catalogTable)
catalogTable = catalogTable,
withSchemaEvolution = withSchemaEvolution)
}

private def createV1Sink(optionsWithPath: CaseInsensitiveMap[String]): Sink = {
Expand Down Expand Up @@ -444,6 +455,8 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) extends streaming.D
private var partitioningColumns: Option[Seq[String]] = None

private var clusteringColumns: Option[Seq[String]] = None

private var schemaEvolution: Boolean = false
}

object DataStreamWriter {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import javax.annotation.concurrent.GuardedBy
import scala.collection.mutable
import scala.jdk.CollectionConverters._

import org.apache.spark.SparkIllegalArgumentException
import org.apache.spark.{SparkIllegalArgumentException, SparkUnsupportedOperationException}
import org.apache.spark.annotation.Evolving
import org.apache.spark.internal.Logging
import org.apache.spark.internal.LogKeys.{CLASS_NAME, QUERY_ID, RUN_ID}
Expand Down Expand Up @@ -186,7 +186,8 @@ class StreamingQueryManager private[sql] (
trigger: Trigger,
triggerClock: Clock,
catalogAndIdent: Option[(TableCatalog, Identifier)] = None,
catalogTable: Option[CatalogTable] = None): StreamingQueryWrapper = {
catalogTable: Option[CatalogTable] = None,
withSchemaEvolution: Boolean = false): StreamingQueryWrapper = {
val analyzedPlan = df.queryExecution.analyzed
df.queryExecution.assertAnalyzed()

Expand Down Expand Up @@ -216,14 +217,21 @@ class StreamingQueryManager private[sql] (
trigger,
analyzedPlan,
catalogAndIdent,
catalogTable)
catalogTable,
withSchemaEvolution)

val analyzedStreamWritePlan =
sparkSession.sessionState.executePlan(dataStreamWritePlan).analyzed
.asInstanceOf[WriteToStream]

(sink, trigger) match {
case (_: SupportsWrite, trigger: ContinuousTrigger) =>
if (withSchemaEvolution) {
throw new SparkUnsupportedOperationException(
errorClass =
"UNSUPPORTED_STREAMING_SCHEMA_EVOLUTION.CONTINUOUS_TRIGGER",
messageParameters = Map.empty[String, String])
}
new StreamingQueryWrapper(new ContinuousExecution(
sparkSession,
trigger,
Expand Down Expand Up @@ -287,7 +295,8 @@ class StreamingQueryManager private[sql] (
trigger: Trigger = Trigger.ProcessingTime(0),
triggerClock: Clock = new SystemClock(),
catalogAndIdent: Option[(TableCatalog, Identifier)] = None,
catalogTable: Option[CatalogTable] = None): StreamingQuery = {
catalogTable: Option[CatalogTable] = None,
withSchemaEvolution: Boolean = false): StreamingQuery = {
val query = createQuery(
userSpecifiedName,
userSpecifiedCheckpointLocation,
Expand All @@ -300,7 +309,8 @@ class StreamingQueryManager private[sql] (
trigger,
triggerClock,
catalogAndIdent,
catalogTable)
catalogTable,
withSchemaEvolution)
// scalastyle:on argcount

// The following code block checks if a stream with the same name or id is running. Then it
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ object V2Writes extends Rule[LogicalPlan] with PredicateHelper {
o.copy(write = Some(write), query = newQuery)

case WriteToMicroBatchDataSource(
relationOpt, table, query, queryId, options, outputMode, Some(batchId)) =>
relationOpt, table, query, queryId, options, outputMode, _, Some(batchId)) =>
val writeOptions = mergeOptions(
options,
relationOpt.map(r => r.options.asCaseSensitiveMap.asScala.toMap).getOrElse(Map.empty))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ import scala.util.control.NonFatal

import org.apache.hadoop.fs.Path

import org.apache.spark.{SparkIllegalArgumentException, SparkIllegalStateException}
import org.apache.spark.{SparkIllegalArgumentException, SparkIllegalStateException, SparkUnsupportedOperationException}
import org.apache.spark.internal.LogKeys
import org.apache.spark.internal.LogKeys._
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
Expand Down Expand Up @@ -354,13 +354,20 @@ class MicroBatchExecution(
}
WriteToMicroBatchDataSource(
relationOpt,
table = s,
sinkTable = s,
query = _logicalPlan,
queryId = id.toString,
extraOptions,
outputMode)
outputMode,
withSchemaEvolution = plan.withSchemaEvolution)

case s: Sink =>
if (plan.withSchemaEvolution) {
throw new SparkUnsupportedOperationException(
errorClass =
"UNSUPPORTED_STREAMING_SCHEMA_EVOLUTION.NOT_V2_TABLE",
messageParameters = Map.empty[String, String])
}
// SinkV1 is not compatible with Real-Time Mode due to API limitations.
// SinkV1 does not support writing outputs row by row.
if (trigger.isInstanceOf[RealTimeTrigger]) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,8 @@ object ResolveWriteToStream extends Rule[LogicalPlan] {
deleteCheckpointOnStop,
s.inputQuery,
s.catalogAndIdent,
s.catalogTable)
s.catalogTable,
s.withSchemaEvolution)
}

def resolveCheckpointLocation(s: WriteToStreamStatement): (String, Boolean) = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,13 @@

package org.apache.spark.sql.execution.streaming.sources

import org.apache.spark.sql.catalyst.analysis.{NamedRelation, ResolveSchemaEvolution}
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, UnaryNode}
import org.apache.spark.sql.connector.catalog.SupportsWrite
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, UnaryNode, WriteWithSchemaEvolution}
import org.apache.spark.sql.catalyst.trees.TreePattern.COMMAND
import org.apache.spark.sql.catalyst.types.DataTypeUtils
import org.apache.spark.sql.connector.catalog.{SupportsWrite, TableChange, TableWritePrivilege}
import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, ExtractV2CatalogAndIdentifier}
import org.apache.spark.sql.streaming.OutputMode

/**
Expand All @@ -29,19 +32,59 @@ import org.apache.spark.sql.streaming.OutputMode
* Note that this logical plan does not have a corresponding physical plan, as it will be converted
* to [[org.apache.spark.sql.execution.datasources.v2.WriteToDataSourceV2 WriteToDataSourceV2]]
* with [[MicroBatchWrite]] before execution.
*
* @param withSchemaEvolution Whether to evolve the sink table schema to match the source.
*/
case class WriteToMicroBatchDataSource(
relation: Option[DataSourceV2Relation],
table: SupportsWrite,
sinkTable: SupportsWrite,
query: LogicalPlan,
queryId: String,
writeOptions: Map[String, String],
outputMode: OutputMode,
override val withSchemaEvolution: Boolean,
batchId: Option[Long] = None)
extends UnaryNode {
extends UnaryNode with WriteWithSchemaEvolution {
// The input streaming query is the single child of this write node.
override def child: LogicalPlan = query
// A write command produces no output attributes.
override def output: Seq[Attribute] = Nil

// Marked with COMMAND so rules that target command tree patterns match this node.
final override val nodePatterns = Seq(COMMAND)

// The sink relation the schema-evolution framework resolves against. Only available when
// a DataSourceV2Relation was captured for the sink; evolution cannot apply otherwise.
override def table: LogicalPlan = relation.getOrElse {
throw new IllegalStateException(
"Cannot access table for schema evolution: no DataSourceV2Relation is set.")
}

// Evolution may only run once both the sink relation and the input query are resolved.
override lazy val schemaEvolutionReady: Boolean =
relation.exists(_.resolved) && query.resolved

// Computes the table changes needed to evolve the sink schema to match `query.schema`.
// Returns empty when there is no V2 relation, evolution is disabled, or the plan is not
// yet resolved enough to compare schemas.
override def pendingSchemaChanges: Seq[TableChange] = {
if (relation.isEmpty || !schemaEvolutionEnabled || !schemaEvolutionReady) {
return Seq.empty
}

val currentRelation = relation.get match {
case r @ ExtractV2CatalogAndIdentifier(catalog, ident) =>
// Loading the current table from the catalog ensures we don't use a stale schema.
val currentTable = catalog.loadTable(ident)
r.copy(
table = currentTable,
output = DataTypeUtils.toAttributes(currentTable.columns))
case r => r
}
// Changes are computed by column name (isByName = true), not by position, against the
// freshly loaded relation.
ResolveSchemaEvolution.computeSupportedSchemaChanges(
currentRelation, query.schema, isByName = true).toSeq
}

// Streaming micro-batch writes only insert data, so INSERT is the sole privilege required.
override val writePrivileges: Set[TableWritePrivilege] = Set(TableWritePrivilege.INSERT)

// Rebinds this write to a freshly loaded sink relation (e.g. after schema evolution was
// applied), keeping `relation` and `sinkTable` in sync.
// NOTE(review): assumes the new relation's table still implements SupportsWrite — the
// cast throws ClassCastException otherwise; confirm the evolution framework guarantees it.
override def withNewTable(newTable: NamedRelation): WriteToMicroBatchDataSource = {
val newRelation = newTable.asInstanceOf[DataSourceV2Relation]
copy(
relation = Some(newRelation),
sinkTable = newRelation.table.asInstanceOf[SupportsWrite])
}

/** Returns a copy of this write plan bound to the given micro-batch id. */
def withNewBatchId(batchId: Long): WriteToMicroBatchDataSource = {
val boundBatch = Option(batchId)
copy(batchId = boundBatch)
}
Expand Down
Loading