Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 82 additions & 0 deletions kernel/tests/clustering_e2e.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
use std::collections::HashMap;
use std::sync::Arc;

use delta_kernel::arrow::array::{ArrayRef, Int32Array};
use delta_kernel::committer::FileSystemCommitter;
use delta_kernel::expressions::ColumnName;
use delta_kernel::schema::{DataType, StructField, StructType};
Expand Down Expand Up @@ -138,3 +139,84 @@ async fn test_clustered_table_write_and_checkpoint(

Ok(())
}

/// Regression test: a write whose clustering column contains only nulls must not fail.
///
/// Since commit 76d480f0, `collect_stats` leaves the `minValues`/`maxValues` entries out
/// entirely for columns where every value is null. `StatsVerifier`, however, attempts to
/// pull `stats.minValues.<column>` out of the stats StructArray *before* it checks the
/// `nullCount == numRecords` condition, so the lookup fails with:
///     "Column stats.minValues.<column> not found in the data"
/// because the field is simply absent from the stats StructArray.
///
/// The verifier should tolerate a missing min/max field for a column that is all-null.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn test_clustered_table_write_all_null_clustering_column() {
    let (_temp_dir, table_path, engine) = test_table_setup_mt().unwrap();

    // Two-column schema: a required string and a nullable int we will fill with nulls.
    let fields = vec![
        StructField::new("category", DataType::STRING, false),
        StructField::new("region_id", DataType::INTEGER, true),
    ];
    let schema = Arc::new(StructType::try_new(fields).unwrap());

    // Table is clustered on both columns, so stats for region_id are mandatory input
    // to the clustering stats verification path under test.
    let layout = DataLayout::Clustered {
        columns: vec![
            ColumnName::new(["category"]),
            ColumnName::new(["region_id"]),
        ],
    };
    let created = create_table(&table_path, schema, "Test/1.0")
        .with_data_layout(layout)
        .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))
        .unwrap()
        .commit(engine.as_ref())
        .unwrap();

    let initial_snapshot = match created {
        CommitResult::CommittedTransaction(txn) => txn
            .post_commit_snapshot()
            .expect("post-commit snapshot should exist")
            .clone(),
        other => panic!("Expected CommittedTransaction, got: {other:?}"),
    };

    // Build a batch in which region_id is entirely null. An all-null clustering
    // column is legal data, so the write is expected to succeed.
    let null_region: ArrayRef = Arc::new(Int32Array::from(vec![None, None, None]));
    let batch = generate_batch(vec![
        ("category", vec!["a", "b", "c"].into_array()),
        ("region_id", null_region),
    ])
    .unwrap();

    // BUG: currently fails with "Column stats.minValues.region_id not found in the data"
    // — collect_stats omits minValues/maxValues for all-null columns, while StatsVerifier
    // extracts the column before consulting nullCount == numRecords.
    let snapshot = write_batch_to_table(&initial_snapshot, engine.as_ref(), batch, HashMap::new())
        .await
        .unwrap();
    assert_eq!(snapshot.version(), 1);

    // The written rows must round-trip through a scan.
    let scan = snapshot.clone().scan_builder().build().unwrap();
    let row_count: usize = read_scan(&scan, engine.clone())
        .unwrap()
        .iter()
        .map(|b| b.num_rows())
        .sum();
    assert_eq!(row_count, 3);

    // Stats contract: nullCount records all 3 nulls, while minValues/maxValues carry
    // no entry at all for region_id — there is nothing to aggregate.
    let add_infos = read_add_infos(&snapshot, engine.as_ref()).unwrap();
    assert_eq!(add_infos.len(), 1);
    let stats = add_infos[0].stats.as_ref().expect("should have stats");
    assert_eq!(stats["numRecords"], 3);
    assert_eq!(stats["nullCount"]["region_id"], 3);
    assert!(
        stats["minValues"].get("region_id").is_none(),
        "minValues should not contain region_id when all values are null"
    );
    assert!(
        stats["maxValues"].get("region_id").is_none(),
        "maxValues should not contain region_id when all values are null"
    );
}
Loading