diff --git a/kernel/tests/clustering_e2e.rs b/kernel/tests/clustering_e2e.rs
index fbecfa9636..5ab1c0224d 100644
--- a/kernel/tests/clustering_e2e.rs
+++ b/kernel/tests/clustering_e2e.rs
@@ -7,6 +7,7 @@
 use std::collections::HashMap;
 use std::sync::Arc;
 
+use delta_kernel::arrow::array::{ArrayRef, Int32Array};
 use delta_kernel::committer::FileSystemCommitter;
 use delta_kernel::expressions::ColumnName;
 use delta_kernel::schema::{DataType, StructField, StructType};
@@ -138,3 +139,84 @@ async fn test_clustered_table_write_and_checkpoint(
 
     Ok(())
 }
+
+/// Regression test: writing a batch where a clustering column has ALL null values should succeed.
+///
+/// `collect_stats` (commit 76d480f0) omits `minValues`/`maxValues` fields for all-null columns,
+/// but `StatsVerifier` tries to extract `stats.minValues.<column>` before checking the
+/// `nullCount == numRecords` condition. The column extraction fails with:
+/// "Column stats.minValues.<column> not found in the data"
+/// because the field is missing from the stats StructArray entirely.
+///
+/// The verifier should tolerate missing min/max fields when the column is all-null.
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn test_clustered_table_write_all_null_clustering_column() {
+    let (_temp_dir, table_path, engine) = test_table_setup_mt().unwrap();
+    let schema = Arc::new(
+        StructType::try_new(vec![
+            StructField::new("category", DataType::STRING, false),
+            StructField::new("region_id", DataType::INTEGER, true),
+        ])
+        .unwrap(),
+    );
+
+    // Create table clustered on "category" and "region_id"
+    let create_result = create_table(&table_path, schema, "Test/1.0")
+        .with_data_layout(DataLayout::Clustered {
+            columns: vec![
+                ColumnName::new(["category"]),
+                ColumnName::new(["region_id"]),
+            ],
+        })
+        .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))
+        .unwrap()
+        .commit(engine.as_ref())
+        .unwrap();
+
+    let snapshot = match create_result {
+        CommitResult::CommittedTransaction(committed) => committed
+            .post_commit_snapshot()
+            .expect("post-commit snapshot should exist")
+            .clone(),
+        other => panic!("Expected CommittedTransaction, got: {other:?}"),
+    };
+
+    // Write a batch where region_id is ALL nulls.
+    // This should succeed — all-null clustering columns are valid.
+    let all_null_region: ArrayRef = Arc::new(Int32Array::from(vec![None, None, None]));
+    let batch = generate_batch(vec![
+        ("category", vec!["a", "b", "c"].into_array()),
+        ("region_id", all_null_region),
+    ])
+    .unwrap();
+
+    // BUG: This fails with "Column stats.minValues.region_id not found in the data"
+    // because collect_stats omits minValues/maxValues for all-null columns,
+    // but StatsVerifier tries to extract the column before checking nullCount == numRecords.
+    let snapshot = write_batch_to_table(&snapshot, engine.as_ref(), batch, HashMap::new())
+        .await
+        .unwrap();
+    assert_eq!(snapshot.version(), 1);
+
+    // Verify data is readable
+    let scan = snapshot.clone().scan_builder().build().unwrap();
+    let batches = read_scan(&scan, engine.clone()).unwrap();
+    let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();
+    assert_eq!(total_rows, 3);
+
+    // Verify stats: region_id should have nullCount=3, but minValues/maxValues
+    // should NOT contain region_id (since all values are null, there's nothing to aggregate)
+    let add_infos = read_add_infos(&snapshot, engine.as_ref()).unwrap();
+    assert_eq!(add_infos.len(), 1);
+    let stats = add_infos[0].stats.as_ref().expect("should have stats");
+    assert_eq!(stats["numRecords"], 3);
+    assert_eq!(stats["nullCount"]["region_id"], 3);
+    assert!(
+        stats["minValues"].get("region_id").is_none(),
+        "minValues should not contain region_id when all values are null"
+    );
+    assert!(
+        stats["maxValues"].get("region_id").is_none(),
+        "maxValues should not contain region_id when all values are null"
+    );
+}