Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 82 additions & 0 deletions kernel/tests/clustering_e2e.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
use std::collections::HashMap;
use std::sync::Arc;

use delta_kernel::arrow::array::{ArrayRef, Int32Array};
use delta_kernel::committer::FileSystemCommitter;
use delta_kernel::expressions::ColumnName;
use delta_kernel::schema::{DataType, StructField, StructType};
Expand Down Expand Up @@ -138,3 +139,84 @@ async fn test_clustered_table_write_and_checkpoint(

Ok(())
}

/// Regression test: a write whose clustering column contains only nulls must not fail.
///
/// Since commit 76d480f0, `collect_stats` leaves the `minValues`/`maxValues` entries out
/// entirely for columns where every value is null. `StatsVerifier`, however, attempts to
/// pull `stats.minValues.<column>` out of the stats StructArray *before* it checks the
/// `nullCount == numRecords` condition, so the lookup fails with:
///     "Column stats.minValues.<column> not found in the data"
/// because the field is simply absent from the stats StructArray.
///
/// The verifier should tolerate a missing min/max field for a column that is all-null.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn test_clustered_table_write_all_null_clustering_column() {
    let (_temp_dir, table_path, engine) = test_table_setup_mt().unwrap();

    // Two-column schema: a required string and a nullable int we will fill with nulls.
    let fields = vec![
        StructField::new("category", DataType::STRING, false),
        StructField::new("region_id", DataType::INTEGER, true),
    ];
    let schema = Arc::new(StructType::try_new(fields).unwrap());

    // Table is clustered on both columns, so stats for region_id are mandatory input
    // to the clustering stats verification path under test.
    let layout = DataLayout::Clustered {
        columns: vec![
            ColumnName::new(["category"]),
            ColumnName::new(["region_id"]),
        ],
    };
    let created = create_table(&table_path, schema, "Test/1.0")
        .with_data_layout(layout)
        .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))
        .unwrap()
        .commit(engine.as_ref())
        .unwrap();

    let initial_snapshot = match created {
        CommitResult::CommittedTransaction(txn) => txn
            .post_commit_snapshot()
            .expect("post-commit snapshot should exist")
            .clone(),
        other => panic!("Expected CommittedTransaction, got: {other:?}"),
    };

    // Build a batch in which region_id is entirely null. An all-null clustering
    // column is legal data, so the write is expected to succeed.
    let null_region: ArrayRef = Arc::new(Int32Array::from(vec![None, None, None]));
    let batch = generate_batch(vec![
        ("category", vec!["a", "b", "c"].into_array()),
        ("region_id", null_region),
    ])
    .unwrap();

    // BUG: currently fails with "Column stats.minValues.region_id not found in the data"
    // — collect_stats omits minValues/maxValues for all-null columns, while StatsVerifier
    // extracts the column before consulting nullCount == numRecords.
    let snapshot = write_batch_to_table(&initial_snapshot, engine.as_ref(), batch, HashMap::new())
        .await
        .unwrap();
    assert_eq!(snapshot.version(), 1);

    // The written rows must round-trip through a scan.
    let scan = snapshot.clone().scan_builder().build().unwrap();
    let row_count: usize = read_scan(&scan, engine.clone())
        .unwrap()
        .iter()
        .map(|b| b.num_rows())
        .sum();
    assert_eq!(row_count, 3);

    // Stats contract: nullCount records all 3 nulls, while minValues/maxValues carry
    // no entry at all for region_id — there is nothing to aggregate.
    let add_infos = read_add_infos(&snapshot, engine.as_ref()).unwrap();
    assert_eq!(add_infos.len(), 1);
    let stats = add_infos[0].stats.as_ref().expect("should have stats");
    assert_eq!(stats["numRecords"], 3);
    assert_eq!(stats["nullCount"]["region_id"], 3);
    assert!(
        stats["minValues"].get("region_id").is_none(),
        "minValues should not contain region_id when all values are null"
    );
    assert!(
        stats["maxValues"].get("region_id").is_none(),
        "maxValues should not contain region_id when all values are null"
    );
}
Loading