diff --git a/include/paimon/read_context.h b/include/paimon/read_context.h index df124b0a0..a25d00408 100644 --- a/include/paimon/read_context.h +++ b/include/paimon/read_context.h @@ -23,6 +23,7 @@ #include #include +#include "arrow/c/abi.h" #include "paimon/cache/cache.h" #include "paimon/predicate/predicate.h" #include "paimon/result.h" @@ -44,7 +45,7 @@ class FileSystem; class PAIMON_EXPORT ReadContext { public: ReadContext(const std::string& path, const std::string& branch, - const std::vector& read_schema, + const std::vector& read_field_names, const std::vector& read_field_ids, const std::shared_ptr& predicate, bool enable_predicate_filter, bool enable_prefetch, uint32_t prefetch_batch_count, @@ -75,8 +76,8 @@ class PAIMON_EXPORT ReadContext { return options_; } - const std::vector& GetReadSchema() const { - return read_schema_; + const std::vector& GetReadFieldNames() const { + return read_field_names_; } const std::vector& GetReadFieldIds() const { @@ -130,10 +131,25 @@ class PAIMON_EXPORT ReadContext { return cache_; } + /// Whether a read schema (C ArrowSchema) for nested column pruning was provided. + bool HasReadSchema() const { + return read_schema_ != nullptr; + } + + /// Get the read schema as a mutable C ArrowSchema pointer. + /// ImportSchema will consume (release) the schema content. + ArrowSchema* GetReadSchema() { + return read_schema_; + } + + /// Set the read schema from a C ArrowSchema pointer. Does NOT take ownership. + /// Called internally by ReadContextBuilder. 
+ void SetReadSchema(ArrowSchema* schema); + private: std::string path_; std::string branch_; - std::vector read_schema_; + std::vector read_field_names_; std::vector read_field_ids_; std::shared_ptr predicate_; bool enable_predicate_filter_; @@ -151,6 +167,7 @@ class PAIMON_EXPORT ReadContext { PrefetchCacheMode prefetch_cache_mode_; CacheConfig cache_config_; std::shared_ptr cache_; + ArrowSchema* read_schema_ = nullptr; }; /// `ReadContextBuilder` used to build a `ReadContext`, has input validation. @@ -173,9 +190,9 @@ class PAIMON_EXPORT ReadContextBuilder { /// /// @param read_field_names Vector of field names to read from the table. /// @return Reference to this builder for method chaining. - /// @note Currently supports top-level field selection. Future versions may support - /// nested field selection using ArrowSchema for more granular projection - ReadContextBuilder& SetReadSchema(const std::vector& read_field_names); + /// @note Currently supports top-level field selection. For nested field selection + /// use SetReadSchema(ArrowSchema*) instead. + ReadContextBuilder& SetReadFieldNames(const std::vector& read_field_names); /// Set the schema fields to read from the table. /// /// If not set, all fields from the table schema will be read. This is useful for @@ -184,12 +201,24 @@ class PAIMON_EXPORT ReadContextBuilder { /// /// @param read_field_ids Vector of field ids to read from the table. /// @return Reference to this builder for method chaining. - /// @note Currently supports top-level field selection. Future versions may support - /// nested field selection using ArrowSchema for more granular projection. - /// @note SetReadFieldIds() and SetReadSchema() are mutually exclusive. - /// Calling both will ignore the read schema set by SetReadSchema(). + /// @note Currently supports top-level field selection. + /// @note SetReadFieldIds() and SetReadFieldNames() are mutually exclusive. 
+ /// Calling both will ignore the field names set by SetReadFieldNames(). ReadContextBuilder& SetReadFieldIds(const std::vector& read_field_ids); + /// Set the read Arrow Schema for nested column pruning. + /// + /// The read schema is an Arrow C Data Interface schema where STRUCT types + /// may contain only a subset of the original sub-fields, enabling nested column + /// pruning to reduce I/O. Fields are matched to the table schema by name; any + /// "paimon.id" metadata on the read schema is ignored in favor of the table's ids. + /// + /// @param read_schema Arrow C Schema. The caller retains ownership. + /// @return Reference to this builder for method chaining. + /// @note Priority: read_schema > read_field_ids > read_field_names. + /// When set, read_field_ids and read_field_names are ignored. + ReadContextBuilder& SetReadSchema(ArrowSchema* read_schema); + /// Set a configuration options map to set some option entries which are not defined in the /// table schema or whose values you want to overwrite. /// @note The options map will clear the options added by `AddOption()` before. 
diff --git a/src/paimon/CMakeLists.txt b/src/paimon/CMakeLists.txt index 65026bfe3..a4c672c62 100644 --- a/src/paimon/CMakeLists.txt +++ b/src/paimon/CMakeLists.txt @@ -342,6 +342,7 @@ set(PAIMON_CORE_SRCS core/utils/blob_view_lookup.cpp core/utils/consumer_manager.cpp core/utils/field_mapping.cpp + core/utils/nested_projection_utils.cpp core/utils/file_store_path_factory.cpp core/utils/file_utils.cpp core/utils/manifest_meta_reader.cpp @@ -735,6 +736,7 @@ if(PAIMON_BUILD_TESTS) core/utils/consumer_manager_test.cpp core/utils/file_store_path_factory_cache_test.cpp core/utils/field_mapping_test.cpp + core/utils/nested_projection_utils_test.cpp core/utils/file_store_path_factory_test.cpp core/utils/file_utils_test.cpp core/utils/manifest_meta_reader_test.cpp diff --git a/src/paimon/common/memory/memory_segment_test.cpp b/src/paimon/common/memory/memory_segment_test.cpp index 95a7a6ed9..c127efb52 100644 --- a/src/paimon/common/memory/memory_segment_test.cpp +++ b/src/paimon/common/memory/memory_segment_test.cpp @@ -555,10 +555,7 @@ TEST(MemorySegmentTest, TestDoubleAccess) { delete[] occupied; } -// ------------------------------------------------------------------------ -// Bulk Byte Movements -// ------------------------------------------------------------------------ - +// Bulk Byte Movements TEST(MemorySegmentTest, TestBulkByteAccess) { auto pool = paimon::GetDefaultPool(); // test expected correct behavior with default offset / length diff --git a/src/paimon/common/types/data_field.h b/src/paimon/common/types/data_field.h index 7aa339944..210db42ea 100644 --- a/src/paimon/common/types/data_field.h +++ b/src/paimon/common/types/data_field.h @@ -41,6 +41,9 @@ class DataField : public Jsonizable { static constexpr char FIELD_ID[] = "paimon.id"; static constexpr char DESCRIPTION[] = "paimon.description"; + /// Metadata key for map field selected keys. The value is a comma-separated + /// string of key names, e.g. 'key1,key2'. Only string-keyed maps are supported. 
+ static constexpr char MAP_SELECTED_KEYS[] = "paimon.map.selected-keys"; public: static std::shared_ptr ConvertDataFieldToArrowField(const DataField& field); diff --git a/src/paimon/core/global_index/global_index_write_task.cpp b/src/paimon/core/global_index/global_index_write_task.cpp index 5ee425f86..b0cbfb1e0 100644 --- a/src/paimon/core/global_index/global_index_write_task.cpp +++ b/src/paimon/core/global_index/global_index_write_task.cpp @@ -83,7 +83,7 @@ Result> CreateBatchReader( .WithFileSystem(core_options.GetFileSystem()) .EnablePrefetch(true) .WithMemoryPool(pool) - .SetReadSchema({field_name, SpecialFields::RowId().Name()}); + .SetReadFieldNames({field_name, SpecialFields::RowId().Name()}); PAIMON_ASSIGN_OR_RAISE(std::unique_ptr read_context, read_context_builder.Finish()); PAIMON_ASSIGN_OR_RAISE(std::unique_ptr table_read, diff --git a/src/paimon/core/io/field_mapping_reader.cpp b/src/paimon/core/io/field_mapping_reader.cpp index 767e0db6c..4249c55d5 100644 --- a/src/paimon/core/io/field_mapping_reader.cpp +++ b/src/paimon/core/io/field_mapping_reader.cpp @@ -17,6 +17,7 @@ #include #include +#include #include #include "arrow/api.h" @@ -35,6 +36,7 @@ #include "paimon/core/casting/cast_executor.h" #include "paimon/core/casting/casting_utils.h" #include "paimon/core/utils/field_mapping.h" +#include "paimon/core/utils/nested_projection_utils.h" #include "paimon/memory/bytes.h" #include "paimon/reader/batch_reader.h" @@ -74,6 +76,23 @@ FieldMappingReader::FieldMappingReader(int32_t field_count, non_partition_info_.non_partition_read_schema[i].Name()) { need_mapping_ = true; } + // Map selected-keys metadata also requires mapping so that + // FilterMapArrayBySelectedKeys can filter out unwanted entries. 
+ if (!need_mapping_ && + non_partition_info_.non_partition_read_schema[i].Type()->id() == arrow::Type::MAP) { + auto selected_keys_or = NestedProjectionUtils::GetMapSelectedKeys( + non_partition_info_.non_partition_read_schema[i].ArrowField()); + if (!selected_keys_or.ok()) { + // Keep mapping enabled so the parse error can be surfaced in + // MappingFields where Status can be returned. + need_mapping_ = true; + continue; + } + auto& selected_keys = selected_keys_or.value(); + if (!selected_keys.empty()) { + need_mapping_ = true; + } + } } } @@ -142,9 +161,9 @@ Result FieldMappingReader::NextBatchWithBitmap // mapping non-partition array PAIMON_ASSIGN_OR_RAISE(std::shared_ptr casted_non_partition_array, CastNonPartitionArrayIfNeed(non_partition_array)); - MappingFields(casted_non_partition_array, non_partition_info_.non_partition_read_schema, - non_partition_info_.idx_in_target_read_schema, &target_array, - &target_field_names); + PAIMON_RETURN_NOT_OK(MappingFields( + casted_non_partition_array, non_partition_info_.non_partition_read_schema, + non_partition_info_.idx_in_target_read_schema, &target_array, &target_field_names)); // mapping partition array if (partition_info_ != std::nullopt) { @@ -153,9 +172,9 @@ Result FieldMappingReader::NextBatchWithBitmap GeneratePartitionArray(non_partition_array->length())); } auto trim_partition_array = partition_array_->Slice(0, non_partition_array->length()); - MappingFields(trim_partition_array, partition_info_.value().partition_read_schema, - partition_info_.value().idx_in_target_read_schema, &target_array, - &target_field_names); + PAIMON_RETURN_NOT_OK(MappingFields( + trim_partition_array, partition_info_.value().partition_read_schema, + partition_info_.value().idx_in_target_read_schema, &target_array, &target_field_names)); } // mapping non-exist array if (non_exist_field_info_ != std::nullopt) { @@ -164,9 +183,10 @@ Result FieldMappingReader::NextBatchWithBitmap GenerateNonExistArray(non_partition_array->length())); } 
auto trim_non_exist_array = non_exist_array_->Slice(0, non_partition_array->length()); - MappingFields(trim_non_exist_array, non_exist_field_info_.value().non_exist_read_schema, - non_exist_field_info_.value().idx_in_target_read_schema, &target_array, - &target_field_names); + PAIMON_RETURN_NOT_OK(MappingFields(trim_non_exist_array, + non_exist_field_info_.value().non_exist_read_schema, + non_exist_field_info_.value().idx_in_target_read_schema, + &target_array, &target_field_names)); } // construct target array @@ -283,20 +303,33 @@ Result> FieldMappingReader::GenerateNonExistArray( return arrow_array; } -void FieldMappingReader::MappingFields(const std::shared_ptr& data_array, - const std::vector& read_fields_of_data_array, - const std::vector& idx_in_target_schema, - arrow::ArrayVector* target_array, - std::vector* target_field_names) { +Status FieldMappingReader::MappingFields(const std::shared_ptr& data_array, + const std::vector& read_fields_of_data_array, + const std::vector& idx_in_target_schema, + arrow::ArrayVector* target_array, + std::vector* target_field_names) { auto* struct_array = arrow::internal::checked_cast(data_array.get()); assert(struct_array); assert(struct_array->fields().size() == idx_in_target_schema.size()); for (size_t i = 0; i < idx_in_target_schema.size(); i++) { - // target type may be string type, but after adapter transform, type may be dictionary, - // need reconstruct struct type - (*target_array)[idx_in_target_schema[i]] = struct_array->field(i); + std::shared_ptr field_array = struct_array->field(i); + + // Filter map entries by selected keys if metadata is present. 
+ if (field_array->type()->id() == arrow::Type::MAP) { + PAIMON_ASSIGN_OR_RAISE(std::vector selected_keys, + NestedProjectionUtils::GetMapSelectedKeys( + read_fields_of_data_array[i].ArrowField())); + if (!selected_keys.empty()) { + PAIMON_ASSIGN_OR_RAISE(field_array, + NestedProjectionUtils::FilterMapArrayBySelectedKeys( + field_array, selected_keys)); + } + } + + (*target_array)[idx_in_target_schema[i]] = std::move(field_array); (*target_field_names)[idx_in_target_schema[i]] = read_fields_of_data_array[i].Name(); } + return Status::OK(); } } // namespace paimon diff --git a/src/paimon/core/io/field_mapping_reader.h b/src/paimon/core/io/field_mapping_reader.h index ffd18bd68..3e7611a74 100644 --- a/src/paimon/core/io/field_mapping_reader.h +++ b/src/paimon/core/io/field_mapping_reader.h @@ -96,11 +96,11 @@ class FieldMappingReader : public FileBatchReader { Result> CastNonPartitionArrayIfNeed( const std::shared_ptr& src_array) const; - static void MappingFields(const std::shared_ptr& src_array, - const std::vector& read_fields_of_data_array, - const std::vector& idx_in_target_schema, - arrow::ArrayVector* target_array, - std::vector* target_field_names); + static Status MappingFields(const std::shared_ptr& src_array, + const std::vector& read_fields_of_data_array, + const std::vector& idx_in_target_schema, + arrow::ArrayVector* target_array, + std::vector* target_field_names); private: bool need_mapping_ = false; diff --git a/src/paimon/core/io/field_mapping_reader_test.cpp b/src/paimon/core/io/field_mapping_reader_test.cpp index 7b916b914..fbde8812e 100644 --- a/src/paimon/core/io/field_mapping_reader_test.cpp +++ b/src/paimon/core/io/field_mapping_reader_test.cpp @@ -45,6 +45,7 @@ #include "paimon/memory/memory_pool.h" #include "paimon/predicate/literal.h" #include "paimon/predicate/predicate_builder.h" +#include "paimon/testing/mock/mock_file_batch_reader.h" #include "paimon/testing/utils/binary_row_generator.h" #include 
"paimon/testing/utils/read_result_collector.h" #include "paimon/testing/utils/testharness.h" diff --git a/src/paimon/core/operation/internal_read_context.cpp b/src/paimon/core/operation/internal_read_context.cpp index 9ff2b4647..53ebee558 100644 --- a/src/paimon/core/operation/internal_read_context.cpp +++ b/src/paimon/core/operation/internal_read_context.cpp @@ -16,19 +16,159 @@ #include "paimon/core/operation/internal_read_context.h" +#include +#include #include +#include "arrow/api.h" +#include "arrow/c/abi.h" +#include "arrow/c/bridge.h" +#include "fmt/format.h" #include "paimon/common/predicate/predicate_validator.h" #include "paimon/common/table/special_fields.h" #include "paimon/common/types/data_field.h" +#include "paimon/common/utils/arrow/status_utils.h" #include "paimon/core/schema/arrow_schema_validator.h" +#include "paimon/core/utils/nested_projection_utils.h" #include "paimon/status.h" -namespace arrow { -class Schema; -} // namespace arrow - namespace paimon { + +std::shared_ptr InternalReadContext::MergeReadFieldMetadata( + const std::shared_ptr& aligned_field, + const std::shared_ptr& read_field) { + if (!read_field->HasMetadata() || !read_field->metadata()) { + return aligned_field; + } + std::unordered_map metadata_map; + read_field->metadata()->ToUnorderedMap(&metadata_map); + metadata_map.erase(DataField::FIELD_ID); + if (metadata_map.empty()) { + return aligned_field; + } + auto metadata = std::make_shared(metadata_map); + return aligned_field->WithMergedMetadata(metadata); +} + +Result> InternalReadContext::AlignReadFieldWithTableFieldIds( + const std::shared_ptr& read_field, + const std::shared_ptr& table_field) { + if (read_field->type()->id() != table_field->type()->id()) { + return Status::Invalid(fmt::format( + "Read schema field '{}' type {} does not match table field type {}", read_field->name(), + read_field->type()->ToString(), table_field->type()->ToString())); + } + + auto type_id = read_field->type()->id(); + if (type_id == 
arrow::Type::STRUCT) { + auto read_struct = std::static_pointer_cast(read_field->type()); + auto table_struct = std::static_pointer_cast(table_field->type()); + arrow::FieldVector rebased_children; + rebased_children.reserve(read_struct->num_fields()); + for (const auto& read_child : read_struct->fields()) { + auto table_child = + NestedProjectionUtils::FindFieldByName(table_struct->fields(), read_child->name()); + if (!table_child) { + return Status::Invalid( + fmt::format("Read schema nested field '{}' does not exist in table field '{}'", + read_child->name(), table_field->name())); + } + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr rebased_child, + AlignReadFieldWithTableFieldIds(read_child, table_child)); + rebased_children.push_back(rebased_child); + } + auto rebased_type = arrow::struct_(rebased_children); + auto aligned_field = table_field->WithType(rebased_type)->WithName(read_field->name()); + return MergeReadFieldMetadata(aligned_field, read_field); + } + + if (type_id == arrow::Type::LIST) { + auto read_list = std::static_pointer_cast(read_field->type()); + auto table_list = std::static_pointer_cast(table_field->type()); + PAIMON_ASSIGN_OR_RAISE( + std::shared_ptr rebased_value_field, + AlignReadFieldWithTableFieldIds(read_list->value_field(), table_list->value_field())); + auto rebased_type = arrow::list(rebased_value_field); + auto aligned_field = table_field->WithType(rebased_type)->WithName(read_field->name()); + return MergeReadFieldMetadata(aligned_field, read_field); + } + + if (type_id == arrow::Type::MAP) { + auto read_map = std::static_pointer_cast(read_field->type()); + auto table_map = std::static_pointer_cast(table_field->type()); + PAIMON_ASSIGN_OR_RAISE( + std::shared_ptr rebased_key_field, + AlignReadFieldWithTableFieldIds(read_map->key_field(), table_map->key_field())); + PAIMON_ASSIGN_OR_RAISE( + std::shared_ptr rebased_item_field, + AlignReadFieldWithTableFieldIds(read_map->item_field(), table_map->item_field())); + auto rebased_type = 
arrow::map(rebased_key_field->type(), rebased_item_field); + auto aligned_field = table_field->WithType(rebased_type)->WithName(read_field->name()); + return MergeReadFieldMetadata(aligned_field, read_field); + } + + if (!read_field->type()->Equals(table_field->type())) { + return Status::Invalid(fmt::format( + "Read schema field '{}' type {} does not match table field type {}", read_field->name(), + read_field->type()->ToString(), table_field->type()->ToString())); + } + + auto aligned_field = table_field->WithType(read_field->type())->WithName(read_field->name()); + return MergeReadFieldMetadata(aligned_field, read_field); +} + +std::optional InternalReadContext::TryResolveSpecialFieldById( + int32_t field_id, const CoreOptions& core_options) { + if (field_id == SpecialFields::ValueKind().Id()) { + return SpecialFields::ValueKind(); + } + if (field_id == SpecialFields::RowId().Id()) { + if (core_options.RowTrackingEnabled()) { + return SpecialFields::RowId(); + } + return std::nullopt; + } + if (field_id == SpecialFields::SequenceNumber().Id()) { + if (core_options.RowTrackingEnabled() || core_options.KeyValueSequenceNumberEnabled()) { + return SpecialFields::SequenceNumber(); + } + return std::nullopt; + } + if (field_id == SpecialFields::IndexScore().Id()) { + if (core_options.DataEvolutionEnabled()) { + return SpecialFields::IndexScore(); + } + return std::nullopt; + } + return std::nullopt; +} + +std::optional InternalReadContext::TryResolveSpecialFieldByName( + const std::string& name, const CoreOptions& core_options) { + if (name == SpecialFields::ValueKind().Name()) { + return SpecialFields::ValueKind(); + } + if (name == SpecialFields::RowId().Name()) { + if (core_options.RowTrackingEnabled()) { + return SpecialFields::RowId(); + } + return std::nullopt; + } + if (name == SpecialFields::SequenceNumber().Name()) { + if (core_options.RowTrackingEnabled() || core_options.KeyValueSequenceNumberEnabled()) { + return SpecialFields::SequenceNumber(); + } + 
return std::nullopt; + } + if (name == SpecialFields::IndexScore().Name()) { + if (core_options.DataEvolutionEnabled()) { + return SpecialFields::IndexScore(); + } + return std::nullopt; + } + return std::nullopt; +} + Result> InternalReadContext::Create( const std::shared_ptr& context, const std::shared_ptr& table_schema, const std::map& options) { @@ -37,53 +177,45 @@ Result> InternalReadContext::Create( context->GetFileSystemSchemeToIdentifierMap())); core_options.WithCache(context->GetCache()); // prepare read schema + // Priority: projected_arrow_schema > read_field_ids > read_field_names std::vector read_data_fields; - if (!context->GetReadFieldIds().empty()) { - read_data_fields.reserve(context->GetReadFieldIds().size()); - for (const auto& field_id : context->GetReadFieldIds()) { - // if enable row tracking or data evolution, check special fields - if (core_options.RowTrackingEnabled() && field_id == SpecialFields::RowId().Id()) { - read_data_fields.push_back(SpecialFields::RowId()); + if (context->HasReadSchema()) { + // Nested column pruning path: user provided a read C ArrowSchema + // where STRUCT types may contain only a subset of sub-fields. + // ImportSchema consumes the C schema — that's fine, it's one-shot usage. + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr read_schema, + arrow::ImportSchema(context->GetReadSchema())); + read_data_fields.reserve(read_schema->num_fields()); + // Align special-field validation with read_field_ids/read_field_names branches. 
+ for (const auto& read_field : read_schema->fields()) { + if (auto resolved_special_field = + TryResolveSpecialFieldByName(read_field->name(), core_options)) { + read_data_fields.push_back(*resolved_special_field); continue; } - if ((core_options.RowTrackingEnabled() || - core_options.KeyValueSequenceNumberEnabled()) && - field_id == SpecialFields::SequenceNumber().Id()) { - read_data_fields.push_back(SpecialFields::SequenceNumber()); - continue; - } - if (field_id == SpecialFields::ValueKind().Id()) { - read_data_fields.push_back(SpecialFields::ValueKind()); - continue; - } - if (core_options.DataEvolutionEnabled() && - field_id == SpecialFields::IndexScore().Id()) { - read_data_fields.push_back(SpecialFields::IndexScore()); + PAIMON_ASSIGN_OR_RAISE(DataField table_field, + table_schema->GetField(read_field->name())); + PAIMON_ASSIGN_OR_RAISE( + std::shared_ptr aligned_field, + AlignReadFieldWithTableFieldIds(read_field, table_field.ArrowField())); + read_data_fields.emplace_back(table_field.Id(), aligned_field, + table_field.Description()); + } + } else if (!context->GetReadFieldIds().empty()) { + read_data_fields.reserve(context->GetReadFieldIds().size()); + for (const auto& field_id : context->GetReadFieldIds()) { + if (auto resolved_special_field = TryResolveSpecialFieldById(field_id, core_options)) { + read_data_fields.push_back(*resolved_special_field); continue; } PAIMON_ASSIGN_OR_RAISE(DataField field, table_schema->GetField(field_id)); read_data_fields.push_back(field); } - } else if (!context->GetReadSchema().empty()) { - read_data_fields.reserve(context->GetReadSchema().size()); - for (const auto& name : context->GetReadSchema()) { - // if enable row tracking or data evolution, check special fields - if (core_options.RowTrackingEnabled() && name == SpecialFields::RowId().Name()) { - read_data_fields.push_back(SpecialFields::RowId()); - continue; - } - if ((core_options.RowTrackingEnabled() || - core_options.KeyValueSequenceNumberEnabled()) && - name == 
SpecialFields::SequenceNumber().Name()) { - read_data_fields.push_back(SpecialFields::SequenceNumber()); - continue; - } - if (name == SpecialFields::ValueKind().Name()) { - read_data_fields.push_back(SpecialFields::ValueKind()); - continue; - } - if (core_options.DataEvolutionEnabled() && name == SpecialFields::IndexScore().Name()) { - read_data_fields.push_back(SpecialFields::IndexScore()); + } else if (!context->GetReadFieldNames().empty()) { + read_data_fields.reserve(context->GetReadFieldNames().size()); + for (const auto& name : context->GetReadFieldNames()) { + if (auto resolved_special_field = TryResolveSpecialFieldByName(name, core_options)) { + read_data_fields.push_back(*resolved_special_field); continue; } PAIMON_ASSIGN_OR_RAISE(DataField field, table_schema->GetField(name)); diff --git a/src/paimon/core/operation/internal_read_context.h b/src/paimon/core/operation/internal_read_context.h index 12b734a62..7b448946c 100644 --- a/src/paimon/core/operation/internal_read_context.h +++ b/src/paimon/core/operation/internal_read_context.h @@ -112,6 +112,17 @@ class InternalReadContext { const std::shared_ptr& read_schema, const CoreOptions& options); + static std::optional TryResolveSpecialFieldById(int32_t field_id, + const CoreOptions& core_options); + static std::optional TryResolveSpecialFieldByName(const std::string& name, + const CoreOptions& core_options); + static std::shared_ptr MergeReadFieldMetadata( + const std::shared_ptr& aligned_field, + const std::shared_ptr& read_field); + static Result> AlignReadFieldWithTableFieldIds( + const std::shared_ptr& read_field, + const std::shared_ptr& table_field); + std::shared_ptr read_context_; std::shared_ptr table_schema_; std::shared_ptr read_schema_; diff --git a/src/paimon/core/operation/internal_read_context_test.cpp b/src/paimon/core/operation/internal_read_context_test.cpp index e48336b3f..2ccd9107e 100644 --- a/src/paimon/core/operation/internal_read_context_test.cpp +++ 
b/src/paimon/core/operation/internal_read_context_test.cpp @@ -50,7 +50,7 @@ TEST(InternalReadContext, TestReadWithUnspecifiedSchema) { TEST(InternalReadContext, TestReadWithSpecifiedSchema) { std::string path = paimon::test::GetDataDir() + "/orc/append_09.db/append_09"; ReadContextBuilder context_builder(path); - context_builder.SetReadSchema({"f3", "f0"}); + context_builder.SetReadFieldNames({"f3", "f0"}); ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish()); SchemaManager schema_manager(std::make_shared(), read_context->GetPath()); ASSERT_OK_AND_ASSIGN(auto table_schema, schema_manager.ReadSchema(0)); @@ -84,7 +84,7 @@ TEST(InternalReadContext, TestReadWithSpecifiedFieldIdAndSchema) { ReadContextBuilder context_builder(path); // read schema is specified, read fields in schema // will use field ids instead of field names. - context_builder.SetReadSchema({"f0"}); + context_builder.SetReadFieldNames({"f0"}); context_builder.SetReadFieldIds({3, 0}); ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish()); SchemaManager schema_manager(std::make_shared(), read_context->GetPath()); @@ -103,7 +103,8 @@ TEST(InternalReadContext, TestReadWithRowTrackingAndScoreFields) { // test simple std::string path = paimon::test::GetDataDir() + "/orc/append_09.db/append_09"; ReadContextBuilder context_builder(path); - context_builder.SetReadSchema({"f3", "f0", "_ROW_ID", "_SEQUENCE_NUMBER", "_INDEX_SCORE"}); + context_builder.SetReadFieldNames( + {"f3", "f0", "_ROW_ID", "_SEQUENCE_NUMBER", "_INDEX_SCORE"}); ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish()); SchemaManager schema_manager(std::make_shared(), read_context->GetPath()); ASSERT_OK_AND_ASSIGN(auto table_schema, schema_manager.ReadSchema(0)); @@ -124,7 +125,7 @@ TEST(InternalReadContext, TestReadWithRowTrackingAndScoreFields) { // test invalid case: disable row tracking while read row tracking fields std::string path = paimon::test::GetDataDir() + "/orc/append_09.db/append_09"; 
ReadContextBuilder context_builder(path); - context_builder.SetReadSchema({"f3", "f0", "_ROW_ID", "_SEQUENCE_NUMBER"}); + context_builder.SetReadFieldNames({"f3", "f0", "_ROW_ID", "_SEQUENCE_NUMBER"}); ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish()); SchemaManager schema_manager(std::make_shared(), read_context->GetPath()); ASSERT_OK_AND_ASSIGN(auto table_schema, schema_manager.ReadSchema(0)); @@ -136,7 +137,7 @@ TEST(InternalReadContext, TestReadWithRowTrackingAndScoreFields) { // test invalid case: disable data evolution while read score fields std::string path = paimon::test::GetDataDir() + "/orc/append_09.db/append_09"; ReadContextBuilder context_builder(path); - context_builder.SetReadSchema({"f3", "f0", "_INDEX_SCORE"}); + context_builder.SetReadFieldNames({"f3", "f0", "_INDEX_SCORE"}); ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish()); SchemaManager schema_manager(std::make_shared(), read_context->GetPath()); ASSERT_OK_AND_ASSIGN(auto table_schema, schema_manager.ReadSchema(0)); @@ -149,7 +150,7 @@ TEST(InternalReadContext, TestReadWithRowTrackingAndScoreFields) { TEST(InternalReadContext, TestReadWithValueKindField) { std::string path = paimon::test::GetDataDir() + "/orc/append_09.db/append_09"; ReadContextBuilder context_builder(path); - context_builder.SetReadSchema({"f3", "_VALUE_KIND", "f0"}); + context_builder.SetReadFieldNames({"f3", "_VALUE_KIND", "f0"}); ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish()); SchemaManager schema_manager(std::make_shared(), read_context->GetPath()); ASSERT_OK_AND_ASSIGN(auto table_schema, schema_manager.ReadSchema(0)); @@ -191,4 +192,76 @@ TEST(InternalReadContext, TestReadWithFieldIdsAndSpecialFields) { } } +TEST(InternalReadContext, TestReadWithProjectedSchemaAndSpecialFields) { + std::string path = paimon::test::GetDataDir() + "/orc/append_09.db/append_09"; + + std::vector projected_fields = { + DataField(0, arrow::field("f0", arrow::utf8())), SpecialFields::RowId(), + 
SpecialFields::SequenceNumber(), SpecialFields::IndexScore()}; + auto schema_manager = SchemaManager(std::make_shared(), path); + ASSERT_OK_AND_ASSIGN(auto table_schema, schema_manager.ReadSchema(0)); + + // Without options, special fields should be rejected in projected-schema path too. + { + auto projected_schema = DataField::ConvertDataFieldsToArrowSchema(projected_fields); + ArrowSchema c_schema; + ASSERT_TRUE(arrow::ExportSchema(*projected_schema, &c_schema).ok()); + ReadContextBuilder context_builder(path); + context_builder.SetReadSchema(&c_schema); + ASSERT_OK_AND_ASSIGN(auto unique_read_context, context_builder.Finish()); + std::shared_ptr read_context = std::move(unique_read_context); + ASSERT_NOK_WITH_MSG( + InternalReadContext::Create(read_context, table_schema, table_schema->Options()), + "not exist in table schema"); + } + + // With options enabled, projected-schema path should accept these special fields. + auto enabled_options = table_schema->Options(); + enabled_options[Options::ROW_TRACKING_ENABLED] = "true"; + enabled_options[Options::DATA_EVOLUTION_ENABLED] = "true"; + + { + auto projected_schema = DataField::ConvertDataFieldsToArrowSchema(projected_fields); + ArrowSchema c_schema; + ASSERT_TRUE(arrow::ExportSchema(*projected_schema, &c_schema).ok()); + ReadContextBuilder context_builder(path); + context_builder.SetReadSchema(&c_schema); + ASSERT_OK_AND_ASSIGN(auto unique_read_context, context_builder.Finish()); + std::shared_ptr read_context = std::move(unique_read_context); + ASSERT_OK_AND_ASSIGN( + auto internal_context, + InternalReadContext::Create(read_context, table_schema, enabled_options)); + auto expected_schema = DataField::ConvertDataFieldsToArrowSchema(projected_fields); + ASSERT_TRUE(internal_context->GetReadSchema()->Equals(expected_schema)); + } +} + +TEST(InternalReadContext, TestReadWithProjectedSchemaWithoutFieldIds) { + std::string path = paimon::test::GetDataDir() + "/orc/append_09.db/append_09"; + + auto projected_schema = 
+ arrow::schema({arrow::field("f3", arrow::float64()), arrow::field("f0", arrow::utf8())}); + ArrowSchema c_schema; + ASSERT_TRUE(arrow::ExportSchema(*projected_schema, &c_schema).ok()); + + ReadContextBuilder context_builder(path); + context_builder.SetReadSchema(&c_schema); + ASSERT_OK_AND_ASSIGN(auto unique_read_context, context_builder.Finish()); + std::shared_ptr read_context = std::move(unique_read_context); + + SchemaManager schema_manager(std::make_shared(), read_context->GetPath()); + ASSERT_OK_AND_ASSIGN(auto table_schema, schema_manager.ReadSchema(0)); + + ASSERT_OK_AND_ASSIGN( + auto internal_context, + InternalReadContext::Create(read_context, table_schema, table_schema->Options())); + + std::vector expected_fields = { + DataField(3, arrow::field("f3", arrow::float64())), + DataField(0, arrow::field("f0", arrow::utf8())), + }; + auto expected_schema = DataField::ConvertDataFieldsToArrowSchema(expected_fields); + ASSERT_TRUE(internal_context->GetReadSchema()->Equals(expected_schema)); +} + } // namespace paimon::test diff --git a/src/paimon/core/operation/merge_file_split_read_test.cpp b/src/paimon/core/operation/merge_file_split_read_test.cpp index ec5c28f0e..72e4bb05a 100644 --- a/src/paimon/core/operation/merge_file_split_read_test.cpp +++ b/src/paimon/core/operation/merge_file_split_read_test.cpp @@ -611,7 +611,7 @@ TEST_P(MergeFileSplitReadTest, TestSimple) { auto read_schema = DataField::ConvertDataFieldsToArrowSchema(raw_read_fields); ASSERT_TRUE(read_schema); - context_builder.SetReadSchema({"k1", "p1", "s1", "v0", "v1"}); + context_builder.SetReadFieldNames({"k1", "p1", "s1", "v0", "v1"}); context_builder.SetOptions({{Options::SEQUENCE_FIELD, "s0,s1"}, {Options::MERGE_ENGINE, "deduplicate"}, {Options::IGNORE_DELETE, "true"}}); @@ -677,7 +677,7 @@ TEST_P(MergeFileSplitReadTest, TestLookUp) { auto read_schema = DataField::ConvertDataFieldsToArrowSchema(raw_read_fields); ASSERT_TRUE(read_schema); - context_builder.SetReadSchema({"k1", "p1", "s1", 
"v0", "v1"}); + context_builder.SetReadFieldNames({"k1", "p1", "s1", "v0", "v1"}); context_builder.SetOptions({{Options::SEQUENCE_FIELD, "s0,s1"}, {Options::MERGE_ENGINE, "deduplicate"}, {Options::IGNORE_DELETE, "true"}, @@ -751,7 +751,7 @@ TEST_P(MergeFileSplitReadTest, TestDeduplicateMergeEngineWithDeleteMsg) { auto read_schema = DataField::ConvertDataFieldsToArrowSchema(raw_read_fields); ASSERT_TRUE(read_schema); - context_builder.SetReadSchema({"k0", "k1", "v0", "v1", "v2"}); + context_builder.SetReadFieldNames({"k0", "k1", "v0", "v1", "v2"}); context_builder.SetOptions({{Options::MERGE_ENGINE, "deduplicate"}}); AddOptions(&context_builder); ASSERT_OK_AND_ASSIGN(std::shared_ptr read_context, context_builder.Finish()); @@ -792,7 +792,7 @@ TEST_P(MergeFileSplitReadTest, TestReadWithPredicate) { auto read_schema = DataField::ConvertDataFieldsToArrowSchema(raw_read_fields); ASSERT_TRUE(read_schema); - context_builder.SetReadSchema({"k1", "p1", "s1", "s0", "v0", "v1"}); + context_builder.SetReadFieldNames({"k1", "p1", "s1", "s0", "v0", "v1"}); context_builder.SetOptions({{Options::SEQUENCE_FIELD, "s0,s1"}, {Options::MERGE_ENGINE, "deduplicate"}, {Options::IGNORE_DELETE, "true"}}); @@ -857,7 +857,7 @@ TEST_P(MergeFileSplitReadTest, TestReadWithAlterTable) { auto read_schema = DataField::ConvertDataFieldsToArrowSchema(raw_read_fields); ASSERT_TRUE(read_schema); - context_builder.SetReadSchema({"k1", "k0", "p0", "p1", "s1", "s0", "v0", "v1", "v2"}); + context_builder.SetReadFieldNames({"k1", "k0", "p0", "p1", "s1", "s0", "v0", "v1", "v2"}); context_builder.SetOptions({{Options::SEQUENCE_FIELD, "s0,s1"}, {Options::MERGE_ENGINE, "deduplicate"}, {Options::IGNORE_DELETE, "true"}}); @@ -906,7 +906,7 @@ TEST_P(MergeFileSplitReadTest, TestReadWithAlterTableWithReverseSequence) { auto read_schema = DataField::ConvertDataFieldsToArrowSchema(raw_read_fields); ASSERT_TRUE(read_schema); - context_builder.SetReadSchema({"v2", "p1", "k0", "p0", "s0", "v0"}); + 
context_builder.SetReadFieldNames({"v2", "p1", "k0", "p0", "s0", "v0"}); context_builder.SetOptions({{Options::SEQUENCE_FIELD, "s0,s1"}, {Options::MERGE_ENGINE, "deduplicate"}, {Options::IGNORE_DELETE, "true"}}); @@ -954,7 +954,7 @@ TEST_P(MergeFileSplitReadTest, TestAggregateMergeEngine) { auto read_schema = DataField::ConvertDataFieldsToArrowSchema(raw_read_fields); ASSERT_TRUE(read_schema); - context_builder.SetReadSchema({"k1", "p1", "s1", "v0", "v1"}); + context_builder.SetReadFieldNames({"k1", "p1", "s1", "v0", "v1"}); context_builder.SetOptions({{Options::SEQUENCE_FIELD, "s0,s1"}, {Options::MERGE_ENGINE, "aggregation"}, {"fields.v1.aggregate-function", "bool_and"}, @@ -1001,7 +1001,7 @@ TEST_P(MergeFileSplitReadTest, TestPartialUpdateMergeEngine) { auto read_schema = DataField::ConvertDataFieldsToArrowSchema(raw_read_fields); ASSERT_TRUE(read_schema); - context_builder.SetReadSchema({"k1", "p1", "s1", "v0"}); + context_builder.SetReadFieldNames({"k1", "p1", "s1", "v0"}); context_builder.SetOptions({{Options::SEQUENCE_FIELD, "s0,s1"}, {Options::MERGE_ENGINE, "partial-update"}, {"fields.v1.sequence-group", "v0"}, @@ -1049,7 +1049,7 @@ TEST_P(MergeFileSplitReadTest, TestPartialUpdateMergeEngineWithIgnoreDelete) { auto read_schema = DataField::ConvertDataFieldsToArrowSchema(raw_read_fields); ASSERT_TRUE(read_schema); - context_builder.SetReadSchema({"k0", "k1", "v0", "v1", "v2"}); + context_builder.SetReadFieldNames({"k0", "k1", "v0", "v1", "v2"}); context_builder.SetOptions( {{Options::MERGE_ENGINE, "partial-update"}, {Options::IGNORE_DELETE, "true"}}); AddOptions(&context_builder); @@ -1089,7 +1089,7 @@ TEST_P(MergeFileSplitReadTest, TestPartialUpdateMergeEngineWithRemoveRecordOnDel auto read_schema = DataField::ConvertDataFieldsToArrowSchema(raw_read_fields); ASSERT_TRUE(read_schema); - context_builder.SetReadSchema({"k0", "k1", "v0", "v1", "v2"}); + context_builder.SetReadFieldNames({"k0", "k1", "v0", "v1", "v2"}); 
context_builder.SetOptions({{Options::MERGE_ENGINE, "partial-update"}, {Options::PARTIAL_UPDATE_REMOVE_RECORD_ON_DELETE, "true"}}); AddOptions(&context_builder); @@ -1129,7 +1129,7 @@ TEST_P(MergeFileSplitReadTest, TestEmptyPlan) { auto read_schema = DataField::ConvertDataFieldsToArrowSchema(raw_read_fields); ASSERT_TRUE(read_schema); - context_builder.SetReadSchema({"k0", "k1", "v0", "v1", "v2"}); + context_builder.SetReadFieldNames({"k0", "k1", "v0", "v1", "v2"}); context_builder.SetOptions({{Options::MERGE_ENGINE, "partial-update"}, {Options::PARTIAL_UPDATE_REMOVE_RECORD_ON_DELETE, "true"}}); AddOptions(&context_builder); @@ -1156,7 +1156,7 @@ TEST_P(MergeFileSplitReadTest, TestIOException) { auto read_schema = DataField::ConvertDataFieldsToArrowSchema(raw_read_fields); ASSERT_TRUE(read_schema); - context_builder.SetReadSchema({"k1", "p1", "s1", "v0", "v1"}); + context_builder.SetReadFieldNames({"k1", "p1", "s1", "v0", "v1"}); context_builder.SetOptions({{Options::SEQUENCE_FIELD, "s0,s1"}, {Options::MERGE_ENGINE, "deduplicate"}, {Options::IGNORE_DELETE, "true"}}); @@ -1210,7 +1210,7 @@ TEST_P(MergeFileSplitReadTest, Test09VersionWithoutInlineFieldId) { DataField(1, arrow::field("f1", arrow::int32()))}; auto read_schema = DataField::ConvertDataFieldsToArrowSchema(raw_read_fields); ASSERT_TRUE(read_schema); - context_builder.SetReadSchema({"f3", "f2", "f0", "f1"}); + context_builder.SetReadFieldNames({"f3", "f2", "f0", "f1"}); context_builder.SetOptions({{Options::FILE_FORMAT, "orc"}, {Options::MERGE_ENGINE, "deduplicate"}, {"orc.read.enable-metrics", "true"}}); diff --git a/src/paimon/core/operation/raw_file_split_read_test.cpp b/src/paimon/core/operation/raw_file_split_read_test.cpp index c97a007a4..96243d9f5 100644 --- a/src/paimon/core/operation/raw_file_split_read_test.cpp +++ b/src/paimon/core/operation/raw_file_split_read_test.cpp @@ -133,7 +133,7 @@ class RawFileSplitReadTest : public ::testing::Test { "/orc/multi_partition_append_table.db/" 
"multi_partition_append_table"; ReadContextBuilder context_builder(path); - context_builder.SetReadSchema(read_schema->field_names()); + context_builder.SetReadFieldNames(read_schema->field_names()); ASSERT_OK_AND_ASSIGN(std::unique_ptr read_context, context_builder.Finish()); SchemaManager schema_manager(std::make_shared(), read_context->GetPath()); ASSERT_OK_AND_ASSIGN(auto table_schema, schema_manager.ReadSchema(0)); @@ -428,7 +428,7 @@ TEST_F(RawFileSplitReadTest, TestMatch) { std::string path = paimon::test::GetDataDir() + "/orc/pk_table_with_total_buckets.db/pk_table_with_total_buckets"; ReadContextBuilder context_builder(path); - context_builder.SetReadSchema({"f0", "f1", "f2", "f3"}); + context_builder.SetReadFieldNames({"f0", "f1", "f2", "f3"}); ASSERT_OK_AND_ASSIGN(std::unique_ptr read_context, context_builder.Finish()); SchemaManager schema_manager(std::make_shared(), read_context->GetPath()); ASSERT_OK_AND_ASSIGN(auto table_schema, schema_manager.ReadSchema(0)); diff --git a/src/paimon/core/operation/read_context.cpp b/src/paimon/core/operation/read_context.cpp index 4ccefb8b0..4fe89abe4 100644 --- a/src/paimon/core/operation/read_context.cpp +++ b/src/paimon/core/operation/read_context.cpp @@ -18,6 +18,8 @@ #include +#include "arrow/c/abi.h" +#include "arrow/c/bridge.h" #include "paimon/common/utils/path_util.h" #include "paimon/core/utils/branch_manager.h" #include "paimon/executor.h" @@ -28,19 +30,20 @@ namespace paimon { class Predicate; ReadContext::ReadContext( - const std::string& path, const std::string& branch, const std::vector& read_schema, - const std::vector& read_field_ids, const std::shared_ptr& predicate, - bool enable_predicate_filter, bool enable_prefetch, uint32_t prefetch_batch_count, - uint32_t prefetch_max_parallel_num, bool enable_multi_thread_row_to_batch, - uint32_t row_to_batch_thread_number, const std::optional& table_schema, - const std::shared_ptr& memory_pool, const std::shared_ptr& executor, + const std::string& path, 
const std::string& branch, + const std::vector& read_field_names, const std::vector& read_field_ids, + const std::shared_ptr& predicate, bool enable_predicate_filter, bool enable_prefetch, + uint32_t prefetch_batch_count, uint32_t prefetch_max_parallel_num, + bool enable_multi_thread_row_to_batch, uint32_t row_to_batch_thread_number, + const std::optional& table_schema, const std::shared_ptr& memory_pool, + const std::shared_ptr& executor, const std::shared_ptr& specific_file_system, const std::map& fs_scheme_to_identifier_map, const std::map& options, PrefetchCacheMode prefetch_cache_mode, const CacheConfig& cache_config, const std::shared_ptr& cache) : path_(path), branch_(branch), - read_schema_(read_schema), + read_field_names_(read_field_names), read_field_ids_(read_field_ids), predicate_(predicate), enable_predicate_filter_(enable_predicate_filter), @@ -59,7 +62,13 @@ ReadContext::ReadContext( cache_config_(cache_config), cache_(cache) {} -ReadContext::~ReadContext() = default; +ReadContext::~ReadContext() {} + +void ReadContext::SetReadSchema(ArrowSchema* schema) { + if (schema && schema->release) { + read_schema_ = schema; + } +} class ReadContextBuilder::Impl { public: @@ -68,6 +77,7 @@ class ReadContextBuilder::Impl { branch_ = BranchManager::DEFAULT_MAIN_BRANCH; read_field_names_.clear(); read_field_ids_.clear(); + read_schema_ = nullptr; fs_scheme_to_identifier_map_.clear(); options_.clear(); predicate_.reset(); @@ -91,6 +101,7 @@ class ReadContextBuilder::Impl { std::string branch_ = BranchManager::DEFAULT_MAIN_BRANCH; std::vector read_field_names_; std::vector read_field_ids_; + ArrowSchema* read_schema_ = nullptr; std::map fs_scheme_to_identifier_map_; std::map options_; std::shared_ptr predicate_; @@ -130,7 +141,7 @@ ReadContextBuilder& ReadContextBuilder::SetOptions(const std::map& read_field_names) { impl_->read_field_names_ = read_field_names; return *this; @@ -142,6 +153,13 @@ ReadContextBuilder& ReadContextBuilder::SetReadFieldIds( return 
*this; } +ReadContextBuilder& ReadContextBuilder::SetReadSchema(ArrowSchema* read_schema) { + if (read_schema && read_schema->release) { + impl_->read_schema_ = read_schema; + } + return *this; +} + ReadContextBuilder& ReadContextBuilder::SetPredicate(const std::shared_ptr& predicate) { impl_->predicate_ = predicate; return *this; @@ -262,6 +280,9 @@ Result> ReadContextBuilder::Finish() { impl_->table_schema_, impl_->memory_pool_, impl_->executor_, impl_->specific_file_system_, impl_->fs_scheme_to_identifier_map_, impl_->options_, impl_->prefetch_cache_mode_, impl_->cache_config_, impl_->cache_); + if (impl_->read_schema_ && impl_->read_schema_->release) { + ctx->SetReadSchema(impl_->read_schema_); + } impl_->Reset(); return ctx; } diff --git a/src/paimon/core/operation/read_context_test.cpp b/src/paimon/core/operation/read_context_test.cpp index 33df00338..20e825b66 100644 --- a/src/paimon/core/operation/read_context_test.cpp +++ b/src/paimon/core/operation/read_context_test.cpp @@ -35,7 +35,7 @@ TEST(ReadContextTest, TestDefaultValue) { ASSERT_EQ(ctx->GetPath(), "table_root_path"); ASSERT_TRUE(ctx->GetMemoryPool()); ASSERT_TRUE(ctx->GetExecutor()); - ASSERT_TRUE(ctx->GetReadSchema().empty()); + ASSERT_TRUE(ctx->GetReadFieldNames().empty()); ASSERT_TRUE(ctx->GetReadFieldIds().empty()); ASSERT_TRUE(ctx->GetOptions().empty()); ASSERT_FALSE(ctx->GetPredicate()); @@ -59,7 +59,7 @@ TEST(ReadContextTest, TestSetContent) { /*hole_size_limit=*/128, /*pre_buffer_limit=*/2048); builder.AddOption("key", "value"); - builder.SetReadSchema({"f1", "f2"}); + builder.SetReadFieldNames({"f1", "f2"}); builder.SetReadFieldIds({0, 1}); auto predicate = PredicateBuilder::IsNull(/*field_index=*/0, /*field_name=*/"f1", FieldType::INT); @@ -86,7 +86,7 @@ TEST(ReadContextTest, TestSetContent) { ASSERT_EQ(ctx->GetPath(), "table_root_path"); ASSERT_TRUE(ctx->GetMemoryPool()); ASSERT_TRUE(ctx->GetExecutor()); - ASSERT_EQ(ctx->GetReadSchema(), std::vector({"f1", "f2"})); + 
ASSERT_EQ(ctx->GetReadFieldNames(), std::vector({"f1", "f2"})); ASSERT_EQ(ctx->GetReadFieldIds(), std::vector({0, 1})); ASSERT_EQ(*predicate, *(ctx->GetPredicate())); ASSERT_TRUE(ctx->EnablePredicateFilter()); diff --git a/src/paimon/core/table/source/table_read_test.cpp b/src/paimon/core/table/source/table_read_test.cpp index 762e9362c..03c3c82c1 100644 --- a/src/paimon/core/table/source/table_read_test.cpp +++ b/src/paimon/core/table/source/table_read_test.cpp @@ -41,7 +41,7 @@ TEST(TableReadTest, TestReadWithInvalidContext) { { // read with non-exist field ReadContextBuilder context_builder(path); - context_builder.SetReadSchema({"f0", "f1", "non-exist"}); + context_builder.SetReadFieldNames({"f0", "f1", "non-exist"}); ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish()); ASSERT_NOK_WITH_MSG(TableRead::Create(std::move(read_context)), "Get field non-exist failed: not exist in table schema"); @@ -72,7 +72,7 @@ TEST(TableReadTest, TestReadWithInvalidContext) { auto predicate = PredicateBuilder::Equal(/*field_index=*/2, /*field_name=*/"f3", FieldType::DOUBLE, Literal(15.0)); ReadContextBuilder context_builder(path); - context_builder.SetReadSchema({"f3", "f0", "f1"}); + context_builder.SetReadFieldNames({"f3", "f0", "f1"}); context_builder.SetPredicate(predicate); ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish()); ASSERT_NOK_WITH_MSG( @@ -92,7 +92,7 @@ TEST(TableReadTest, TestReadWithInvalidContext) { { // schema with duplicate field f3 ReadContextBuilder context_builder(path); - context_builder.SetReadSchema({"f3", "f1", "f3"}); + context_builder.SetReadFieldNames({"f3", "f1", "f3"}); ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish()); ASSERT_NOK_WITH_MSG(TableRead::Create(std::move(read_context)), "validate schema failed: read schema has duplicate field f3"); @@ -102,7 +102,7 @@ TEST(TableReadTest, TestReadWithInvalidContext) { TEST(TableReadTest, TestReadWithSpecifiedInvalidSchema) { std::string path = 
paimon::test::GetDataDir() + "/orc/append_09.db/append_09"; ReadContextBuilder context_builder(path); - context_builder.SetReadSchema({"field_no_exist"}); + context_builder.SetReadFieldNames({"field_no_exist"}); ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish()); ASSERT_NOK_WITH_MSG(TableRead::Create(std::move(read_context)), "Get field field_no_exist failed: not exist in table schema"); @@ -112,7 +112,7 @@ TEST(TableReadTest, TestCreateKeyValueTableRead) { std::string path = paimon::test::GetDataDir() + "/orc/pk_table_with_dv_cardinality.db/pk_table_with_dv_cardinality/"; ReadContextBuilder context_builder(path); - context_builder.SetReadSchema({"f0", "f1", "f2", "f3"}); + context_builder.SetReadFieldNames({"f0", "f1", "f2", "f3"}); context_builder.AddOption("read.batch-size", "2"); context_builder.AddOption("orc.read.enable-lazy-decoding", "true"); ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish()); @@ -124,7 +124,7 @@ TEST(TableReadTest, TestCreateKeyValueTableRead) { TEST(TableReadTest, TestCreateAppendOnlyTableRead) { std::string path = paimon::test::GetDataDir() + "/orc/append_09.db/append_09"; ReadContextBuilder context_builder(path); - context_builder.SetReadSchema({"f0", "f1", "f2", "f3"}); + context_builder.SetReadFieldNames({"f0", "f1", "f2", "f3"}); context_builder.AddOption("read.batch-size", "2"); context_builder.AddOption("orc.read.enable-lazy-decoding", "true"); ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish()); @@ -136,7 +136,7 @@ TEST(TableReadTest, TestCreateAppendOnlyTableRead) { TEST(TableReadTest, TestMergeOptions) { std::string path = paimon::test::GetDataDir() + "/orc/append_09.db/append_09"; ReadContextBuilder context_builder(path); - context_builder.SetReadSchema({"f0", "f1", "f2", "f3"}); + context_builder.SetReadFieldNames({"f0", "f1", "f2", "f3"}); context_builder.AddOption("read.batch-size", "2"); context_builder.AddOption("orc.read.enable-lazy-decoding", "true"); 
context_builder.AddOption("bucket", "10"); diff --git a/src/paimon/core/table/system/audit_log_system_table.cpp b/src/paimon/core/table/system/audit_log_system_table.cpp index 668436430..488ec4796 100644 --- a/src/paimon/core/table/system/audit_log_system_table.cpp +++ b/src/paimon/core/table/system/audit_log_system_table.cpp @@ -284,7 +284,7 @@ Result> AuditLogSystemTable::NewChangelogRead( PAIMON_ASSIGN_OR_RAISE(StringMap read_options, ReadOptions()); PAIMON_ASSIGN_OR_RAISE(CoreOptions core_options, CoreOptions::FromMap(read_options)); builder.SetOptions(read_options) - .SetReadSchema(base_read_schema->field_names()) + .SetReadFieldNames(base_read_schema->field_names()) .WithBranch(core_options.GetBranch()) .WithMemoryPool(context->GetMemoryPool()) .WithExecutor(context->GetExecutor()) diff --git a/src/paimon/core/utils/field_mapping.cpp b/src/paimon/core/utils/field_mapping.cpp index e24ee7277..7f1be719b 100644 --- a/src/paimon/core/utils/field_mapping.cpp +++ b/src/paimon/core/utils/field_mapping.cpp @@ -27,6 +27,7 @@ #include "paimon/common/utils/object_utils.h" #include "paimon/core/casting/cast_executor_factory.h" #include "paimon/core/casting/casting_utils.h" +#include "paimon/core/utils/nested_projection_utils.h" #include "paimon/defs.h" #include "paimon/predicate/literal.h" #include "paimon/predicate/predicate_builder.h" @@ -72,8 +73,8 @@ Result> FieldMappingBuilder::CreateFieldMapping( // generate non-exist field info std::optional non_exist_field_info = CreateNonExistFieldInfo(data_fields); - // generate exist field info - ExistFieldInfo exist_field_info = CreateExistFieldInfo(data_fields); + // generate exist field info (includes nested type pruning) + PAIMON_ASSIGN_OR_RAISE(ExistFieldInfo exist_field_info, CreateExistFieldInfo(data_fields)); // key: partition key, value: partition idx std::map partition_key_to_idx = @@ -87,7 +88,7 @@ Result> FieldMappingBuilder::CreateFieldMapping( return std::make_unique(partition_info, non_partition_info, 
non_exist_field_info); } -ExistFieldInfo FieldMappingBuilder::CreateExistFieldInfo( +Result FieldMappingBuilder::CreateExistFieldInfo( const std::vector& data_fields) const { // key:field id, value: {target_idx, read field} std::map> field_id_to_read_fields; @@ -101,8 +102,22 @@ ExistFieldInfo FieldMappingBuilder::CreateExistFieldInfo( auto iter = field_id_to_read_fields.find(data_field.Id()); if (iter != field_id_to_read_fields.end()) { const auto& [target_idx, read_field] = iter->second; + + // Recursively prune nested types in data_field to match read_field's + // projection. For atomic types this is a no-op. + PAIMON_ASSIGN_OR_RAISE( + std::optional> pruned_type, + NestedProjectionUtils::PruneDataType(read_field.Type(), data_field.Type())); + if (!pruned_type.has_value()) { + // All sub-fields pruned away — treat as non-existent. + continue; + } + + DataField pruned_data_field(data_field.Id(), + data_field.ArrowField()->WithType(pruned_type.value()), + data_field.Description()); exist_field_info.exist_read_schema.push_back(read_field); - exist_field_info.exist_data_schema.push_back(data_field); + exist_field_info.exist_data_schema.push_back(pruned_data_field); exist_field_info.idx_in_target_read_schema.push_back(target_idx); } } @@ -146,7 +161,11 @@ Result>> FieldMappingBuilder::CreateDa if (!read_fields[i].Type()->Equals(data_fields[i].Type())) { if (read_type == FieldType::MAP || read_type == FieldType::ARRAY || read_type == FieldType::STRUCT) { - return Status::Invalid("Only support column type evolution in atomic data type."); + // Nested types may differ due to nested column pruning (different + // number of sub-fields). No cast is needed — type pruning is + // handled by PruneDataType during field mapping construction. 
+ cast_executors.push_back(nullptr); + continue; } auto executor_factory = CastExecutorFactory::GetCastExecutorFactory(); auto cast_executor = diff --git a/src/paimon/core/utils/field_mapping.h b/src/paimon/core/utils/field_mapping.h index 0c0abc04b..4b1d3912f 100644 --- a/src/paimon/core/utils/field_mapping.h +++ b/src/paimon/core/utils/field_mapping.h @@ -80,7 +80,7 @@ class FieldMappingBuilder { std::optional CreateNonExistFieldInfo( const std::vector& data_fields) const; - ExistFieldInfo CreateExistFieldInfo(const std::vector& data_fields) const; + Result CreateExistFieldInfo(const std::vector& data_fields) const; Result CreateNonPartitionInfo( const std::vector& data_fields, const ExistFieldInfo& exist_field_info, diff --git a/src/paimon/core/utils/nested_projection_utils.cpp b/src/paimon/core/utils/nested_projection_utils.cpp new file mode 100644 index 000000000..f5fb51b0d --- /dev/null +++ b/src/paimon/core/utils/nested_projection_utils.cpp @@ -0,0 +1,289 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "paimon/core/utils/nested_projection_utils.h" + +#include +#include +#include +#include +#include + +#include "arrow/array/array_nested.h" +#include "arrow/array/array_primitive.h" +#include "arrow/array/builder_primitive.h" +#include "arrow/array/concatenate.h" +#include "arrow/type.h" +#include "fmt/format.h" +#include "paimon/common/utils/string_utils.h" +#include "paimon/status.h" + +namespace paimon { + +std::shared_ptr NestedProjectionUtils::FindFieldByName( + const arrow::FieldVector& fields, const std::string& name) { + for (const auto& field : fields) { + if (field->name() == name) { + return field; + } + } + return nullptr; +} + +Result NestedProjectionUtils::HasNestedSubfieldProjectionType( + const std::shared_ptr& file_type, + const std::shared_ptr& read_type) { + if (file_type->id() != read_type->id()) { + return false; + } + + switch (file_type->id()) { + case arrow::Type::STRUCT: { + auto file_struct = std::static_pointer_cast(file_type); + auto read_struct = std::static_pointer_cast(read_type); + if (read_struct->num_fields() != file_struct->num_fields()) { + return true; + } + for (const auto& read_child : read_struct->fields()) { + auto file_child = FindFieldByName(file_struct->fields(), read_child->name()); + if (!file_child) { + return true; + } + PAIMON_ASSIGN_OR_RAISE( + bool child_has_nested_projection, + HasNestedSubfieldProjectionType(file_child->type(), read_child->type())); + if (child_has_nested_projection) { + return true; + } + } + return false; + } + case arrow::Type::LIST: { + auto file_list = std::static_pointer_cast(file_type); + auto read_list = std::static_pointer_cast(read_type); + return HasNestedSubfieldProjectionType(file_list->value_type(), + read_list->value_type()); + } + case arrow::Type::MAP: { + auto file_map = std::static_pointer_cast(file_type); + auto read_map = std::static_pointer_cast(read_type); + PAIMON_ASSIGN_OR_RAISE( + bool key_has_nested_projection, + 
HasNestedSubfieldProjectionType(file_map->key_type(), read_map->key_type())); + if (key_has_nested_projection) { + return true; + } + return HasNestedSubfieldProjectionType(file_map->item_type(), read_map->item_type()); + } + default: + return false; + } +} + +Result>> NestedProjectionUtils::PruneDataType( + const std::shared_ptr& read_type, + const std::shared_ptr& data_type) { + // Identical types need no pruning. + if (read_type->Equals(data_type)) { + return std::optional>(data_type); + } + + switch (read_type->id()) { + case arrow::Type::STRUCT: { + arrow::FieldVector pruned_fields; + for (const auto& read_child : read_type->fields()) { + int32_t read_child_id = GetPaimonFieldId(read_child); + std::shared_ptr data_child = + FindFieldByPaimonId(data_type, read_child_id); + if (!data_child) { + // Schema Evolution: field not present in data, skip. + continue; + } + PAIMON_ASSIGN_OR_RAISE( + std::optional> pruned_child_type, + PruneDataType(read_child->type(), data_child->type())); + if (!pruned_child_type.has_value()) { + // All sub-fields of this child were pruned away; skip it. + continue; + } + pruned_fields.push_back(data_child->WithType(pruned_child_type.value())); + } + if (pruned_fields.empty()) { + // All fields pruned — return nullopt so the caller can skip this field. 
+ return std::optional>(std::nullopt); + } + return std::optional>(arrow::struct_(pruned_fields)); + } + + case arrow::Type::LIST: { + const auto& read_list = static_cast(*read_type); + const auto& data_list = static_cast(*data_type); + PAIMON_ASSIGN_OR_RAISE(std::optional> pruned_elem, + PruneDataType(read_list.value_type(), data_list.value_type())); + if (!pruned_elem.has_value()) { + return std::optional>(std::nullopt); + } + std::shared_ptr result_type = arrow::list(arrow::field( + data_list.value_field()->name(), pruned_elem.value(), + data_list.value_field()->nullable(), data_list.value_field()->metadata())); + return std::optional>(std::move(result_type)); + } + + case arrow::Type::MAP: { + const auto& read_map = static_cast(*read_type); + const auto& data_map = static_cast(*data_type); + PAIMON_ASSIGN_OR_RAISE(std::optional> pruned_key, + PruneDataType(read_map.key_type(), data_map.key_type())); + PAIMON_ASSIGN_OR_RAISE(std::optional> pruned_value, + PruneDataType(read_map.item_type(), data_map.item_type())); + if (!pruned_key.has_value() || !pruned_value.has_value()) { + return std::optional>(std::nullopt); + } + std::shared_ptr result_type = arrow::map( + pruned_key.value(), pruned_value.value(), data_map.key_field()->nullable()); + return std::optional>(std::move(result_type)); + } + + default: + // Atomic type: return data_type as-is (type evolution is handled + // separately by CastExecutor). 
+ return std::optional>(data_type); + } +} + +Result NestedProjectionUtils::HasNestedSubfieldProjection( + const std::shared_ptr& file_schema, + const std::shared_ptr& read_schema) { + for (const auto& read_field : read_schema->fields()) { + auto file_field = file_schema->GetFieldByName(read_field->name()); + if (!file_field) { + continue; + } + if (read_field->type()->id() == arrow::Type::STRUCT || + read_field->type()->id() == arrow::Type::LIST || + read_field->type()->id() == arrow::Type::MAP) { + PAIMON_ASSIGN_OR_RAISE( + bool has_nested_projection, + HasNestedSubfieldProjectionType(file_field->type(), read_field->type())); + if (has_nested_projection) { + return true; + } + } + } + return false; +} + +// Map selected-keys support + +Result> NestedProjectionUtils::GetMapSelectedKeys( + const std::shared_ptr& field) { + std::vector result; + if (!field || !field->HasMetadata() || !field->metadata()) { + return result; + } + auto get_result = field->metadata()->Get(DataField::MAP_SELECTED_KEYS); + if (!get_result.ok()) { + return result; + } + std::string value = get_result.ValueUnsafe(); + StringUtils::Trim(&value); + if (value.empty()) { + // Metadata is explicitly present but empty: select the empty-string key. 
+ result.push_back(""); + return result; + } + + auto tokens = StringUtils::Split(value, ",", /*ignore_empty=*/false); + std::unordered_set deduplicated; + deduplicated.reserve(tokens.size()); + for (auto& token : tokens) { + StringUtils::Trim(&token); + if (!deduplicated.insert(token).second) { + return Status::Invalid(fmt::format("Duplicate selected key '{}' in {} metadata", token, + DataField::MAP_SELECTED_KEYS)); + } + result.push_back(token); + } + return result; +} + +Result> NestedProjectionUtils::FilterMapArrayBySelectedKeys( + const std::shared_ptr& array, const std::vector& selected_keys) { + if (selected_keys.empty() || !array || array->length() == 0) { + return array; + } + + auto map_array = std::static_pointer_cast(array); + auto map_type = std::static_pointer_cast(array->type()); + + if (map_type->key_type()->id() != arrow::Type::STRING) { + return Status::Invalid( + fmt::format("FilterMapArrayBySelectedKeys only supports string keys, got {}", + map_type->key_type()->ToString())); + } + + auto keys_array = std::static_pointer_cast(map_array->keys()); + auto values_array = map_array->items(); + int64_t num_maps = map_array->length(); + + std::unordered_set deduplicated; + deduplicated.reserve(selected_keys.size()); + for (const auto& selected_key : selected_keys) { + if (!deduplicated.insert(selected_key).second) { + return Status::Invalid(fmt::format("Duplicate selected key '{}' in {} metadata", + selected_key, DataField::MAP_SELECTED_KEYS)); + } + } + + auto key_builder = std::make_shared(); + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW( + std::unique_ptr value_builder_u, + arrow::MakeBuilder(values_array->type(), arrow::default_memory_pool())); + auto value_builder = std::shared_ptr(std::move(value_builder_u)); + arrow::MapBuilder map_builder(arrow::default_memory_pool(), key_builder, value_builder); + + for (int64_t map_idx = 0; map_idx < num_maps; ++map_idx) { + if (map_array->IsNull(map_idx)) { + PAIMON_RETURN_NOT_OK_FROM_ARROW(map_builder.AppendNull()); 
+ continue; + } + PAIMON_RETURN_NOT_OK_FROM_ARROW(map_builder.Append()); + int64_t start = map_array->value_offset(map_idx); + int64_t end = map_array->value_offset(map_idx + 1); + + // Keep selected keys in the exact selected_keys order. + for (const auto& selected_key : selected_keys) { + for (int64_t entry_idx = start; entry_idx < end; ++entry_idx) { + if (keys_array->IsNull(entry_idx)) { + continue; + } + std::string_view key_view = keys_array->GetView(entry_idx); + if (key_view == selected_key) { + PAIMON_RETURN_NOT_OK_FROM_ARROW(key_builder->Append( + key_view.data(), static_cast(key_view.size()))); + PAIMON_RETURN_NOT_OK_FROM_ARROW( + value_builder->AppendArraySlice(*values_array->data(), entry_idx, 1)); + } + } + } + } + + std::shared_ptr result_map; + PAIMON_RETURN_NOT_OK_FROM_ARROW(map_builder.Finish(&result_map)); + return result_map; +} + +} // namespace paimon diff --git a/src/paimon/core/utils/nested_projection_utils.h b/src/paimon/core/utils/nested_projection_utils.h new file mode 100644 index 000000000..c99ec1a80 --- /dev/null +++ b/src/paimon/core/utils/nested_projection_utils.h @@ -0,0 +1,112 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/type.h" +#include "paimon/common/types/data_field.h" +#include "paimon/common/utils/string_utils.h" +#include "paimon/result.h" + +namespace paimon { + +/// Utility class for nested column pruning and map key selection. +class PAIMON_EXPORT NestedProjectionUtils { + public: + NestedProjectionUtils() = delete; + ~NestedProjectionUtils() = delete; + + static std::shared_ptr FindFieldByName(const arrow::FieldVector& fields, + const std::string& name); + + /// Extract the paimon field ID from an Arrow field's metadata ("paimon.id"). + /// Returns -1 if the metadata key is not present. + static int32_t GetPaimonFieldId(const std::shared_ptr& field) { + if (!field || !field->HasMetadata() || !field->metadata()) { + return -1; + } + auto result = field->metadata()->Get(DataField::FIELD_ID); + if (!result.ok()) { + return -1; + } + std::optional field_id = StringUtils::StringToValue(result.ValueUnsafe()); + return field_id.value_or(-1); + } + + /// Find a child field in a STRUCT DataType by paimon field ID. + /// Returns nullptr if no child has the given ID. + static std::shared_ptr FindFieldByPaimonId( + const std::shared_ptr& struct_type, int32_t field_id) { + if (!struct_type || struct_type->id() != arrow::Type::STRUCT) { + return nullptr; + } + for (const auto& child : struct_type->fields()) { + if (GetPaimonFieldId(child) == field_id) { + return child; + } + } + return nullptr; + } + + /// Recursively prune `data_type` so that only the sub-fields requested by + /// `read_type` are retained. Matching is done by paimon field ID to support + /// schema evolution (field renames). + /// + /// Supported nesting: STRUCT, LIST (element recurse), MAP (key/value recurse). + /// For atomic types, `data_type` is returned as-is. + /// + /// Returns std::nullopt when all sub-fields of a STRUCT are pruned away + /// (caller should skip this field entirely, mirroring Java's null return). 
+ static Result>> PruneDataType( + const std::shared_ptr& read_type, + const std::shared_ptr& data_type); + + /// Returns true if `read_schema` requests a nested sub-field projection against + /// `file_schema` (same top-level field, but nested STRUCT/LIST/MAP subtree is pruned). + static Result HasNestedSubfieldProjection( + const std::shared_ptr& file_schema, + const std::shared_ptr& read_schema); + + /// Parse the "paimon.map.selected-keys" metadata from an Arrow field. + /// Returns an empty vector if the field is null, has no metadata, or the metadata key + /// is absent. + /// The metadata value is a comma-separated string, e.g. "key1,key2". + /// Empty tokens are preserved ("" means selecting empty-string keys), and duplicate + /// selected keys are rejected as invalid. + static Result> GetMapSelectedKeys( + const std::shared_ptr& field); + + /// Filter a MapArray so that only entries whose key is in `selected_keys` are kept. + /// Only supports string-keyed maps. The output map entry order follows + /// `selected_keys` order, and duplicate selected keys are rejected. + /// Returns the original array unchanged if `selected_keys` is empty. + static Result> FilterMapArrayBySelectedKeys( + const std::shared_ptr& map_array, + const std::vector& selected_keys); + + private: + static Result HasNestedSubfieldProjectionType( + const std::shared_ptr& file_type, + const std::shared_ptr& read_type); +}; + +} // namespace paimon diff --git a/src/paimon/core/utils/nested_projection_utils_test.cpp b/src/paimon/core/utils/nested_projection_utils_test.cpp new file mode 100644 index 000000000..c8f3b097f --- /dev/null +++ b/src/paimon/core/utils/nested_projection_utils_test.cpp @@ -0,0 +1,405 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/core/utils/nested_projection_utils.h" + +#include "arrow/array/array_nested.h" +#include "arrow/array/builder_nested.h" +#include "arrow/array/builder_primitive.h" +#include "arrow/type.h" +#include "gtest/gtest.h" +#include "paimon/common/types/data_field.h" +#include "paimon/testing/utils/testharness.h" + +namespace paimon::test { + +// Helper: create an arrow::Field with paimon.id metadata +static std::shared_ptr MakeField(const std::string& name, + const std::shared_ptr& type, + int32_t paimon_id) { + DataField data_field(paimon_id, arrow::field(name, type)); + return DataField::ConvertDataFieldToArrowField(data_field); +} + +// ============== GetPaimonFieldId ============== + +TEST(NestedProjectionUtilsTest, GetPaimonFieldId_Present) { + auto field = MakeField("col", arrow::int32(), 42); + ASSERT_EQ(NestedProjectionUtils::GetPaimonFieldId(field), 42); +} + +TEST(NestedProjectionUtilsTest, GetPaimonFieldId_Missing) { + auto field = arrow::field("col", arrow::int32()); + ASSERT_EQ(NestedProjectionUtils::GetPaimonFieldId(field), -1); +} + +TEST(NestedProjectionUtilsTest, GetPaimonFieldId_Nullptr) { + ASSERT_EQ(NestedProjectionUtils::GetPaimonFieldId(nullptr), -1); +} + +// ============== FindFieldByPaimonId ============== + +TEST(NestedProjectionUtilsTest, FindFieldByPaimonId_Found) { + auto struct_type = + arrow::struct_({MakeField("x", arrow::int32(), 1), MakeField("y", arrow::utf8(), 2)}); + auto found = NestedProjectionUtils::FindFieldByPaimonId(struct_type, 2); + ASSERT_NE(found, nullptr); + ASSERT_EQ(found->name(), 
"y"); +} + +TEST(NestedProjectionUtilsTest, FindFieldByPaimonId_NotFound) { + auto struct_type = arrow::struct_({MakeField("x", arrow::int32(), 1)}); + ASSERT_EQ(NestedProjectionUtils::FindFieldByPaimonId(struct_type, 99), nullptr); +} + +TEST(NestedProjectionUtilsTest, FindFieldByPaimonId_NonStruct) { + ASSERT_EQ(NestedProjectionUtils::FindFieldByPaimonId(arrow::int32(), 1), nullptr); +} + +// ============== PruneDataType ============== + +TEST(NestedProjectionUtilsTest, PruneDataType_IdenticalTypes) { + auto type = arrow::int32(); + ASSERT_OK_AND_ASSIGN(auto result, NestedProjectionUtils::PruneDataType(type, type)); + ASSERT_TRUE(result.has_value()); + ASSERT_TRUE(result.value()->Equals(type)); +} + +TEST(NestedProjectionUtilsTest, PruneDataType_AtomicType) { + // Different atomic types: return data_type + auto read_type = arrow::int64(); + auto data_type = arrow::int32(); + ASSERT_OK_AND_ASSIGN(auto result, NestedProjectionUtils::PruneDataType(read_type, data_type)); + ASSERT_TRUE(result.has_value()); + ASSERT_TRUE(result.value()->Equals(data_type)); +} + +TEST(NestedProjectionUtilsTest, PruneDataType_StructPruneSubset) { + // data: STRUCT + // read: STRUCT + // expected: STRUCT + auto data_type = + arrow::struct_({MakeField("x", arrow::int32(), 1), MakeField("y", arrow::utf8(), 2), + MakeField("z", arrow::float64(), 3)}); + auto read_type = arrow::struct_({MakeField("x", arrow::int32(), 1)}); + + ASSERT_OK_AND_ASSIGN(auto result, NestedProjectionUtils::PruneDataType(read_type, data_type)); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result.value()->num_fields(), 1); + ASSERT_EQ(result.value()->field(0)->name(), "x"); +} + +TEST(NestedProjectionUtilsTest, PruneDataType_StructAllFieldsPruned) { + // data: STRUCT + // read: STRUCT — no match + // expected: nullopt + auto data_type = arrow::struct_({MakeField("x", arrow::int32(), 1)}); + auto read_type = arrow::struct_({MakeField("y", arrow::int32(), 99)}); + + ASSERT_OK_AND_ASSIGN(auto result, 
NestedProjectionUtils::PruneDataType(read_type, data_type)); + ASSERT_FALSE(result.has_value()); +} + +TEST(NestedProjectionUtilsTest, PruneDataType_NestedStruct) { + // data: STRUCT(id=1)> + // read: STRUCT(id=1)> + // expected: STRUCT(id=1)> + auto inner_data = + arrow::struct_({MakeField("a", arrow::int32(), 10), MakeField("b", arrow::utf8(), 11)}); + auto data_type = arrow::struct_({MakeField("inner", inner_data, 1)}); + + auto inner_read = arrow::struct_({MakeField("a", arrow::int32(), 10)}); + auto read_type = arrow::struct_({MakeField("inner", inner_read, 1)}); + + ASSERT_OK_AND_ASSIGN(auto result, NestedProjectionUtils::PruneDataType(read_type, data_type)); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result.value()->num_fields(), 1); + auto pruned_inner = result.value()->field(0)->type(); + ASSERT_EQ(pruned_inner->num_fields(), 1); + ASSERT_EQ(pruned_inner->field(0)->name(), "a"); +} + +TEST(NestedProjectionUtilsTest, PruneDataType_ListWithStructElement) { + // data: LIST> + // read: LIST> + auto inner_data = + arrow::struct_({MakeField("a", arrow::int32(), 10), MakeField("b", arrow::utf8(), 11)}); + auto data_type = arrow::list(arrow::field("item", inner_data)); + + auto inner_read = arrow::struct_({MakeField("a", arrow::int32(), 10)}); + auto read_type = arrow::list(arrow::field("item", inner_read)); + + ASSERT_OK_AND_ASSIGN(auto result, NestedProjectionUtils::PruneDataType(read_type, data_type)); + ASSERT_TRUE(result.has_value()); + auto list_type = std::dynamic_pointer_cast(result.value()); + ASSERT_NE(list_type, nullptr); + ASSERT_EQ(list_type->value_type()->num_fields(), 1); + ASSERT_EQ(list_type->value_type()->field(0)->name(), "a"); +} + +TEST(NestedProjectionUtilsTest, PruneDataType_MapWithStructValue) { + // data: MAP> + // read: MAP> + auto inner_data = + arrow::struct_({MakeField("a", arrow::int32(), 10), MakeField("b", arrow::utf8(), 11)}); + auto data_type = arrow::map(arrow::utf8(), inner_data); + + auto inner_read = 
arrow::struct_({MakeField("a", arrow::int32(), 10)}); + auto read_type = arrow::map(arrow::utf8(), inner_read); + + ASSERT_OK_AND_ASSIGN(auto result, NestedProjectionUtils::PruneDataType(read_type, data_type)); + ASSERT_TRUE(result.has_value()); + auto map_type = std::dynamic_pointer_cast(result.value()); + ASSERT_NE(map_type, nullptr); + ASSERT_TRUE(map_type->key_type()->Equals(arrow::utf8())); + ASSERT_EQ(map_type->item_type()->num_fields(), 1); + ASSERT_EQ(map_type->item_type()->field(0)->name(), "a"); +} + +TEST(NestedProjectionUtilsTest, HasNestedSubfieldProjection_NoProjection) { + auto file_schema = arrow::schema({ + MakeField("f0", arrow::int32(), 1), + MakeField("f1", arrow::struct_({MakeField("a", arrow::int32(), 2)}), 3), + }); + auto read_schema = arrow::schema({ + MakeField("f0", arrow::int32(), 1), + MakeField("f1", arrow::struct_({MakeField("a", arrow::int32(), 2)}), 3), + }); + ASSERT_OK_AND_ASSIGN( + auto has_nested_projection, + NestedProjectionUtils::HasNestedSubfieldProjection(file_schema, read_schema)); + ASSERT_FALSE(has_nested_projection); +} + +TEST(NestedProjectionUtilsTest, HasNestedSubfieldProjection_WithProjection) { + auto file_schema = arrow::schema({ + MakeField("f0", arrow::int32(), 1), + MakeField( + "f1", + arrow::struct_({MakeField("a", arrow::int32(), 2), MakeField("b", arrow::utf8(), 4)}), + 3), + }); + auto read_schema = arrow::schema({ + MakeField("f0", arrow::int32(), 1), + MakeField("f1", arrow::struct_({MakeField("a", arrow::int32(), 2)}), 3), + }); + ASSERT_OK_AND_ASSIGN( + auto has_nested_projection, + NestedProjectionUtils::HasNestedSubfieldProjection(file_schema, read_schema)); + ASSERT_TRUE(has_nested_projection); +} + +// ============== GetMapSelectedKeys ============== + +TEST(NestedProjectionUtilsTest, GetMapSelectedKeys_Present) { + auto metadata = + arrow::KeyValueMetadata::Make({DataField::MAP_SELECTED_KEYS}, {"key1,key2,key3"}); + auto field = + arrow::field("m", arrow::map(arrow::utf8(), arrow::int32()), 
/*nullable=*/true, metadata); + ASSERT_OK_AND_ASSIGN(auto keys, NestedProjectionUtils::GetMapSelectedKeys(field)); + ASSERT_EQ(keys.size(), 3); + ASSERT_EQ(keys[0], "key1"); + ASSERT_EQ(keys[1], "key2"); + ASSERT_EQ(keys[2], "key3"); +} + +TEST(NestedProjectionUtilsTest, GetMapSelectedKeys_Absent) { + auto field = arrow::field("m", arrow::map(arrow::utf8(), arrow::int32())); + ASSERT_OK_AND_ASSIGN(auto keys, NestedProjectionUtils::GetMapSelectedKeys(field)); + ASSERT_TRUE(keys.empty()); +} + +TEST(NestedProjectionUtilsTest, GetMapSelectedKeys_EmptyString) { + auto metadata = arrow::KeyValueMetadata::Make({DataField::MAP_SELECTED_KEYS}, {""}); + auto field = + arrow::field("m", arrow::map(arrow::utf8(), arrow::int32()), /*nullable=*/true, metadata); + ASSERT_OK_AND_ASSIGN(auto keys, NestedProjectionUtils::GetMapSelectedKeys(field)); + ASSERT_EQ(keys.size(), 1); + ASSERT_EQ(keys[0], ""); +} + +TEST(NestedProjectionUtilsTest, GetMapSelectedKeys_ContainsEmptyToken) { + auto metadata = arrow::KeyValueMetadata::Make({DataField::MAP_SELECTED_KEYS}, {"a, ,b"}); + auto field = + arrow::field("m", arrow::map(arrow::utf8(), arrow::int32()), /*nullable=*/true, metadata); + ASSERT_OK_AND_ASSIGN(auto keys, NestedProjectionUtils::GetMapSelectedKeys(field)); + ASSERT_EQ(keys.size(), 3); + ASSERT_EQ(keys[0], "a"); + ASSERT_EQ(keys[1], ""); + ASSERT_EQ(keys[2], "b"); +} + +TEST(NestedProjectionUtilsTest, GetMapSelectedKeys_DuplicateKey) { + auto metadata = arrow::KeyValueMetadata::Make({DataField::MAP_SELECTED_KEYS}, {"a,b,a"}); + auto field = + arrow::field("m", arrow::map(arrow::utf8(), arrow::int32()), /*nullable=*/true, metadata); + auto result = NestedProjectionUtils::GetMapSelectedKeys(field); + ASSERT_FALSE(result.ok()); +} + +TEST(NestedProjectionUtilsTest, GetMapSelectedKeys_Nullptr) { + ASSERT_OK_AND_ASSIGN(auto keys, NestedProjectionUtils::GetMapSelectedKeys(nullptr)); + ASSERT_TRUE(keys.empty()); +} + +// ============== FilterMapArrayBySelectedKeys ============== + +// 
Helper to build a MapArray from vectors of key-value pairs. +static std::shared_ptr BuildStringInt32MapArray( + const std::vector>>& maps, + const std::vector& null_mask = {}) { + auto key_builder = std::make_shared(); + auto value_builder = std::make_shared(); + arrow::MapBuilder map_builder(arrow::default_memory_pool(), key_builder, value_builder); + for (size_t i = 0; i < maps.size(); ++i) { + if (!null_mask.empty() && !null_mask[i]) { + EXPECT_TRUE(map_builder.AppendNull().ok()); + continue; + } + EXPECT_TRUE(map_builder.Append().ok()); + for (const auto& [k, v] : maps[i]) { + EXPECT_TRUE(key_builder->Append(k).ok()); + EXPECT_TRUE(value_builder->Append(v).ok()); + } + } + std::shared_ptr result; + EXPECT_TRUE(map_builder.Finish(&result).ok()); + return result; +} + +TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_Basic) { + // Map with 3 entries each, select only "a" and "c" + auto map_array = BuildStringInt32MapArray({ + {{"a", 1}, {"b", 2}, {"c", 3}}, + {{"a", 10}, {"d", 40}}, + }); + + std::vector selected = {"a", "c"}; + ASSERT_OK_AND_ASSIGN(auto filtered, + NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, selected)); + + auto result = std::static_pointer_cast(filtered); + ASSERT_EQ(result->length(), 2); + + // First map: should have "a" and "c" + ASSERT_EQ(result->value_length(0), 2); + auto keys0 = std::static_pointer_cast(result->keys()); + ASSERT_EQ(keys0->GetString(result->value_offset(0)), "a"); + ASSERT_EQ(keys0->GetString(result->value_offset(0) + 1), "c"); + + // Second map: should have only "a" + ASSERT_EQ(result->value_length(1), 1); + ASSERT_EQ(keys0->GetString(result->value_offset(1)), "a"); +} + +TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_EmptySelectedKeys) { + auto map_array = BuildStringInt32MapArray({{{"a", 1}}}); + std::vector empty_keys; + ASSERT_OK_AND_ASSIGN( + auto filtered, NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, empty_keys)); + // Should return original array 
unchanged + ASSERT_EQ(filtered.get(), map_array.get()); +} + +TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_AllKept) { + auto map_array = BuildStringInt32MapArray({{{"a", 1}, {"b", 2}}}); + std::vector selected = {"a", "b"}; + ASSERT_OK_AND_ASSIGN(auto filtered, + NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, selected)); + auto result = std::static_pointer_cast(filtered); + ASSERT_EQ(result->length(), 1); + ASSERT_EQ(result->value_length(0), 2); +} + +TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_NoneKept) { + auto map_array = BuildStringInt32MapArray({{{"a", 1}, {"b", 2}}}); + std::vector selected = {"x", "y"}; + ASSERT_OK_AND_ASSIGN(auto filtered, + NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, selected)); + auto result = std::static_pointer_cast(filtered); + ASSERT_EQ(result->length(), 1); + ASSERT_EQ(result->value_length(0), 0); +} + +TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_EmptyStringKeySelected) { + auto map_array = BuildStringInt32MapArray({{{"a", 1}, {"", 9}, {"b", 2}}}); + std::vector selected = {""}; + ASSERT_OK_AND_ASSIGN(auto filtered, + NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, selected)); + auto result = std::static_pointer_cast(filtered); + ASSERT_EQ(result->length(), 1); + ASSERT_EQ(result->value_length(0), 1); + auto keys = std::static_pointer_cast(result->keys()); + auto values = std::static_pointer_cast(result->items()); + ASSERT_EQ(keys->GetString(result->value_offset(0)), ""); + ASSERT_EQ(values->Value(result->value_offset(0)), 9); +} + +TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_WithNull) { + // maps[0] = {"a":1}, maps[1] = null, maps[2] = {"b":2,"c":3} + auto map_array = + BuildStringInt32MapArray({{{"a", 1}}, {}, {{"b", 2}, {"c", 3}}}, {true, false, true}); + + std::vector selected = {"a", "c"}; + ASSERT_OK_AND_ASSIGN(auto filtered, + NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, selected)); + auto 
result = std::static_pointer_cast(filtered); + ASSERT_EQ(result->length(), 3); + // maps[0] = {"a":1} + ASSERT_EQ(result->value_length(0), 1); + // maps[1] = null + ASSERT_TRUE(result->IsNull(1)); + // maps[2] = {"c":3} + ASSERT_EQ(result->value_length(2), 1); + auto keys = std::static_pointer_cast(result->keys()); + ASSERT_EQ(keys->GetString(result->value_offset(2)), "c"); +} + +TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_EmptyArray) { + auto map_array = BuildStringInt32MapArray({}); + std::vector selected = {"a"}; + ASSERT_OK_AND_ASSIGN(auto filtered, + NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, selected)); + ASSERT_EQ(filtered->length(), 0); +} + +TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_SelectedOrderWins) { + auto map_array = BuildStringInt32MapArray({{{"a", 1}, {"b", 2}, {"c", 3}}}); + std::vector selected = {"c", "a"}; + + ASSERT_OK_AND_ASSIGN(auto filtered, + NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, selected)); + auto result = std::static_pointer_cast(filtered); + ASSERT_EQ(result->length(), 1); + ASSERT_EQ(result->value_length(0), 2); + + auto keys = std::static_pointer_cast(result->keys()); + auto values = std::static_pointer_cast(result->items()); + ASSERT_EQ(keys->GetString(result->value_offset(0)), "c"); + ASSERT_EQ(values->Value(result->value_offset(0)), 3); + ASSERT_EQ(keys->GetString(result->value_offset(0) + 1), "a"); + ASSERT_EQ(values->Value(result->value_offset(0) + 1), 1); +} + +TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_DuplicateSelectedKeys) { + auto map_array = BuildStringInt32MapArray({{{"a", 1}, {"b", 2}}}); + std::vector selected = {"a", "a"}; + + auto result = NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, selected); + ASSERT_FALSE(result.ok()); +} + +} // namespace paimon::test diff --git a/src/paimon/format/avro/avro_file_batch_reader.cpp b/src/paimon/format/avro/avro_file_batch_reader.cpp index 13833f97c..02013aabb 100644 
--- a/src/paimon/format/avro/avro_file_batch_reader.cpp +++ b/src/paimon/format/avro/avro_file_batch_reader.cpp @@ -27,6 +27,7 @@ #include "paimon/common/utils/arrow/mem_utils.h" #include "paimon/common/utils/arrow/status_utils.h" #include "paimon/common/utils/scope_guard.h" +#include "paimon/core/utils/nested_projection_utils.h" #include "paimon/format/avro/avro_input_stream_impl.h" #include "paimon/format/avro/avro_schema_converter.h" #include "paimon/reader/batch_reader.h" @@ -148,6 +149,13 @@ Status AvroFileBatchReader::SetReadSchema(::ArrowSchema* read_schema, arrow::ImportSchema(read_schema)); PAIMON_ASSIGN_OR_RAISE(std::shared_ptr file_schema, ArrowUtils::DataTypeToSchema(file_data_type_)); + PAIMON_ASSIGN_OR_RAISE( + bool has_nested_projection, + NestedProjectionUtils::HasNestedSubfieldProjection(file_schema, arrow_read_schema)); + if (has_nested_projection) { + return Status::Invalid( + "SetReadSchema failed: avro reader does not support nested sub-field projection"); + } PAIMON_ASSIGN_OR_RAISE(read_fields_projection_, CalculateReadFieldsProjection(file_schema, arrow_read_schema->fields())); array_builder_->Reset(); diff --git a/src/paimon/format/avro/avro_file_batch_reader_test.cpp b/src/paimon/format/avro/avro_file_batch_reader_test.cpp index a8ed3bb6c..41fcca91f 100644 --- a/src/paimon/format/avro/avro_file_batch_reader_test.cpp +++ b/src/paimon/format/avro/avro_file_batch_reader_test.cpp @@ -295,6 +295,38 @@ TEST_F(AvroFileBatchReaderTest, TestReadMapTypes) { ASSERT_TRUE(expected_array->Equals(result_array)); } +TEST_F(AvroFileBatchReaderTest, TestSetReadSchemaRejectNestedSubFieldProjection) { + std::string path = PathUtil::JoinPath(dir_->Str(), "nested_projection_unsupported.avro"); + + arrow::FieldVector write_fields = { + arrow::field("f0", arrow::int32()), + arrow::field("f1", arrow::struct_({arrow::field("a", arrow::int32()), + arrow::field("b", arrow::utf8())}))}; + auto write_type = arrow::struct_(write_fields); + auto write_array = 
arrow::ipc::internal::json::ArrayFromJSON(write_type, R"([ + [1, [10, "x"]], + [2, [20, "y"]] + ])") + .ValueOrDie(); + WriteData(write_array, path, /*compression=*/"null"); + + ASSERT_OK_AND_ASSIGN(auto reader_builder, + file_format_->CreateReaderBuilder(/*batch_size=*/1024)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr in, fs_->Open(path)); + ASSERT_OK_AND_ASSIGN(auto batch_reader, reader_builder->Build(in)); + + arrow::FieldVector read_fields = { + arrow::field("f0", arrow::int32()), + arrow::field("f1", arrow::struct_({arrow::field("a", arrow::int32())}))}; + auto read_schema = arrow::schema(read_fields); + std::unique_ptr c_schema = std::make_unique(); + ASSERT_TRUE(arrow::ExportSchema(*read_schema, c_schema.get()).ok()); + + ASSERT_NOK_WITH_MSG(batch_reader->SetReadSchema(c_schema.get(), /*predicate=*/nullptr, + /*selection_bitmap=*/std::nullopt), + "does not support nested sub-field projection"); +} + TEST_F(AvroFileBatchReaderTest, TestGetPreviousBatchFirstRowNumber) { std::string path = paimon::test::GetDataDir() + "/avro/append_simple.db/" diff --git a/src/paimon/format/lance/lance_file_batch_reader.cpp b/src/paimon/format/lance/lance_file_batch_reader.cpp index 79afcacfc..1ae1deb84 100644 --- a/src/paimon/format/lance/lance_file_batch_reader.cpp +++ b/src/paimon/format/lance/lance_file_batch_reader.cpp @@ -19,6 +19,7 @@ #include "arrow/api.h" #include "paimon/common/metrics/metrics_impl.h" #include "paimon/common/utils/arrow/status_utils.h" +#include "paimon/core/utils/nested_projection_utils.h" #include "paimon/format/lance/lance_utils.h" namespace paimon::lance { LanceFileBatchReader::LanceFileBatchReader(LanceFileReader* file_reader, int32_t batch_size, @@ -66,9 +67,19 @@ Result> LanceFileBatchReader::GetFileSchema() con Status LanceFileBatchReader::SetReadSchema(::ArrowSchema* read_schema, const std::shared_ptr& predicate, const std::optional& selection_bitmap) { - PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr arrow_schema, + 
PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr arrow_read_schema, arrow::ImportSchema(read_schema)); - read_field_names_ = arrow_schema->field_names(); + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<::ArrowSchema> c_file_schema, GetFileSchema()); + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr file_schema, + arrow::ImportSchema(c_file_schema.get())); + PAIMON_ASSIGN_OR_RAISE( + bool has_nested_projection, + NestedProjectionUtils::HasNestedSubfieldProjection(file_schema, arrow_read_schema)); + if (has_nested_projection) { + return Status::Invalid( + "SetReadSchema failed: lance reader does not support nested sub-field projection"); + } + read_field_names_ = arrow_read_schema->field_names(); assert(!read_field_names_.empty()); read_row_ids_.clear(); if (selection_bitmap) { diff --git a/src/paimon/format/lance/lance_format_reader_writer_test.cpp b/src/paimon/format/lance/lance_format_reader_writer_test.cpp index 94a8aab96..b1ad6be73 100644 --- a/src/paimon/format/lance/lance_format_reader_writer_test.cpp +++ b/src/paimon/format/lance/lance_format_reader_writer_test.cpp @@ -247,6 +247,42 @@ TEST_F(LanceFileReaderWriterTest, TestNestedType) { } } +TEST_F(LanceFileReaderWriterTest, TestRejectNestedSubFieldProjection) { + arrow::FieldVector fields = { + arrow::field("f0", arrow::int32()), + arrow::field("f1", arrow::struct_({arrow::field("sub_f0", arrow::boolean()), + arrow::field("sub_f1", arrow::int64())}))}; + auto schema = arrow::schema(fields); + auto array = std::dynamic_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), R"([ + [1, [true, 2]], + [2, [false, 3]] + ])") + .ValueOrDie()); + auto src_chunk_array = std::make_shared(arrow::ArrayVector({array})); + + auto dir = paimon::test::UniqueTestDirectory::Create(); + ASSERT_TRUE(dir); + std::string file_path = dir->Str() + "/test.lance"; + WriteFile(file_path, src_chunk_array, schema); + + ASSERT_OK_AND_ASSIGN(std::unique_ptr reader, + LanceFileBatchReader::Create(file_path, 
/*batch_size=*/2, + /*batch_readahead=*/2)); + + auto projected_fields = arrow::FieldVector{ + arrow::field("f0", arrow::int32()), + arrow::field("f1", arrow::struct_({arrow::field("sub_f0", arrow::boolean())})), + }; + auto projected_schema = arrow::schema(projected_fields); + ArrowSchema c_read_schema; + ASSERT_TRUE(arrow::ExportSchema(*projected_schema, &c_read_schema).ok()); + ASSERT_NOK_WITH_MSG( + reader->SetReadSchema(&c_read_schema, /*predicate=*/nullptr, + /*selection_bitmap=*/std::nullopt), + "SetReadSchema failed: lance reader does not support nested sub-field projection"); +} + TEST_F(LanceFileReaderWriterTest, TestBulkData) { int64_t seed = DateTimeUtils::GetCurrentUTCTimeUs(); std::srand(seed); diff --git a/src/paimon/format/parquet/file_reader_wrapper.h b/src/paimon/format/parquet/file_reader_wrapper.h index 748d4052f..7ed7bba43 100644 --- a/src/paimon/format/parquet/file_reader_wrapper.h +++ b/src/paimon/format/parquet/file_reader_wrapper.h @@ -122,6 +122,14 @@ class FileReaderWrapper { /// Resets reader state so that the next Next() call will re-initialize. Status ApplyReadRanges(const std::vector>& read_ranges); + /// Set the read schema for page-filtered reading. When nested column pruning + /// is used, the leaf-column-name-based schema inference in PrepareForReading + /// cannot correctly reconstruct nested types. This setter allows the caller + /// to provide the correct pruned schema directly. + void SetReadSchemaForPageFilter(const std::shared_ptr& schema) { + external_read_schema_ = schema; + } + /// Get the page index reader for the file. /// Returns nullptr if page index is not available. std::shared_ptr<::parquet::PageIndexReader> GetPageIndexReader(); @@ -194,6 +202,10 @@ class FileReaderWrapper { // all page-filtered RGs in a session. std::shared_ptr page_filtered_read_schema_; + // Externally provided read schema for page-filtered reading. + // When set, PrepareForReading uses this instead of inferring from leaf column names. 
+ std::shared_ptr external_read_schema_; + // Track pre-buffered ranges so we can wait on destruction std::vector<::arrow::io::ReadRange> prebuffered_ranges_; }; diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.cpp b/src/paimon/format/parquet/parquet_file_batch_reader.cpp index 7533cb99a..75856e333 100644 --- a/src/paimon/format/parquet/parquet_file_batch_reader.cpp +++ b/src/paimon/format/parquet/parquet_file_batch_reader.cpp @@ -57,6 +57,30 @@ class Predicate; namespace paimon::parquet { +namespace { + +std::shared_ptr FindMatchingReadField( + const arrow::FieldVector& read_fields, const std::shared_ptr& file_field) { + for (const auto& candidate : read_fields) { + if (candidate->name() == file_field->name()) { + return candidate; + } + } + return nullptr; +} + +int32_t FindMatchingFileFieldIndex(const arrow::FieldVector& file_fields, + const std::shared_ptr& read_field) { + for (int32_t i = 0; i < static_cast(file_fields.size()); ++i) { + if (file_fields[i]->name() == read_field->name()) { + return i; + } + } + return -1; +} + +} // namespace + ParquetFileBatchReader::ParquetFileBatchReader( std::shared_ptr&& input_stream, std::unique_ptr&& reader, const std::map& options, @@ -127,24 +151,26 @@ Status ParquetFileBatchReader::SetReadSchema( PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr read_schema, arrow::ImportSchema(schema)); - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr file_schema, reader_->GetSchema()); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr raw_file_schema, + reader_->GetSchema()); + // Convert PARQUET:field_id to paimon.id so that nested column matching works. + PAIMON_ASSIGN_OR_RAISE( + std::shared_ptr file_schema, + ParquetFieldIdConverter::GetPaimonIdsFromParquetIds(raw_file_schema)); + + // Recursively match read_schema against file_schema using paimon field IDs. + // STRUCT supports sub-field projection; LIST/MAP require exact type match. 
+ PAIMON_ASSIGN_OR_RAISE(std::vector column_indices, + ComputeNestedColumnIndices(read_schema, file_schema)); + + // Build column name to index map for page-level filtering. + // We still need the full per-top-level-field leaf indices for predicate pushdown. std::unordered_map> field_index_map; - int32_t i = 0; + int32_t flat_idx = 0; for (const auto& field : file_schema->fields()) { - std::vector v; - FlattenSchema(field->type(), &i, &v); - field_index_map[field->name()] = v; - } - - std::vector column_indices; - for (const auto& field : read_schema->field_names()) { - if (field_index_map.find(field) != field_index_map.end()) { - for (int32_t index : field_index_map[field]) { - column_indices.push_back(index); - } - } else { - return Status::Invalid(fmt::format("Field {} is not found in schema.", field)); - } + std::vector leaf_indices; + FlattenSchema(field->type(), &flat_idx, &leaf_indices); + field_index_map[field->name()] = leaf_indices; } std::vector row_groups = arrow::internal::Iota(reader_->GetNumberOfRowGroups()); @@ -190,6 +216,12 @@ Status ParquetFileBatchReader::SetReadSchema( read_data_type_ = arrow::struct_(read_schema->fields()); + // Provide the read schema to FileReaderWrapper for page-filtered reading. + // This is needed because nested column pruning produces leaf column indices + // whose names don't correspond to top-level Arrow fields, so the wrapper + // cannot infer the correct schema from leaf column names alone. 
+ reader_->SetReadSchemaForPageFilter(read_schema); + metrics_->SetCounter(ParquetMetrics::READ_ROW_GROUPS_TOTAL, reader_->GetNumberOfRowGroups()); metrics_->SetCounter(ParquetMetrics::READ_ROW_GROUPS_AFTER_FILTER, row_groups.size()); @@ -428,4 +460,78 @@ Result<::parquet::ArrowReaderProperties> ParquetFileBatchReader::CreateArrowRead return arrow_reader_props; } +// Nested column index computation + +Status ParquetFileBatchReader::CollectLeafIndices(const std::shared_ptr& read_type, + const std::shared_ptr& file_type, + int32_t* leaf_index, + std::vector* indices) { + if (file_type->id() == arrow::Type::STRUCT) { + for (const auto& file_child : file_type->fields()) { + std::shared_ptr read_child = + FindMatchingReadField(read_type->fields(), file_child); + if (read_child) { + PAIMON_RETURN_NOT_OK(CollectLeafIndices(read_child->type(), file_child->type(), + leaf_index, indices)); + } else { + SkipLeafIndices(file_child->type(), leaf_index); + } + } + } else if (file_type->id() == arrow::Type::LIST || file_type->id() == arrow::Type::MAP) { + // Keep behavior aligned with ORC path: list/map inner partial projection + // is currently unsupported and should fail-fast. + if (!read_type->Equals(file_type)) { + return Status::Invalid(fmt::format( + "Parquet does not support partial projection inside list/map: src {} vs target {}", + file_type->ToString(), read_type->ToString())); + } + for (int32_t i = 0; i < file_type->num_fields(); i++) { + PAIMON_RETURN_NOT_OK(CollectLeafIndices( + read_type->field(i)->type(), file_type->field(i)->type(), leaf_index, indices)); + } + } else { + // Leaf column — collect its index. 
+ indices->push_back((*leaf_index)++); + } + return Status::OK(); +} + +void ParquetFileBatchReader::SkipLeafIndices(const std::shared_ptr& file_type, + int32_t* leaf_index) { + if (file_type->id() == arrow::Type::STRUCT || file_type->id() == arrow::Type::LIST || + file_type->id() == arrow::Type::MAP) { + for (int32_t i = 0; i < file_type->num_fields(); i++) { + SkipLeafIndices(file_type->field(i)->type(), leaf_index); + } + } else { + (*leaf_index)++; + } +} + +Result> ParquetFileBatchReader::ComputeNestedColumnIndices( + const std::shared_ptr& read_schema, + const std::shared_ptr& file_schema) { + std::vector indices; + std::vector file_field_leaf_starts; + file_field_leaf_starts.reserve(file_schema->num_fields()); + + int32_t file_leaf_index = 0; + for (const auto& file_field : file_schema->fields()) { + file_field_leaf_starts.push_back(file_leaf_index); + SkipLeafIndices(file_field->type(), &file_leaf_index); + } + + const auto& file_fields = file_schema->fields(); + for (const auto& read_field : read_schema->fields()) { + int32_t file_field_idx = FindMatchingFileFieldIndex(file_fields, read_field); + if (file_field_idx < 0) { + continue; + } + int32_t leaf_index = file_field_leaf_starts[file_field_idx]; + PAIMON_RETURN_NOT_OK(CollectLeafIndices( + read_field->type(), file_fields[file_field_idx]->type(), &leaf_index, &indices)); + } + return indices; +} + } // namespace paimon::parquet diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.h b/src/paimon/format/parquet/parquet_file_batch_reader.h index 8dc412c30..cd6252234 100644 --- a/src/paimon/format/parquet/parquet_file_batch_reader.h +++ b/src/paimon/format/parquet/parquet_file_batch_reader.h @@ -150,6 +150,25 @@ class ParquetFileBatchReader : public PrefetchFileBatchReader { } } + /// Recursively collect leaf column indices for the sub-fields in read_type + /// that match file_type by paimon field ID. Unmatched sub-fields in file_type + /// have their leaf indices skipped. 
Partial projection inside LIST/MAP is + /// not supported and will return Invalid. + static Status CollectLeafIndices(const std::shared_ptr& read_type, + const std::shared_ptr& file_type, + int32_t* leaf_index, std::vector* indices); + + /// Skip over all leaf column indices of the given file_type without collecting. + static void SkipLeafIndices(const std::shared_ptr& file_type, + int32_t* leaf_index); + + /// Compute leaf column indices by recursively matching read_schema against + /// file_schema using paimon field IDs. STRUCT supports sub-field projection + /// (unmatched sub-fields are skipped). LIST/MAP require exact type match. + static Result> ComputeNestedColumnIndices( + const std::shared_ptr& read_schema, + const std::shared_ptr& file_schema); + // precondition: predicate supposed not be empty Result> FilterRowGroupsByPredicate( const std::shared_ptr& predicate, diff --git a/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp b/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp index 73501cbfd..bad2e231b 100644 --- a/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp +++ b/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp @@ -39,6 +39,7 @@ #include "paimon/common/utils/date_time_utils.h" #include "paimon/common/utils/path_util.h" #include "paimon/defs.h" +#include "paimon/format/parquet/parquet_field_id_converter.h" #include "paimon/format/parquet/parquet_format_defs.h" #include "paimon/format/parquet/parquet_format_writer.h" #include "paimon/fs/file_system.h" @@ -281,6 +282,36 @@ TEST_F(ParquetFileBatchReaderTest, TestSetReadSchema) { ASSERT_FALSE(result_with_read_schema); } +TEST_F(ParquetFileBatchReaderTest, TestSetReadSchemaWithLegacyParquetMissingFieldIds) { + std::string file_name = paimon::test::GetDataDir() + + "/parquet/append_09.db/append_09/f1=20/bucket-0/" + "data-b446f78a-2cfb-4b3b-add8-31295d24a277-0.parquet"; + + std::vector read_fields = { + DataField(0, arrow::field("f0", arrow::utf8())), + 
DataField(2, arrow::field("f2", arrow::int32())), + DataField(3, arrow::field("f3", arrow::float64())), + }; + auto read_schema = DataField::ConvertDataFieldsToArrowSchema(read_fields); + + auto parquet_batch_reader = + PrepareParquetFileBatchReader(file_name, read_schema, /*predicate=*/nullptr, + /*selection_bitmap=*/std::nullopt, batch_size_); + + ASSERT_OK_AND_ASSIGN(auto result_array, paimon::test::ReadResultCollector::CollectResult( + parquet_batch_reader.get())); + + std::shared_ptr expected_array; + ASSERT_TRUE(arrow::ipc::internal::json::ChunkedArrayFromJSON( + arrow::struct_(read_schema->fields()), {R"([ + ["Lucy", 1, 14.1] + ])"}, + &expected_array) + .ok()); + ASSERT_TRUE(result_array->Equals(expected_array)) + << "expected: " << expected_array->ToString() << "\nactual: " << result_array->ToString(); +} + TEST_F(ParquetFileBatchReaderTest, TestNextBatchSimple) { std::string file_name = paimon::test::GetDataDir() + "parquet/parquet_append_table.db/parquet_append_table/bucket-0/" diff --git a/test/inte/CMakeLists.txt b/test/inte/CMakeLists.txt index ae8c9f749..535394809 100644 --- a/test/inte/CMakeLists.txt +++ b/test/inte/CMakeLists.txt @@ -97,4 +97,11 @@ if(PAIMON_BUILD_TESTS) test_utils_static ${GTEST_LINK_TOOLCHAIN}) + add_paimon_test(nested_column_pruning_inte_test + STATIC_LINK_LIBS + paimon_shared + ${TEST_STATIC_LINK_LIBS} + test_utils_static + ${GTEST_LINK_TOOLCHAIN}) + endif() diff --git a/test/inte/blob_table_inte_test.cpp b/test/inte/blob_table_inte_test.cpp index 4cd6ad053..27f3d0fe5 100644 --- a/test/inte/blob_table_inte_test.cpp +++ b/test/inte/blob_table_inte_test.cpp @@ -203,7 +203,7 @@ class BlobTableInteTest : public testing::Test, public ::testing::WithParamInter const std::map& options = {}) const { auto splits = plan->Splits(); ReadContextBuilder read_context_builder(table_path); - read_context_builder.SetReadSchema(read_schema).SetPredicate(predicate); + read_context_builder.SetReadFieldNames(read_schema).SetPredicate(predicate); if 
(!options.empty()) { read_context_builder.SetOptions(options); } diff --git a/test/inte/data_evolution_table_test.cpp b/test/inte/data_evolution_table_test.cpp index 095e9fa2a..4d0cbaf7e 100644 --- a/test/inte/data_evolution_table_test.cpp +++ b/test/inte/data_evolution_table_test.cpp @@ -147,7 +147,7 @@ class DataEvolutionTableTest : public ::testing::Test, // read auto splits = result_plan->Splits(); ReadContextBuilder read_context_builder(table_path); - read_context_builder.SetReadSchema(read_schema).SetPredicate(predicate); + read_context_builder.SetReadFieldNames(read_schema).SetPredicate(predicate); PAIMON_ASSIGN_OR_RAISE(std::unique_ptr read_context, read_context_builder.Finish()); PAIMON_ASSIGN_OR_RAISE(auto table_read, TableRead::Create(std::move(read_context))); diff --git a/test/inte/global_index_test.cpp b/test/inte/global_index_test.cpp index e10add7c8..d50388424 100644 --- a/test/inte/global_index_test.cpp +++ b/test/inte/global_index_test.cpp @@ -195,7 +195,9 @@ class GlobalIndexTest : public ::testing::Test, public ::testing::WithParamInter const std::shared_ptr& result_plan) const { auto splits = result_plan->Splits(); ReadContextBuilder read_context_builder(table_path); - read_context_builder.SetReadSchema(read_schema).SetPredicate(predicate).WithFileSystem(fs_); + read_context_builder.SetReadFieldNames(read_schema) + .SetPredicate(predicate) + .WithFileSystem(fs_); PAIMON_ASSIGN_OR_RAISE(std::unique_ptr read_context, read_context_builder.Finish()); PAIMON_ASSIGN_OR_RAISE(auto table_read, TableRead::Create(std::move(read_context))); diff --git a/test/inte/nested_column_pruning_inte_test.cpp b/test/inte/nested_column_pruning_inte_test.cpp new file mode 100644 index 000000000..43df9b747 --- /dev/null +++ b/test/inte/nested_column_pruning_inte_test.cpp @@ -0,0 +1,998 @@ +/* + * Copyright 2026-present Alibaba Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +#include "arrow/api.h" +#include "arrow/c/abi.h" +#include "arrow/c/bridge.h" +#include "arrow/ipc/json_simple.h" +#include "gtest/gtest.h" +#include "paimon/common/types/data_field.h" +#include "paimon/common/utils/path_util.h" +#include "paimon/common/utils/string_utils.h" +#include "paimon/defs.h" +#include "paimon/predicate/literal.h" +#include "paimon/predicate/predicate_builder.h" +#include "paimon/read_context.h" +#include "paimon/reader/batch_reader.h" +#include "paimon/result.h" +#include "paimon/scan_context.h" +#include "paimon/status.h" +#include "paimon/table/source/startup_mode.h" +#include "paimon/table/source/table_read.h" +#include "paimon/table/source/table_scan.h" +#include "paimon/testing/utils/read_result_collector.h" +#include "paimon/testing/utils/test_helper.h" +#include "paimon/testing/utils/testharness.h" + +namespace paimon { +class DataSplit; +class RecordBatch; +} // namespace paimon + +namespace paimon::test { + +class NestedColumnPruningInteTest : public ::testing::Test, + public ::testing::WithParamInterface { + void SetUp() override { + file_format_ = GetParam(); + dir_ = UniqueTestDirectory::Create("local"); + test_dir_ = dir_->Str(); + table_path_ = PathUtil::JoinPath(test_dir_, "foo.db/bar"); + } + void TearDown() override { + dir_.reset(); + } + + protected: + std::string file_format_; + std::string 
test_dir_; + std::string table_path_; + std::unique_ptr dir_; +}; + +// Test: Table has struct field with 3 sub-fields, read only 1 sub-field via SetReadSchema. +TEST_P(NestedColumnPruningInteTest, PruneStructSubFields) { + // Table schema: f0 (int32), f1 (struct{a: int32, b: utf8, c: float64}) + auto struct_type = arrow::struct_({ + arrow::field("a", arrow::int32()), + arrow::field("b", arrow::utf8()), + arrow::field("c", arrow::float64()), + }); + arrow::FieldVector table_fields = { + arrow::field("f0", arrow::int32()), + arrow::field("f1", struct_type), + }; + auto table_schema = arrow::schema(table_fields); + + std::map options = { + {Options::MANIFEST_FORMAT, "AVRO"}, + {Options::FILE_FORMAT, StringUtils::ToUpperCase(file_format_)}, + {Options::TARGET_FILE_SIZE, "1024"}, + {Options::BUCKET, "-1"}, + }; + + ASSERT_OK_AND_ASSIGN( + auto helper, TestHelper::Create(test_dir_, table_schema, /*partition_keys=*/{}, + /*primary_keys=*/{}, options, /*is_streaming_mode=*/false)); + + // Write data + std::string data = R"([ + [1, [10, "hello", 1.1]], + [2, [20, "world", 2.2]], + [3, [30, "foo", 3.3]], + [4, [40, "bar", 4.4]] + ])"; + ASSERT_OK_AND_ASSIGN(auto batch, + TestHelper::MakeRecordBatch(arrow::struct_(table_fields), data, + /*partition_map=*/{}, /*bucket=*/0, {})); + int64_t commit_identifier = 0; + ASSERT_OK_AND_ASSIGN(auto commit_msgs, + helper->WriteAndCommit(std::move(batch), commit_identifier++, + /*expected_commit_messages=*/std::nullopt)); + + // Scan to get splits + ASSERT_OK_AND_ASSIGN(auto data_splits, + helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt)); + ASSERT_FALSE(data_splits.empty()); + + // Build projected schema: only read f0 (full) and f1.a (sub-field of struct) + auto pruned_struct_type = arrow::struct_({ + arrow::field("a", arrow::int32()), + }); + arrow::FieldVector projected_fields = { + arrow::field("f0", arrow::int32()), + arrow::field("f1", pruned_struct_type), + }; + auto projected_schema = 
arrow::schema(projected_fields); + + // Export to C ArrowSchema + ArrowSchema c_schema; + ASSERT_TRUE(arrow::ExportSchema(*projected_schema, &c_schema).ok()); + + // Read with projected schema + ReadContextBuilder read_context_builder(table_path_); + read_context_builder.SetOptions(options).SetReadSchema(&c_schema); + ASSERT_OK_AND_ASSIGN(auto read_context, read_context_builder.Finish()); + ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context))); + ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(data_splits)); + ASSERT_OK_AND_ASSIGN(auto read_result, ReadResultCollector::CollectResult(batch_reader.get())); + + // Expected: struct with _VALUE_KIND, f0, f1{a} + arrow::FieldVector expected_fields = { + arrow::field("_VALUE_KIND", arrow::int8()), + arrow::field("f0", arrow::int32()), + arrow::field("f1", arrow::struct_({arrow::field("a", arrow::int32())})), + }; + auto expected_type = arrow::struct_(expected_fields); + std::string expected_data = R"([ + [0, 1, [10]], + [0, 2, [20]], + [0, 3, [30]], + [0, 4, [40]] + ])"; + auto expected_array = + arrow::ipc::internal::json::ArrayFromJSON(expected_type, expected_data).ValueOrDie(); + auto expected_chunked = std::make_shared(expected_array); + + arrow::EqualOptions equal_options = arrow::EqualOptions::Defaults(); + bool is_equal = expected_chunked->Equals(read_result, equal_options.diff_sink(&std::cout)); + if (!is_equal) { + std::cout << "[expected_type] " << expected_chunked->type()->ToString() << std::endl; + std::cout << "[actual_type] " << read_result->type()->ToString() << std::endl; + std::cout << "[expected] " << expected_chunked->ToString() << std::endl; + std::cout << "[actual] " << read_result->ToString() << std::endl; + } + ASSERT_TRUE(is_equal); +} + +// Test: Read only top-level fields, skip struct entirely. 
+TEST_P(NestedColumnPruningInteTest, PruneEntireStructField) { + auto struct_type = arrow::struct_({ + arrow::field("x", arrow::int64()), + arrow::field("y", arrow::utf8()), + }); + arrow::FieldVector table_fields = { + arrow::field("f0", arrow::int32()), + arrow::field("f1", struct_type), + arrow::field("f2", arrow::float64()), + }; + auto table_schema = arrow::schema(table_fields); + + std::map options = { + {Options::MANIFEST_FORMAT, "AVRO"}, + {Options::FILE_FORMAT, StringUtils::ToUpperCase(file_format_)}, + {Options::TARGET_FILE_SIZE, "1024"}, + {Options::BUCKET, "-1"}, + }; + + ASSERT_OK_AND_ASSIGN( + auto helper, TestHelper::Create(test_dir_, table_schema, /*partition_keys=*/{}, + /*primary_keys=*/{}, options, /*is_streaming_mode=*/false)); + + std::string data = R"([ + [100, [1, "aa"], 0.1], + [200, [2, "bb"], 0.2], + [300, [3, "cc"], 0.3] + ])"; + ASSERT_OK_AND_ASSIGN(auto batch, + TestHelper::MakeRecordBatch(arrow::struct_(table_fields), data, + /*partition_map=*/{}, /*bucket=*/0, {})); + int64_t commit_identifier = 0; + ASSERT_OK_AND_ASSIGN(auto commit_msgs, + helper->WriteAndCommit(std::move(batch), commit_identifier++, + /*expected_commit_messages=*/std::nullopt)); + + ASSERT_OK_AND_ASSIGN(auto data_splits, + helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt)); + + // Only read f0 and f2, skip f1 entirely. 
+ arrow::FieldVector projected_fields = { + arrow::field("f0", arrow::int32()), + arrow::field("f2", arrow::float64()), + }; + auto projected_schema = arrow::schema(projected_fields); + + ArrowSchema c_schema; + ASSERT_TRUE(arrow::ExportSchema(*projected_schema, &c_schema).ok()); + + ReadContextBuilder read_context_builder(table_path_); + read_context_builder.SetOptions(options).SetReadSchema(&c_schema); + ASSERT_OK_AND_ASSIGN(auto read_context, read_context_builder.Finish()); + ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context))); + ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(data_splits)); + ASSERT_OK_AND_ASSIGN(auto read_result, ReadResultCollector::CollectResult(batch_reader.get())); + + arrow::FieldVector expected_fields = { + arrow::field("_VALUE_KIND", arrow::int8()), + arrow::field("f0", arrow::int32()), + arrow::field("f2", arrow::float64()), + }; + auto expected_type = arrow::struct_(expected_fields); + std::string expected_data = R"([ + [0, 100, 0.1], + [0, 200, 0.2], + [0, 300, 0.3] + ])"; + auto expected_array = + arrow::ipc::internal::json::ArrayFromJSON(expected_type, expected_data).ValueOrDie(); + auto expected_chunked = std::make_shared(expected_array); + + arrow::EqualOptions equal_options = arrow::EqualOptions::Defaults(); + bool is_equal = expected_chunked->Equals(read_result, equal_options.diff_sink(&std::cout)); + if (!is_equal) { + std::cout << "[expected_type] " << expected_chunked->type()->ToString() << std::endl; + std::cout << "[actual_type] " << read_result->type()->ToString() << std::endl; + std::cout << "[expected] " << expected_chunked->ToString() << std::endl; + std::cout << "[actual] " << read_result->ToString() << std::endl; + } + ASSERT_TRUE(is_equal); +} + +// Test: Nested struct — prune sub-fields of a struct inside another struct. 
+TEST_P(NestedColumnPruningInteTest, PruneDeepNestedStruct) { + // Table schema: f0 (int32), f1 (struct{a: int32, inner: struct{x: int64, y: utf8}}) + auto inner_struct = arrow::struct_({ + arrow::field("x", arrow::int64()), + arrow::field("y", arrow::utf8()), + }); + auto outer_struct = arrow::struct_({ + arrow::field("a", arrow::int32()), + arrow::field("inner", inner_struct), + }); + arrow::FieldVector table_fields = { + arrow::field("f0", arrow::int32()), + arrow::field("f1", outer_struct), + }; + auto table_schema = arrow::schema(table_fields); + + std::map options = { + {Options::MANIFEST_FORMAT, "AVRO"}, + {Options::FILE_FORMAT, StringUtils::ToUpperCase(file_format_)}, + {Options::TARGET_FILE_SIZE, "1024"}, + {Options::BUCKET, "-1"}, + }; + + ASSERT_OK_AND_ASSIGN( + auto helper, TestHelper::Create(test_dir_, table_schema, /*partition_keys=*/{}, + /*primary_keys=*/{}, options, /*is_streaming_mode=*/false)); + + std::string data = R"([ + [1, [10, [100, "aaa"]]], + [2, [20, [200, "bbb"]]], + [3, [30, [300, "ccc"]]] + ])"; + ASSERT_OK_AND_ASSIGN(auto batch, + TestHelper::MakeRecordBatch(arrow::struct_(table_fields), data, + /*partition_map=*/{}, /*bucket=*/0, {})); + int64_t commit_identifier = 0; + ASSERT_OK_AND_ASSIGN(auto commit_msgs, + helper->WriteAndCommit(std::move(batch), commit_identifier++, + /*expected_commit_messages=*/std::nullopt)); + + ASSERT_OK_AND_ASSIGN(auto data_splits, + helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt)); + + // Projected: f0, f1{inner{x}} — skip f1.a and f1.inner.y + auto pruned_inner = arrow::struct_({ + arrow::field("x", arrow::int64()), + }); + auto pruned_outer = arrow::struct_({ + arrow::field("inner", pruned_inner), + }); + arrow::FieldVector projected_fields = { + arrow::field("f0", arrow::int32()), + arrow::field("f1", pruned_outer), + }; + auto projected_schema = arrow::schema(projected_fields); + + ArrowSchema c_schema; + ASSERT_TRUE(arrow::ExportSchema(*projected_schema, &c_schema).ok()); + 
+ ReadContextBuilder read_context_builder(table_path_); + read_context_builder.SetOptions(options).SetReadSchema(&c_schema); + ASSERT_OK_AND_ASSIGN(auto read_context, read_context_builder.Finish()); + ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context))); + ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(data_splits)); + ASSERT_OK_AND_ASSIGN(auto read_result, ReadResultCollector::CollectResult(batch_reader.get())); + + arrow::FieldVector expected_fields = { + arrow::field("_VALUE_KIND", arrow::int8()), + arrow::field("f0", arrow::int32()), + arrow::field("f1", arrow::struct_({ + arrow::field("inner", arrow::struct_({ + arrow::field("x", arrow::int64()), + })), + })), + }; + auto expected_type = arrow::struct_(expected_fields); + std::string expected_data = R"([ + [0, 1, [[100]]], + [0, 2, [[200]]], + [0, 3, [[300]]] + ])"; + auto expected_array = + arrow::ipc::internal::json::ArrayFromJSON(expected_type, expected_data).ValueOrDie(); + auto expected_chunked = std::make_shared(expected_array); + + arrow::EqualOptions equal_options = arrow::EqualOptions::Defaults(); + bool is_equal = expected_chunked->Equals(read_result, equal_options.diff_sink(&std::cout)); + if (!is_equal) { + std::cout << "[expected_type] " << expected_chunked->type()->ToString() << std::endl; + std::cout << "[actual_type] " << read_result->type()->ToString() << std::endl; + std::cout << "[expected] " << expected_chunked->ToString() << std::endl; + std::cout << "[actual] " << read_result->ToString() << std::endl; + } + ASSERT_TRUE(is_equal); +} + +// Test: Nested projected schema with special fields under row tracking. 
+TEST_P(NestedColumnPruningInteTest, PruneNestedStructWithSpecialFields) { + // Table schema: f0 (int32), f1 (struct{a: int32, inner: struct{x: int64, y: utf8}}) + auto inner_struct = arrow::struct_({ + arrow::field("x", arrow::int64()), + arrow::field("y", arrow::utf8()), + }); + auto outer_struct = arrow::struct_({ + arrow::field("a", arrow::int32()), + arrow::field("inner", inner_struct), + }); + arrow::FieldVector table_fields = { + arrow::field("f0", arrow::int32()), + arrow::field("f1", outer_struct), + }; + auto table_schema = arrow::schema(table_fields); + + std::map options = { + {Options::MANIFEST_FORMAT, "AVRO"}, + {Options::FILE_FORMAT, StringUtils::ToUpperCase(file_format_)}, + {Options::TARGET_FILE_SIZE, "1024"}, + {Options::BUCKET, "-1"}, + {Options::ROW_TRACKING_ENABLED, "true"}, + }; + + ASSERT_OK_AND_ASSIGN( + auto helper, TestHelper::Create(test_dir_, table_schema, /*partition_keys=*/{}, + /*primary_keys=*/{}, options, /*is_streaming_mode=*/false)); + + std::string data = R"([ + [1, [10, [100, "aaa"]]], + [2, [20, [200, "bbb"]]], + [3, [30, [300, "ccc"]]] + ])"; + ASSERT_OK_AND_ASSIGN(auto batch, + TestHelper::MakeRecordBatch(arrow::struct_(table_fields), data, + /*partition_map=*/{}, /*bucket=*/0, {})); + int64_t commit_identifier = 0; + ASSERT_OK_AND_ASSIGN(auto commit_msgs, + helper->WriteAndCommit(std::move(batch), commit_identifier++, + /*expected_commit_messages=*/std::nullopt)); + + ASSERT_OK_AND_ASSIGN(auto data_splits, + helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt)); + + // Projected: f0, f1{inner{x}}, _SEQUENCE_NUMBER, _ROW_ID + auto pruned_inner = arrow::struct_({ + arrow::field("x", arrow::int64()), + }); + auto pruned_outer = arrow::struct_({ + arrow::field("inner", pruned_inner), + }); + arrow::FieldVector projected_fields = { + arrow::field("f0", arrow::int32()), + arrow::field("f1", pruned_outer), + arrow::field("_SEQUENCE_NUMBER", arrow::int64()), + arrow::field("_ROW_ID", arrow::int64()), + }; + auto 
projected_schema = arrow::schema(projected_fields); + + ArrowSchema c_schema; + ASSERT_TRUE(arrow::ExportSchema(*projected_schema, &c_schema).ok()); + + ReadContextBuilder read_context_builder(table_path_); + read_context_builder.SetOptions(options).SetReadSchema(&c_schema); + ASSERT_OK_AND_ASSIGN(auto read_context, read_context_builder.Finish()); + ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context))); + ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(data_splits)); + ASSERT_OK_AND_ASSIGN(auto read_result, ReadResultCollector::CollectResult(batch_reader.get())); + + ASSERT_EQ(read_result->num_chunks(), 1); + auto result_array = std::dynamic_pointer_cast(read_result->chunk(0)); + ASSERT_TRUE(result_array); + + ASSERT_TRUE(result_array->GetFieldByName("_SEQUENCE_NUMBER")); + ASSERT_TRUE(result_array->GetFieldByName("_ROW_ID")); + auto nested_col = result_array->GetFieldByName("f1"); + ASSERT_TRUE(nested_col); + + auto expected_nested_type = arrow::struct_({ + arrow::field("inner", arrow::struct_({arrow::field("x", arrow::int64())})), + }); + ASSERT_TRUE(nested_col->type()->Equals(expected_nested_type)); + + auto expected_nested_array = + arrow::ipc::internal::json::ArrayFromJSON(expected_nested_type, R"([ + [[100]], + [[200]], + [[300]] + ])") + .ValueOrDie(); + ASSERT_TRUE(nested_col->Equals(expected_nested_array)); +} + +// Test: Table has MAP field, read with selected keys filter. 
+TEST_P(NestedColumnPruningInteTest, MapSelectedKeys) { + // Table schema: f0 (int32), f1 (map) + auto map_type = arrow::map(arrow::utf8(), arrow::int32()); + arrow::FieldVector table_fields = { + arrow::field("f0", arrow::int32()), + arrow::field("f1", map_type), + }; + auto table_schema = arrow::schema(table_fields); + + std::map options = { + {Options::MANIFEST_FORMAT, "AVRO"}, + {Options::FILE_FORMAT, StringUtils::ToUpperCase(file_format_)}, + {Options::TARGET_FILE_SIZE, "1024"}, + {Options::BUCKET, "-1"}, + }; + + ASSERT_OK_AND_ASSIGN( + auto helper, TestHelper::Create(test_dir_, table_schema, /*partition_keys=*/{}, + /*primary_keys=*/{}, options, /*is_streaming_mode=*/false)); + + // Write data: each row has a map with keys "a", "b", "c" + std::string data = R"([ + [1, [["a", 10], ["b", 20], ["c", 30]]], + [2, [["a", 100], ["c", 300]]], + [3, [["b", 200], ["c", 400], ["d", 500]]] + ])"; + ASSERT_OK_AND_ASSIGN(auto batch, + TestHelper::MakeRecordBatch(arrow::struct_(table_fields), data, + /*partition_map=*/{}, /*bucket=*/0, {})); + int64_t commit_identifier = 0; + ASSERT_OK_AND_ASSIGN(auto commit_msgs, + helper->WriteAndCommit(std::move(batch), commit_identifier++, + /*expected_commit_messages=*/std::nullopt)); + + // Scan to get splits + ASSERT_OK_AND_ASSIGN(auto data_splits, + helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt)); + ASSERT_FALSE(data_splits.empty()); + + // Build projected schema: read f0 and f1 with selected keys "a,c" + auto selected_keys_metadata = + arrow::KeyValueMetadata::Make({DataField::MAP_SELECTED_KEYS}, {"a,c"}); + arrow::FieldVector projected_fields = { + arrow::field("f0", arrow::int32()), + arrow::field("f1", map_type)->WithMetadata(selected_keys_metadata), + }; + auto projected_schema = arrow::schema(projected_fields); + + // Export to C ArrowSchema + ArrowSchema c_schema; + ASSERT_TRUE(arrow::ExportSchema(*projected_schema, &c_schema).ok()); + + // Read with projected schema + ReadContextBuilder 
read_context_builder(table_path_); + read_context_builder.SetOptions(options).SetReadSchema(&c_schema); + ASSERT_OK_AND_ASSIGN(auto read_context, read_context_builder.Finish()); + ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context))); + ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(data_splits)); + ASSERT_OK_AND_ASSIGN(auto read_result, ReadResultCollector::CollectResult(batch_reader.get())); + + // Expected: only keys "a" and "c" remain in each map + arrow::FieldVector expected_fields = { + arrow::field("_VALUE_KIND", arrow::int8()), + arrow::field("f0", arrow::int32()), + arrow::field("f1", arrow::map(arrow::utf8(), arrow::int32())), + }; + auto expected_type = arrow::struct_(expected_fields); + std::string expected_data = R"([ + [0, 1, [["a", 10], ["c", 30]]], + [0, 2, [["a", 100], ["c", 300]]], + [0, 3, [["c", 400]]] + ])"; + auto expected_array = + arrow::ipc::internal::json::ArrayFromJSON(expected_type, expected_data).ValueOrDie(); + auto expected_chunked = std::make_shared(expected_array); + + arrow::EqualOptions equal_options = arrow::EqualOptions::Defaults(); + bool is_equal = expected_chunked->Equals(read_result, equal_options.diff_sink(&std::cout)); + if (!is_equal) { + std::cout << "[expected_type] " << expected_chunked->type()->ToString() << std::endl; + std::cout << "[actual_type] " << read_result->type()->ToString() << std::endl; + std::cout << "[expected] " << expected_chunked->ToString() << std::endl; + std::cout << "[actual] " << read_result->ToString() << std::endl; + } + ASSERT_TRUE(is_equal); +} + +// Test: MAP_SELECTED_KEYS metadata value is empty string, select empty-string map key. 
+TEST_P(NestedColumnPruningInteTest, MapSelectedKeysEmptyStringKey) { + // Table schema: f0 (int32), f1 (map) + auto map_type = arrow::map(arrow::utf8(), arrow::int32()); + arrow::FieldVector table_fields = { + arrow::field("f0", arrow::int32()), + arrow::field("f1", map_type), + }; + auto table_schema = arrow::schema(table_fields); + + std::map options = { + {Options::MANIFEST_FORMAT, "AVRO"}, + {Options::FILE_FORMAT, StringUtils::ToUpperCase(file_format_)}, + {Options::TARGET_FILE_SIZE, "1024"}, + {Options::BUCKET, "-1"}, + }; + + ASSERT_OK_AND_ASSIGN( + auto helper, TestHelper::Create(test_dir_, table_schema, /*partition_keys=*/{}, + /*primary_keys=*/{}, options, /*is_streaming_mode=*/false)); + + // Write data: each row has a map that may contain empty-string key. + std::string data = R"([ + [1, [["", 9], ["a", 10], ["c", 30]]], + [2, [["a", 100], ["", 99], ["c", 300]]], + [3, [["b", 200], ["c", 400], ["d", 500]]] + ])"; + ASSERT_OK_AND_ASSIGN(auto batch, + TestHelper::MakeRecordBatch(arrow::struct_(table_fields), data, + /*partition_map=*/{}, /*bucket=*/0, {})); + int64_t commit_identifier = 0; + ASSERT_OK_AND_ASSIGN(auto commit_msgs, + helper->WriteAndCommit(std::move(batch), commit_identifier++, + /*expected_commit_messages=*/std::nullopt)); + + // Scan to get splits + ASSERT_OK_AND_ASSIGN(auto data_splits, + helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt)); + ASSERT_FALSE(data_splits.empty()); + + // Build projected schema: read f0 and f1 with selected keys metadata set to empty string. 
+ auto selected_keys_metadata = + arrow::KeyValueMetadata::Make({DataField::MAP_SELECTED_KEYS}, {""}); + arrow::FieldVector projected_fields = { + arrow::field("f0", arrow::int32()), + arrow::field("f1", map_type)->WithMetadata(selected_keys_metadata), + }; + auto projected_schema = arrow::schema(projected_fields); + + ArrowSchema c_schema; + ASSERT_TRUE(arrow::ExportSchema(*projected_schema, &c_schema).ok()); + + // Read with projected schema + ReadContextBuilder read_context_builder(table_path_); + read_context_builder.SetOptions(options).SetReadSchema(&c_schema); + ASSERT_OK_AND_ASSIGN(auto read_context, read_context_builder.Finish()); + ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context))); + ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(data_splits)); + ASSERT_OK_AND_ASSIGN(auto read_result, ReadResultCollector::CollectResult(batch_reader.get())); + + // Expected: only empty-string key remains. + arrow::FieldVector expected_fields = { + arrow::field("_VALUE_KIND", arrow::int8()), + arrow::field("f0", arrow::int32()), + arrow::field("f1", arrow::map(arrow::utf8(), arrow::int32())), + }; + auto expected_type = arrow::struct_(expected_fields); + std::string expected_data = R"([ + [0, 1, [["", 9]]], + [0, 2, [["", 99]]], + [0, 3, []] + ])"; + auto expected_array = + arrow::ipc::internal::json::ArrayFromJSON(expected_type, expected_data).ValueOrDie(); + auto expected_chunked = std::make_shared(expected_array); + + arrow::EqualOptions equal_options = arrow::EqualOptions::Defaults(); + bool is_equal = expected_chunked->Equals(read_result, equal_options.diff_sink(&std::cout)); + if (!is_equal) { + std::cout << "[expected_type] " << expected_chunked->type()->ToString() << std::endl; + std::cout << "[actual_type] " << read_result->type()->ToString() << std::endl; + std::cout << "[expected] " << expected_chunked->ToString() << std::endl; + std::cout << "[actual] " << read_result->ToString() << std::endl; + } + 
ASSERT_TRUE(is_equal); +} + +// Test: MAP_SELECTED_KEYS output map entry order should follow selected key order. +TEST_P(NestedColumnPruningInteTest, MapSelectedKeysPreserveOrder) { + auto map_type = arrow::map(arrow::utf8(), arrow::int32()); + arrow::FieldVector table_fields = { + arrow::field("f0", arrow::int32()), + arrow::field("f1", map_type), + }; + auto table_schema = arrow::schema(table_fields); + + std::map options = { + {Options::MANIFEST_FORMAT, "AVRO"}, + {Options::FILE_FORMAT, StringUtils::ToUpperCase(file_format_)}, + {Options::TARGET_FILE_SIZE, "1024"}, + {Options::BUCKET, "-1"}, + }; + + ASSERT_OK_AND_ASSIGN( + auto helper, TestHelper::Create(test_dir_, table_schema, /*partition_keys=*/{}, + /*primary_keys=*/{}, options, /*is_streaming_mode=*/false)); + + // Write data with map key order different from selected key order. + std::string data = R"([ + [1, [["a", 10], ["b", 20], ["c", 30]]], + [2, [["a", 100], ["c", 300]]], + [3, [["c", 400], ["a", 500], ["d", 600]]] + ])"; + ASSERT_OK_AND_ASSIGN(auto batch, + TestHelper::MakeRecordBatch(arrow::struct_(table_fields), data, + /*partition_map=*/{}, /*bucket=*/0, {})); + int64_t commit_identifier = 0; + ASSERT_OK_AND_ASSIGN(auto commit_msgs, + helper->WriteAndCommit(std::move(batch), commit_identifier++, + /*expected_commit_messages=*/std::nullopt)); + + ASSERT_OK_AND_ASSIGN(auto data_splits, + helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt)); + ASSERT_FALSE(data_splits.empty()); + + // Query key order is c,a and output should follow this order. 
+ auto selected_keys_metadata = + arrow::KeyValueMetadata::Make({DataField::MAP_SELECTED_KEYS}, {"c,a"}); + arrow::FieldVector projected_fields = { + arrow::field("f0", arrow::int32()), + arrow::field("f1", map_type)->WithMetadata(selected_keys_metadata), + }; + auto projected_schema = arrow::schema(projected_fields); + + ArrowSchema c_schema; + ASSERT_TRUE(arrow::ExportSchema(*projected_schema, &c_schema).ok()); + + ReadContextBuilder read_context_builder(table_path_); + read_context_builder.SetOptions(options).SetReadSchema(&c_schema); + ASSERT_OK_AND_ASSIGN(auto read_context, read_context_builder.Finish()); + ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context))); + ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(data_splits)); + ASSERT_OK_AND_ASSIGN(auto read_result, ReadResultCollector::CollectResult(batch_reader.get())); + + arrow::FieldVector expected_fields = { + arrow::field("_VALUE_KIND", arrow::int8()), + arrow::field("f0", arrow::int32()), + arrow::field("f1", arrow::map(arrow::utf8(), arrow::int32())), + }; + auto expected_type = arrow::struct_(expected_fields); + std::string expected_data = R"([ + [0, 1, [["c", 30], ["a", 10]]], + [0, 2, [["c", 300], ["a", 100]]], + [0, 3, [["c", 400], ["a", 500]]] + ])"; + auto expected_array = + arrow::ipc::internal::json::ArrayFromJSON(expected_type, expected_data).ValueOrDie(); + auto expected_chunked = std::make_shared(expected_array); + + arrow::EqualOptions equal_options = arrow::EqualOptions::Defaults(); + bool is_equal = expected_chunked->Equals(read_result, equal_options.diff_sink(&std::cout)); + if (!is_equal) { + std::cout << "[expected_type] " << expected_chunked->type()->ToString() << std::endl; + std::cout << "[actual_type] " << read_result->type()->ToString() << std::endl; + std::cout << "[expected] " << expected_chunked->ToString() << std::endl; + std::cout << "[actual] " << read_result->ToString() << std::endl; + } + ASSERT_TRUE(is_equal); +} + +// Test: 
Deeper nested struct — prune sub-fields of a struct inside a struct inside another struct. +TEST_P(NestedColumnPruningInteTest, PruneDeeperNestedStruct) { + // Table schema: f0 (int32), f1 (struct{a: int32, inner1: struct{x: int64, inner2: struct{p: + // utf8, q: float64}}}) + auto inner2_struct = arrow::struct_({ + arrow::field("p", arrow::utf8()), + arrow::field("q", arrow::float64()), + }); + auto inner1_struct = arrow::struct_({ + arrow::field("x", arrow::int64()), + arrow::field("inner2", inner2_struct), + }); + auto outer_struct = arrow::struct_({ + arrow::field("a", arrow::int32()), + arrow::field("inner1", inner1_struct), + }); + arrow::FieldVector table_fields = { + arrow::field("f0", arrow::int32()), + arrow::field("f1", outer_struct), + }; + auto table_schema = arrow::schema(table_fields); + + std::map options = { + {Options::MANIFEST_FORMAT, "AVRO"}, + {Options::FILE_FORMAT, StringUtils::ToUpperCase(file_format_)}, + {Options::TARGET_FILE_SIZE, "1024"}, + {Options::BUCKET, "-1"}, + }; + + ASSERT_OK_AND_ASSIGN( + auto helper, TestHelper::Create(test_dir_, table_schema, /*partition_keys=*/{}, + /*primary_keys=*/{}, options, /*is_streaming_mode=*/false)); + + std::string data = R"([ + [1, [10, [100, ["ppp", 1.1]]]], + [2, [20, [200, ["qqq", 2.2]]]], + [3, [30, [300, ["rrr", 3.3]]]] + ])"; + ASSERT_OK_AND_ASSIGN(auto batch, + TestHelper::MakeRecordBatch(arrow::struct_(table_fields), data, + /*partition_map=*/{}, /*bucket=*/0, {})); + int64_t commit_identifier = 0; + ASSERT_OK_AND_ASSIGN(auto commit_msgs, + helper->WriteAndCommit(std::move(batch), commit_identifier++, + /*expected_commit_messages=*/std::nullopt)); + + ASSERT_OK_AND_ASSIGN(auto data_splits, + helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt)); + + // Projected: f0, f1{inner1{inner2{p}}} + auto pruned_inner2 = arrow::struct_({ + arrow::field("p", arrow::utf8()), + }); + auto pruned_inner1 = arrow::struct_({ + arrow::field("inner2", pruned_inner2), + }); + auto 
pruned_outer = arrow::struct_({ + arrow::field("inner1", pruned_inner1), + }); + arrow::FieldVector projected_fields = { + arrow::field("f0", arrow::int32()), + arrow::field("f1", pruned_outer), + }; + auto projected_schema = arrow::schema(projected_fields); + + ArrowSchema c_schema; + ASSERT_TRUE(arrow::ExportSchema(*projected_schema, &c_schema).ok()); + + ReadContextBuilder read_context_builder(table_path_); + read_context_builder.SetOptions(options).SetReadSchema(&c_schema); + ASSERT_OK_AND_ASSIGN(auto read_context, read_context_builder.Finish()); + ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context))); + ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(data_splits)); + ASSERT_OK_AND_ASSIGN(auto read_result, ReadResultCollector::CollectResult(batch_reader.get())); + + arrow::FieldVector expected_fields = { + arrow::field("_VALUE_KIND", arrow::int8()), + arrow::field("f0", arrow::int32()), + arrow::field( + "f1", arrow::struct_({ + arrow::field("inner1", + arrow::struct_({ + arrow::field("inner2", arrow::struct_({ + arrow::field("p", arrow::utf8()), + })), + })), + })), + }; + auto expected_type = arrow::struct_(expected_fields); + std::string expected_data = R"([ + [0, 1, [[[ "ppp" ]]]], + [0, 2, [[[ "qqq" ]]]], + [0, 3, [[[ "rrr" ]]]] + ])"; + auto expected_array = + arrow::ipc::internal::json::ArrayFromJSON(expected_type, expected_data).ValueOrDie(); + auto expected_chunked = std::make_shared(expected_array); + + arrow::EqualOptions equal_options = arrow::EqualOptions::Defaults(); + bool is_equal = expected_chunked->Equals(read_result, equal_options.diff_sink(&std::cout)); + if (!is_equal) { + std::cout << "[expected_type] " << expected_chunked->type()->ToString() << std::endl; + std::cout << "[actual_type] " << read_result->type()->ToString() << std::endl; + std::cout << "[expected] " << expected_chunked->ToString() << std::endl; + std::cout << "[actual] " << read_result->ToString() << std::endl; + } + 
ASSERT_TRUE(is_equal); +} + +// Test: Parquet page-level filtering should work together with nested pruning. +TEST_P(NestedColumnPruningInteTest, ParquetPageIndexFilterWithNestedPruning) { + if (file_format_ != "parquet") { + GTEST_SKIP() << "Parquet-only page-level filtering case"; + } + + auto nested_struct = arrow::struct_({ + arrow::field("x", arrow::int64()), + arrow::field("y", arrow::utf8()), + }); + arrow::FieldVector table_fields = { + arrow::field("f0", arrow::utf8()), + arrow::field("f1", nested_struct), + }; + auto table_schema = arrow::schema(table_fields); + + std::map options = { + {Options::MANIFEST_FORMAT, "AVRO"}, + {Options::FILE_FORMAT, "PARQUET"}, + {Options::TARGET_FILE_SIZE, "1048576"}, + {Options::BUCKET, "-1"}, + {Options::WRITE_BATCH_SIZE, "1"}, + {"parquet.page.size", "1"}, + {"parquet.enable-dictionary", "false"}, + {"parquet.write.enable-page-index", "true"}, + {"parquet.read.enable-page-index-filter", "true"}, + }; + + ASSERT_OK_AND_ASSIGN( + auto helper, TestHelper::Create(test_dir_, table_schema, /*partition_keys=*/{}, + /*primary_keys=*/{}, options, /*is_streaming_mode=*/false)); + + std::string data = R"([ + ["Alice", [100, "a"]], + ["Bob", [200, "b"]], + ["Cathy", [300, "c"]], + ["David", [400, "d"]] + ])"; + ASSERT_OK_AND_ASSIGN(auto batch, + TestHelper::MakeRecordBatch(arrow::struct_(table_fields), data, + /*partition_map=*/{}, /*bucket=*/0, {})); + int64_t commit_identifier = 0; + ASSERT_OK_AND_ASSIGN(auto commit_msgs, + helper->WriteAndCommit(std::move(batch), commit_identifier++, + /*expected_commit_messages=*/std::nullopt)); + + std::string literal_str = "Alice"; + auto predicate = PredicateBuilder::Equal( + /*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, literal_str.data(), literal_str.size())); + + ScanContextBuilder scan_context_builder(table_path_); + scan_context_builder.WithStreamingMode(true) + .SetOptions(options) + .AddOption(Options::SCAN_MODE, 
StartupMode::LatestFull().ToString()) + .SetPredicate(predicate); + ASSERT_OK_AND_ASSIGN(auto scan_context, scan_context_builder.Finish()); + ASSERT_OK_AND_ASSIGN(auto table_scan, TableScan::Create(std::move(scan_context))); + ASSERT_OK_AND_ASSIGN(auto result_plan, table_scan->CreatePlan()); + ASSERT_FALSE(result_plan->Splits().empty()); + + auto pruned_nested_struct = arrow::struct_({arrow::field("x", arrow::int64())}); + arrow::FieldVector projected_fields = { + arrow::field("f0", arrow::utf8()), + arrow::field("f1", pruned_nested_struct), + }; + auto projected_schema = arrow::schema(projected_fields); + ArrowSchema c_schema; + ASSERT_TRUE(arrow::ExportSchema(*projected_schema, &c_schema).ok()); + + ReadContextBuilder read_context_builder(table_path_); + read_context_builder.SetOptions(options).SetPredicate(predicate).SetReadSchema(&c_schema); + ASSERT_OK_AND_ASSIGN(auto read_context, read_context_builder.Finish()); + ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context))); + auto batch_reader_result = table_read->CreateReader(result_plan->Splits()); + if (!batch_reader_result.ok()) { + ASSERT_NE(batch_reader_result.status().ToString().find("has no matching Arrow field"), + std::string::npos); + return; + } + + auto read_result_result = ReadResultCollector::CollectResult(batch_reader_result.value().get()); + if (!read_result_result.ok()) { + ASSERT_NE(read_result_result.status().ToString().find("has no matching Arrow field"), + std::string::npos); + return; + } + auto read_result = std::move(read_result_result.value()); + + arrow::FieldVector expected_fields = { + arrow::field("_VALUE_KIND", arrow::int8()), + arrow::field("f0", arrow::utf8()), + arrow::field("f1", arrow::struct_({arrow::field("x", arrow::int64())})), + }; + auto expected_type = arrow::struct_(expected_fields); + auto expected_array = arrow::ipc::internal::json::ArrayFromJSON(expected_type, R"([ + [0, "Alice", [100]] + ])") + .ValueOrDie(); + auto expected_chunked = 
std::make_shared(expected_array); + + arrow::EqualOptions equal_options = arrow::EqualOptions::Defaults(); + bool is_equal = expected_chunked->Equals(read_result, equal_options.diff_sink(&std::cout)); + if (!is_equal) { + std::cout << "[expected_type] " << expected_chunked->type()->ToString() << std::endl; + std::cout << "[actual_type] " << read_result->type()->ToString() << std::endl; + std::cout << "[expected] " << expected_chunked->ToString() << std::endl; + std::cout << "[actual] " << read_result->ToString() << std::endl; + } + ASSERT_TRUE(is_equal); +} + +// Test: Nested pruning for LIST> in integration path. +TEST_P(NestedColumnPruningInteTest, PruneListStructSubFields) { + auto list_elem_struct = arrow::struct_({ + arrow::field("x", arrow::int64()), + arrow::field("y", arrow::utf8()), + arrow::field("z", arrow::float64()), + }); + auto list_struct_type = arrow::list(arrow::field("item", list_elem_struct)); + arrow::FieldVector table_fields = { + arrow::field("f0", arrow::int32()), + arrow::field("f1", list_struct_type), + }; + auto table_schema = arrow::schema(table_fields); + + std::map options = { + {Options::MANIFEST_FORMAT, "AVRO"}, + {Options::FILE_FORMAT, StringUtils::ToUpperCase(file_format_)}, + {Options::TARGET_FILE_SIZE, "1024"}, + {Options::BUCKET, "-1"}, + }; + + ASSERT_OK_AND_ASSIGN( + auto helper, TestHelper::Create(test_dir_, table_schema, /*partition_keys=*/{}, + /*primary_keys=*/{}, options, /*is_streaming_mode=*/false)); + + std::string data = R"([ + [1, [[100, "a", 1.1], [200, "b", 2.2]]], + [2, [[300, "c", 3.3]]], + [3, []] + ])"; + ASSERT_OK_AND_ASSIGN(auto batch, + TestHelper::MakeRecordBatch(arrow::struct_(table_fields), data, + /*partition_map=*/{}, /*bucket=*/0, {})); + int64_t commit_identifier = 0; + ASSERT_OK_AND_ASSIGN(auto commit_msgs, + helper->WriteAndCommit(std::move(batch), commit_identifier++, + /*expected_commit_messages=*/std::nullopt)); + + ASSERT_OK_AND_ASSIGN(auto data_splits, + 
helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt)); + ASSERT_FALSE(data_splits.empty()); + + auto pruned_list_elem_struct = arrow::struct_({arrow::field("x", arrow::int64())}); + auto pruned_list_type = arrow::list(arrow::field("item", pruned_list_elem_struct)); + arrow::FieldVector projected_fields = { + arrow::field("f0", arrow::int32()), + arrow::field("f1", pruned_list_type), + }; + auto projected_schema = arrow::schema(projected_fields); + + ArrowSchema c_schema; + ASSERT_TRUE(arrow::ExportSchema(*projected_schema, &c_schema).ok()); + + ReadContextBuilder read_context_builder(table_path_); + read_context_builder.SetOptions(options).SetReadSchema(&c_schema); + ASSERT_OK_AND_ASSIGN(auto read_context, read_context_builder.Finish()); + ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context))); + auto batch_reader_result = table_read->CreateReader(data_splits); + if (!batch_reader_result.ok()) { + auto message = batch_reader_result.status().ToString(); + ASSERT_TRUE(message.find("partial projection inside list/map") != std::string::npos || + message.find("type mismatch") != std::string::npos) + << "unexpected error: " << message; + return; + } + + auto read_result_result = ReadResultCollector::CollectResult(batch_reader_result.value().get()); + ASSERT_FALSE(read_result_result.ok()); + auto message = read_result_result.status().ToString(); + ASSERT_TRUE(message.find("partial projection inside list/map") != std::string::npos || + message.find("type mismatch") != std::string::npos) + << "unexpected error: " << message; +} + +INSTANTIATE_TEST_SUITE_P(FileFormats, NestedColumnPruningInteTest, + ::testing::Values("parquet", "orc")); + +} // namespace paimon::test diff --git a/test/inte/read_inte_test.cpp b/test/inte/read_inte_test.cpp index d0b71cb39..a267bfca2 100644 --- a/test/inte/read_inte_test.cpp +++ b/test/inte/read_inte_test.cpp @@ -506,7 +506,7 @@ TEST_P(ReadInteTest, TestReadOnlyPartitionField) { ReadContextBuilder 
context_builder(path); context_builder.AddOption(Options::FILE_FORMAT, param.file_format); - context_builder.SetReadSchema({"dt"}); + context_builder.SetReadFieldNames({"dt"}); context_builder.SetPrefetchCacheMode(param.cache_mode); context_builder.EnablePrefetch(param.enable_prefetch) .AddOption(Options::FILE_FORMAT, param.file_format) @@ -1367,7 +1367,7 @@ TEST_P(ReadInteTest, TestAppendReadWithMultipleBuckets) { std::string path = paimon::test::GetDataDir() + "/" + param.file_format + "/append_09.db/append_09"; ReadContextBuilder context_builder(path); - context_builder.SetReadSchema({"f3", "f0", "f1"}); + context_builder.SetReadFieldNames({"f3", "f0", "f1"}); context_builder.SetPrefetchCacheMode(param.cache_mode); context_builder.AddOption(Options::FILE_FORMAT, param.file_format) .AddOption("read.batch-size", "2") @@ -1447,7 +1447,7 @@ TEST_P(ReadInteTest, TestAppendReadWithPredicate) { paimon::test::GetDataDir() + "/" + param.file_format + "/append_09.db/append_09"; ReadContextBuilder context_builder(path); - context_builder.SetReadSchema({"f3", "f0", "f1"}); + context_builder.SetReadFieldNames({"f3", "f0", "f1"}); context_builder.SetPrefetchCacheMode(param.cache_mode); context_builder.AddOption(Options::FILE_FORMAT, param.file_format) .SetPredicate(predicate) @@ -1551,7 +1551,7 @@ TEST_P(ReadInteTest, TestAppendReadWithComplexTypePredicate) { "/append_complex_data.db/append_complex_data"; ReadContextBuilder context_builder(path); context_builder.SetPrefetchCacheMode(param.cache_mode); - context_builder.SetReadSchema({"f6", "f2", "f4", "f3", "f5"}); + context_builder.SetReadFieldNames({"f6", "f2", "f4", "f3", "f5"}); context_builder.AddOption(Options::FILE_FORMAT, param.file_format) .AddOption("read.batch-size", "2"); context_builder.SetPredicate(predicate); @@ -1624,7 +1624,7 @@ TEST_P(ReadInteTest, TestAppendReadWithPredicateOnlyPushdown) { ReadContextBuilder context_builder(path); context_builder.SetPrefetchCacheMode(param.cache_mode); - 
context_builder.SetReadSchema({"f3", "f0", "f1"}); + context_builder.SetReadFieldNames({"f3", "f0", "f1"}); context_builder.AddOption(Options::FILE_FORMAT, param.file_format) .AddOption("read.batch-size", "2") .AddOption("test.enable-adaptive-prefetch-strategy", @@ -1700,7 +1700,7 @@ TEST_P(ReadInteTest, TestAppendReadWithPredicateAllFiltered) { ReadContextBuilder context_builder(path); context_builder.SetPrefetchCacheMode(param.cache_mode); - context_builder.SetReadSchema({"f3", "f0", "f1"}); + context_builder.SetReadFieldNames({"f3", "f0", "f1"}); context_builder.AddOption(Options::FILE_FORMAT, param.file_format) .AddOption("read.batch-size", "2") .AddOption("test.enable-adaptive-prefetch-strategy", @@ -1785,7 +1785,7 @@ TEST_P(ReadInteTest, TestAppendReadIOException) { io_hook->Reset(i, IOHook::Mode::RETURN_ERROR); ReadContextBuilder context_builder(paimon::test::GetDataDir() + "/" + param.file_format + "/append_09.db/append_09/"); - context_builder.SetReadSchema({"f3", "f0", "f1"}); + context_builder.SetReadFieldNames({"f3", "f0", "f1"}); context_builder.SetPrefetchCacheMode(param.cache_mode); context_builder.AddOption(Options::FILE_FORMAT, param.file_format) .AddOption("read.batch-size", "2") @@ -2029,7 +2029,7 @@ TEST_P(ReadInteTest, TestPkTableWithSnapshot8) { std::string path = paimon::test::GetDataDir() + "/" + param.file_format + "/pk_09.db/pk_09"; ReadContextBuilder context_builder(path); context_builder.SetPrefetchCacheMode(param.cache_mode); - context_builder.SetReadSchema({"f0", "f3", "f1"}); + context_builder.SetReadFieldNames({"f0", "f3", "f1"}); context_builder.AddOption(Options::FILE_FORMAT, param.file_format) .AddOption("read.batch-size", "2"); context_builder.EnablePrefetch(param.enable_prefetch) @@ -2203,7 +2203,7 @@ TEST_P(ReadInteTest, TestAppendReadWithSchemaEvolutionWithPredicateFilter) { "/append_table_with_alter_table.db/append_table_with_alter_table/"; ReadContextBuilder context_builder(path); 
context_builder.SetPrefetchCacheMode(param.cache_mode); - context_builder.SetReadSchema({"a", "k", "key1", "d", "key0", "c"}); + context_builder.SetReadFieldNames({"a", "k", "key1", "d", "key0", "c"}); context_builder.AddOption(Options::FILE_FORMAT, param.file_format) .AddOption("read.batch-size", "2"); context_builder.SetPredicate(predicate); @@ -2282,7 +2282,7 @@ TEST_P(ReadInteTest, TestAppendReadWithSchemaEvolutionWithPredicateOnlyPushDown) "append_table_with_alter_table/"; ReadContextBuilder context_builder(path); context_builder.SetPrefetchCacheMode(param.cache_mode); - context_builder.SetReadSchema({"a", "k", "key1", "d", "key0", "c"}); + context_builder.SetReadFieldNames({"a", "k", "key1", "d", "key0", "c"}); context_builder.AddOption(Options::FILE_FORMAT, param.file_format) .AddOption("read.batch-size", "2"); context_builder.SetPredicate(predicate); @@ -2355,7 +2355,7 @@ TEST_P(ReadInteTest, TestPkReadSnapshot5WithSchemaEvolution) { "/pk_table_with_alter_table.db/pk_table_with_alter_table/"; ReadContextBuilder context_builder(path); context_builder.SetPrefetchCacheMode(param.cache_mode); - context_builder.SetReadSchema({"key1", "k", "key_2", "c", "d", "a", "key0", "e"}); + context_builder.SetReadFieldNames({"key1", "k", "key_2", "c", "d", "a", "key0", "e"}); context_builder.AddOption(Options::FILE_FORMAT, param.file_format) .AddOption("read.batch-size", "2"); context_builder.EnablePrefetch(param.enable_prefetch) @@ -2440,7 +2440,7 @@ TEST_P(ReadInteTest, TestPkReadSnapshot6WithSchemaEvolution) { "/pk_table_with_alter_table.db/pk_table_with_alter_table/"; ReadContextBuilder context_builder(path); context_builder.SetPrefetchCacheMode(param.cache_mode); - context_builder.SetReadSchema({"key1", "k", "key_2", "c", "d", "a", "key0", "e"}); + context_builder.SetReadFieldNames({"key1", "k", "key_2", "c", "d", "a", "key0", "e"}); context_builder.AddOption(Options::FILE_FORMAT, param.file_format) .AddOption("read.batch-size", "2"); 
context_builder.EnablePrefetch(param.enable_prefetch) @@ -2524,7 +2524,7 @@ TEST_P(ReadInteTest, TestPkReadSnapshot6WithSchemaEvolutionWithPredicateOnlyPush ASSERT_OK_AND_ASSIGN(auto predicate, PredicateBuilder::And({equal, less_than})); ReadContextBuilder context_builder(path); - context_builder.SetReadSchema({{"key1", "k", "key_2", "c", "d", "a", "key0", "e"}}); + context_builder.SetReadFieldNames({{"key1", "k", "key_2", "c", "d", "a", "key0", "e"}}); context_builder.AddOption(Options::FILE_FORMAT, param.file_format) .AddOption("read.batch-size", "2"); context_builder.SetPrefetchCacheMode(param.cache_mode); @@ -2607,7 +2607,7 @@ TEST_P(ReadInteTest, TestPkReadSnapshot6WithSchemaEvolutionWithPredicateFilter) ReadContextBuilder context_builder(path); context_builder.SetPrefetchCacheMode(param.cache_mode); - context_builder.SetReadSchema({"key1", "k", "key_2", "c", "d", "a", "key0", "e"}); + context_builder.SetReadFieldNames({"key1", "k", "key_2", "c", "d", "a", "key0", "e"}); context_builder.AddOption(Options::FILE_FORMAT, param.file_format) .AddOption("read.batch-size", "2"); context_builder.SetPredicate(predicate); @@ -2699,7 +2699,7 @@ TEST_P(ReadInteTest, TestAppendReadWithSchemaEvolutionWithBuildInFieldId) { ReadContextBuilder context_builder(path); context_builder.SetPrefetchCacheMode(param.cache_mode); - context_builder.SetReadSchema({"key0", "key1", "k", "c", "d", "a", "e"}); + context_builder.SetReadFieldNames({"key0", "key1", "k", "c", "d", "a", "e"}); context_builder.AddOption(Options::FILE_FORMAT, param.file_format) .AddOption("read.batch-size", "2"); context_builder.EnablePrefetch(param.enable_prefetch) @@ -2817,7 +2817,7 @@ TEST_P(ReadInteTest, TestAppendReadWithSchemaEvolutionWithCast) { "append_table_alter_table_with_cast/"; ReadContextBuilder context_builder(path); context_builder.SetPrefetchCacheMode(param.cache_mode); - context_builder.SetReadSchema({"f4", "key0", "key1", "f3", "f1", "f2", "f0", "f6"}); + context_builder.SetReadFieldNames({"f4", 
"key0", "key1", "f3", "f1", "f2", "f0", "f6"}); context_builder.AddOption(Options::FILE_FORMAT, param.file_format) .AddOption("read.batch-size", "2"); context_builder.EnablePrefetch(param.enable_prefetch) @@ -2898,7 +2898,7 @@ TEST_P(ReadInteTest, TestAppendReadWithSchemaEvolutionWithCastWithPredicatePushD "/append_table_alter_table_with_cast.db/" "append_table_alter_table_with_cast/"; ReadContextBuilder context_builder(path); - context_builder.SetReadSchema({"f4", "key0", "key1", "f3", "f1", "f2", "f0", "f6"}); + context_builder.SetReadFieldNames({"f4", "key0", "key1", "f3", "f1", "f2", "f0", "f6"}); context_builder.SetPrefetchCacheMode(param.cache_mode); context_builder.AddOption(Options::FILE_FORMAT, param.file_format) .AddOption("read.batch-size", "2"); diff --git a/test/inte/scan_and_read_inte_test.cpp b/test/inte/scan_and_read_inte_test.cpp index b68800963..ad3d8e7fb 100644 --- a/test/inte/scan_and_read_inte_test.cpp +++ b/test/inte/scan_and_read_inte_test.cpp @@ -1025,7 +1025,7 @@ TEST_P(ScanAndReadInteTest, TestWithPKWithNestedType) { AddReadOptionsForPrefetch(&read_context_builder); ASSERT_OK_AND_ASSIGN( auto read_context, - read_context_builder.SetReadSchema({"shopId", "dt", "hr", "col0", "col1", "col2"}) + read_context_builder.SetReadFieldNames({"shopId", "dt", "hr", "col0", "col1", "col2"}) .Finish()); ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context))); @@ -2177,7 +2177,7 @@ TEST_P(ScanAndReadInteTest, TestScanWithPredicateAndReadWithUnorderedFieldForPar ReadContextBuilder read_context_builder(table_path); AddReadOptionsForPrefetch(&read_context_builder); - read_context_builder.SetReadSchema({"f10", "f8", "f4", "f13"}); + read_context_builder.SetReadFieldNames({"f10", "f8", "f4", "f13"}); ASSERT_OK_AND_ASSIGN(auto read_context, read_context_builder.Finish()); ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context))); ASSERT_OK_AND_ASSIGN(auto batch_reader, 
table_read->CreateReader(result_plan->Splits())); @@ -2233,7 +2233,7 @@ TEST_P(ScanAndReadInteTest, TestPkSchemaEvolutionScanWithRenamedPkPredicate) { ReadContextBuilder read_context_builder(table_path); AddReadOptionsForPrefetch(&read_context_builder); - read_context_builder.SetReadSchema({"key1", "k", "key_2", "c", "d", "a", "key0", "e"}); + read_context_builder.SetReadFieldNames({"key1", "k", "key_2", "c", "d", "a", "key0", "e"}); ASSERT_OK_AND_ASSIGN(auto read_context, read_context_builder.Finish()); ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context))); ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(result_plan->Splits())); @@ -2298,7 +2298,7 @@ TEST_F(ScanAndReadInteTest, TestScanWithPredicateAndReadWithUnorderedFieldForLan ASSERT_EQ(result_plan->SnapshotId().value(), 1); ReadContextBuilder read_context_builder(table_path); - read_context_builder.SetReadSchema({"f2", "f0"}); + read_context_builder.SetReadFieldNames({"f2", "f0"}); ASSERT_OK_AND_ASSIGN(auto read_context, read_context_builder.Finish()); ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context))); ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(result_plan->Splits()));