From f6fe01d0cd468d8a8ec2de5f8a283a60dcbc551c Mon Sep 17 00:00:00 2001
From: "yonghao.fyh" <yonghao.fyh@alibaba-inc.com>
Date: Wed, 10 Jun 2026 16:00:41 +0800
Subject: [PATCH 01/24] feat: support read for nested type sub column

---
 include/paimon/read_context.h                 |  44 +-
 src/paimon/CMakeLists.txt                     |   2 +
 .../global_index/global_index_write_task.cpp  |   2 +-
 src/paimon/core/io/field_mapping_reader.cpp   |  52 ++-
 src/paimon/core/io/field_mapping_reader.h     |  10 +-
 .../core/operation/internal_read_context.cpp  |  32 +-
 .../operation/internal_read_context_test.cpp  |  12 +-
 .../operation/merge_file_split_read_test.cpp  |  26 +-
 .../operation/raw_file_split_read_test.cpp    |   4 +-
 src/paimon/core/operation/read_context.cpp    |  43 +-
 .../core/operation/read_context_test.cpp      |   6 +-
 .../core/table/source/table_read_test.cpp     |  14 +-
 .../table/system/audit_log_system_table.cpp   |   2 +-
 src/paimon/core/utils/field_mapping.cpp       |  29 +-
 src/paimon/core/utils/field_mapping.h         |   2 +-
 .../core/utils/nested_projection_utils.cpp    | 175 ++++++++
 .../core/utils/nested_projection_utils.h      |  81 ++++
 .../utils/nested_projection_utils_test.cpp    | 212 ++++++++++
 .../format/parquet/file_reader_wrapper.cpp    | 131 ++++++
 .../format/parquet/file_reader_wrapper.h      |  12 +
 .../parquet/parquet_file_batch_reader.cpp     | 118 +++++-
 .../parquet/parquet_file_batch_reader.h       |  18 +
 test/inte/CMakeLists.txt                      |   7 +
 test/inte/blob_table_inte_test.cpp            |   2 +-
 test/inte/data_evolution_table_test.cpp       |   2 +-
 test/inte/global_index_test.cpp               |   2 +-
 test/inte/nested_column_pruning_inte_test.cpp | 379 ++++++++++++++++++
 test/inte/read_inte_test.cpp                  |  34 +-
 test/inte/scan_and_read_inte_test.cpp         |   6 +-
 29 files changed, 1344 insertions(+), 115 deletions(-)
 create mode 100644 src/paimon/core/utils/nested_projection_utils.cpp
 create mode 100644 src/paimon/core/utils/nested_projection_utils.h
 create mode 100644 src/paimon/core/utils/nested_projection_utils_test.cpp
 create mode 100644 test/inte/nested_column_pruning_inte_test.cpp
diff --git a/include/paimon/read_context.h b/include/paimon/read_context.h
index df124b0a0..47165e24f 100644
--- a/include/paimon/read_context.h
+++ b/include/paimon/read_context.h
@@ -23,6 +23,7 @@
 #include <string>
 #include <vector>
 
+#include "arrow/c/abi.h"
 #include "paimon/cache/cache.h"
 #include "paimon/predicate/predicate.h"
 #include "paimon/result.h"
@@ -75,7 +76,7 @@ class PAIMON_EXPORT ReadContext {
         return options_;
     }
 
-    const std::vector<std::string>& GetReadSchema() const {
+    const std::vector<std::string>& GetReadFieldNames() const {
         return read_schema_;
     }
 
@@ -130,6 +131,22 @@ class PAIMON_EXPORT ReadContext {
         return cache_;
     }
 
+    /// Whether a read schema (C ArrowSchema) for nested column pruning was provided.
+    bool HasReadSchema() const {
+        return has_read_schema_;
+    }
+
+    /// Get the read schema as a mutable C ArrowSchema pointer; this context keeps ownership.
+    /// One-shot: ImportSchema consumes (releases) the content, and HasReadSchema() is not reset.
+    ArrowSchema* GetReadSchema() {
+        return &read_schema_c_;
+    }
+
+    /// Set the read schema from a C ArrowSchema by moving its content into this object; the
+    /// input's release is set to nullptr afterwards. A null or already-released input is ignored.
+    /// Called internally by ReadContextBuilder.
+    void SetReadSchema(ArrowSchema* schema);
+
  private:
     std::string path_;
     std::string branch_;
@@ -151,6 +168,8 @@ class PAIMON_EXPORT ReadContext {
     PrefetchCacheMode prefetch_cache_mode_;
     CacheConfig cache_config_;
     std::shared_ptr<Cache> cache_;
+    ArrowSchema read_schema_c_{};   // C ABI schema for nested column pruning
+    bool has_read_schema_ = false;  // whether read_schema_c_ holds valid content
 };
 
 /// `ReadContextBuilder` used to build a `ReadContext`, has input validation.
@@ -173,9 +192,9 @@ class PAIMON_EXPORT ReadContextBuilder {
     ///
     /// @param read_field_names Vector of field names to read from the table.
     /// @return Reference to this builder for method chaining.
-    /// @note Currently supports top-level field selection. Future versions may support
-    ///       nested field selection using ArrowSchema for more granular projection
-    ReadContextBuilder& SetReadSchema(const std::vector<std::string>& read_field_names);
+    /// @note Currently supports top-level field selection. For nested field selection
+    ///       use SetReadSchema(ArrowSchema*) instead.
+    ReadContextBuilder& SetReadFieldNames(const std::vector<std::string>& read_field_names);
     /// Set the schema fields to read from the table.
     ///
     /// If not set, all fields from the table schema will be read. This is useful for
@@ -186,10 +205,23 @@ class PAIMON_EXPORT ReadContextBuilder {
     /// @return Reference to this builder for method chaining.
     /// @note Currently supports top-level field selection. Future versions may support
     ///       nested field selection using ArrowSchema for more granular projection.
-    /// @note SetReadFieldIds() and SetReadSchema() are mutually exclusive.
-    ///       Calling both will ignore the read schema set by SetReadSchema().
+    /// @note SetReadFieldIds() and SetReadFieldNames() are mutually exclusive.
+    ///       Calling both will ignore the read schema set by SetReadFieldNames().
     ReadContextBuilder& SetReadFieldIds(const std::vector<int32_t>& read_field_ids);
 
+    /// Set the projected Arrow Schema for nested column pruning.
+    ///
+    /// The projected schema is an Arrow C Data Interface schema where STRUCT types
+    /// may contain only a subset of the original sub-fields, enabling nested column
+    /// pruning to reduce I/O. Each Arrow field must carry a "paimon.id" metadata
+    /// entry for field matching.
+    ///
+    /// @param projected_schema Arrow C Schema (consumed/released by this call).
+    /// @return Reference to this builder for method chaining.
+    /// @note Priority: projected_schema > read_field_ids > read_field_names.
+    ///       When set, read_field_ids and read_field_names are ignored.
+    ReadContextBuilder& SetReadSchema(ArrowSchema* projected_schema);
+
     /// Set a configuration options map to set some option entries which are not defined in the
     /// table schema or whose values you want to overwrite.
     /// @note The options map will clear the options added by `AddOption()` before.
diff --git a/src/paimon/CMakeLists.txt b/src/paimon/CMakeLists.txt
index 65026bfe3..a4c672c62 100644
--- a/src/paimon/CMakeLists.txt
+++ b/src/paimon/CMakeLists.txt
@@ -342,6 +342,7 @@ set(PAIMON_CORE_SRCS
     core/utils/blob_view_lookup.cpp
     core/utils/consumer_manager.cpp
     core/utils/field_mapping.cpp
+    core/utils/nested_projection_utils.cpp
     core/utils/file_store_path_factory.cpp
     core/utils/file_utils.cpp
     core/utils/manifest_meta_reader.cpp
@@ -735,6 +736,7 @@ if(PAIMON_BUILD_TESTS)
                     core/utils/consumer_manager_test.cpp
                     core/utils/file_store_path_factory_cache_test.cpp
                     core/utils/field_mapping_test.cpp
+                    core/utils/nested_projection_utils_test.cpp
                     core/utils/file_store_path_factory_test.cpp
                     core/utils/file_utils_test.cpp
                     core/utils/manifest_meta_reader_test.cpp
diff --git a/src/paimon/core/global_index/global_index_write_task.cpp b/src/paimon/core/global_index/global_index_write_task.cpp
index 5ee425f86..b0cbfb1e0 100644
--- a/src/paimon/core/global_index/global_index_write_task.cpp
+++ b/src/paimon/core/global_index/global_index_write_task.cpp
@@ -83,7 +83,7 @@ Result<std::unique_ptr<BatchReader>> CreateBatchReader(
         .WithFileSystem(core_options.GetFileSystem())
         .EnablePrefetch(true)
         .WithMemoryPool(pool)
-        .SetReadSchema({field_name, SpecialFields::RowId().Name()});
+        .SetReadFieldNames({field_name, SpecialFields::RowId().Name()});
     PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<ReadContext> read_context,
                            read_context_builder.Finish());
     PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<TableRead> table_read,
diff --git a/src/paimon/core/io/field_mapping_reader.cpp b/src/paimon/core/io/field_mapping_reader.cpp
index 767e0db6c..97f733cf5 100644
--- a/src/paimon/core/io/field_mapping_reader.cpp
+++ b/src/paimon/core/io/field_mapping_reader.cpp
@@ -35,6 +35,7 @@
 #include "paimon/core/casting/cast_executor.h"
 #include "paimon/core/casting/casting_utils.h"
 #include "paimon/core/utils/field_mapping.h"
+#include "paimon/core/utils/nested_projection_utils.h"
 #include "paimon/memory/bytes.h"
 #include "paimon/reader/batch_reader.h"
 
@@ -70,8 +71,12 @@ FieldMappingReader::FieldMappingReader(int32_t field_count,
         // post-rename logical name. If we skipped mapping, the inner reader's
         // batch would be passed through with the old physical name and the
         // consumer's name-based lookup against the read schema would fail.
+        // Nested type difference (nested column pruning) also requires mapping
+        // so that PruneArray can trim excess sub-fields from the format reader.
         if (non_partition_info_.non_partition_data_schema[i].Name() !=
-            non_partition_info_.non_partition_read_schema[i].Name()) {
+                non_partition_info_.non_partition_read_schema[i].Name() ||
+            !non_partition_info_.non_partition_data_schema[i].Type()->Equals(
+                non_partition_info_.non_partition_read_schema[i].Type())) {
             need_mapping_ = true;
         }
     }
@@ -142,9 +147,10 @@ Result<BatchReader::ReadBatchWithBitmap> FieldMappingReader::NextBatchWithBitmap
     // mapping non-partition array
     PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> casted_non_partition_array,
                            CastNonPartitionArrayIfNeed(non_partition_array));
-    MappingFields(casted_non_partition_array, non_partition_info_.non_partition_read_schema,
-                  non_partition_info_.idx_in_target_read_schema, &target_array,
-                  &target_field_names);
+    PAIMON_RETURN_NOT_OK(
+        MappingFields(casted_non_partition_array, non_partition_info_.non_partition_read_schema,
+                      non_partition_info_.idx_in_target_read_schema, &target_array,
+                      &target_field_names));
 
     // mapping partition array
     if (partition_info_ != std::nullopt) {
@@ -153,9 +159,10 @@ Result<BatchReader::ReadBatchWithBitmap> FieldMappingReader::NextBatchWithBitmap
                                    GeneratePartitionArray(non_partition_array->length()));
         }
         auto trim_partition_array = partition_array_->Slice(0, non_partition_array->length());
-        MappingFields(trim_partition_array, partition_info_.value().partition_read_schema,
-                      partition_info_.value().idx_in_target_read_schema, &target_array,
-                      &target_field_names);
+        PAIMON_RETURN_NOT_OK(
+            MappingFields(trim_partition_array, partition_info_.value().partition_read_schema,
+                          partition_info_.value().idx_in_target_read_schema, &target_array,
+                          &target_field_names));
     }
     // mapping non-exist array
     if (non_exist_field_info_ != std::nullopt) {
@@ -164,9 +171,10 @@ Result<BatchReader::ReadBatchWithBitmap> FieldMappingReader::NextBatchWithBitmap
                                    GenerateNonExistArray(non_partition_array->length()));
         }
         auto trim_non_exist_array = non_exist_array_->Slice(0, non_partition_array->length());
-        MappingFields(trim_non_exist_array, non_exist_field_info_.value().non_exist_read_schema,
-                      non_exist_field_info_.value().idx_in_target_read_schema, &target_array,
-                      &target_field_names);
+        PAIMON_RETURN_NOT_OK(
+            MappingFields(trim_non_exist_array, non_exist_field_info_.value().non_exist_read_schema,
+                          non_exist_field_info_.value().idx_in_target_read_schema, &target_array,
+                          &target_field_names));
     }
 
     // construct target array
@@ -283,20 +291,28 @@ Result<std::shared_ptr<arrow::Array>> FieldMappingReader::GenerateNonExistArray(
     return arrow_array;
 }
 
-void FieldMappingReader::MappingFields(const std::shared_ptr<arrow::Array>& data_array,
-                                       const std::vector<DataField>& read_fields_of_data_array,
-                                       const std::vector<int32_t>& idx_in_target_schema,
-                                       arrow::ArrayVector* target_array,
-                                       std::vector<std::string>* target_field_names) {
+Status FieldMappingReader::MappingFields(const std::shared_ptr<arrow::Array>& data_array,
+                                         const std::vector<DataField>& read_fields_of_data_array,
+                                         const std::vector<int32_t>& idx_in_target_schema,
+                                         arrow::ArrayVector* target_array,
+                                         std::vector<std::string>* target_field_names) {
     auto* struct_array = arrow::internal::checked_cast<arrow::StructArray*>(data_array.get());
     assert(struct_array);
     assert(struct_array->fields().size() == idx_in_target_schema.size());
     for (size_t i = 0; i < idx_in_target_schema.size(); i++) {
-        // target type may be string type, but after adapter transform, type may be dictionary,
-        // need reconstruct struct type
-        (*target_array)[idx_in_target_schema[i]] = struct_array->field(i);
+        std::shared_ptr<arrow::Array> field_array = struct_array->field(i);
+
+        // Fallback nested pruning: drop sub-fields the format reader returned beyond the target.
+        // NOTE(review): a dictionary array for a string target also differs -- confirm PruneArray.
+        const std::shared_ptr<arrow::DataType>& target_type = read_fields_of_data_array[i].Type();
+        if (!field_array->type()->Equals(target_type)) {
+            PAIMON_ASSIGN_OR_RAISE(field_array, PruneArray(field_array, target_type));
+        }
+
+        (*target_array)[idx_in_target_schema[i]] = std::move(field_array);
         (*target_field_names)[idx_in_target_schema[i]] = read_fields_of_data_array[i].Name();
     }
+    return Status::OK();
 }
 
 }  // namespace paimon
diff --git a/src/paimon/core/io/field_mapping_reader.h b/src/paimon/core/io/field_mapping_reader.h
index ffd18bd68..1ab7f41eb 100644
--- a/src/paimon/core/io/field_mapping_reader.h
+++ b/src/paimon/core/io/field_mapping_reader.h
@@ -96,11 +96,11 @@ class FieldMappingReader : public FileBatchReader {
     Result<std::shared_ptr<arrow::Array>> CastNonPartitionArrayIfNeed(
         const std::shared_ptr<arrow::Array>& src_array) const;
 
-    static void MappingFields(const std::shared_ptr<arrow::Array>& src_array,
-                              const std::vector<DataField>& read_fields_of_data_array,
-                              const std::vector<int32_t>& idx_in_target_schema,
-                              arrow::ArrayVector* target_array,
-                              std::vector<std::string>* target_field_names);
+    static Status MappingFields(const std::shared_ptr<arrow::Array>& src_array,
+                                const std::vector<DataField>& read_fields_of_data_array,
+                                const std::vector<int32_t>& idx_in_target_schema,
+                                arrow::ArrayVector* target_array,
+                                std::vector<std::string>* target_field_names);
 
  private:
     bool need_mapping_ = false;
diff --git a/src/paimon/core/operation/internal_read_context.cpp b/src/paimon/core/operation/internal_read_context.cpp
index 9ff2b4647..a1b670827 100644
--- a/src/paimon/core/operation/internal_read_context.cpp
+++ b/src/paimon/core/operation/internal_read_context.cpp
@@ -18,16 +18,15 @@
 
 #include <utility>
 
+#include "arrow/c/abi.h"
+#include "arrow/c/bridge.h"
 #include "paimon/common/predicate/predicate_validator.h"
 #include "paimon/common/table/special_fields.h"
 #include "paimon/common/types/data_field.h"
+#include "paimon/common/utils/arrow/status_utils.h"
 #include "paimon/core/schema/arrow_schema_validator.h"
 #include "paimon/status.h"
 
-namespace arrow {
-class Schema;
-}  // namespace arrow
-
 namespace paimon {
 Result<std::unique_ptr<InternalReadContext>> InternalReadContext::Create(
     const std::shared_ptr<ReadContext>& context, const std::shared_ptr<TableSchema>& table_schema,
@@ -37,8 +36,25 @@ Result<std::unique_ptr<InternalReadContext>> InternalReadContext::Create(
                                                 context->GetFileSystemSchemeToIdentifierMap()));
     core_options.WithCache(context->GetCache());
     // prepare read schema
+    // Priority: projected_arrow_schema > read_field_ids > read_field_names
     std::vector<DataField> read_data_fields;
-    if (!context->GetReadFieldIds().empty()) {
+    if (context->HasReadSchema()) {
+        // Nested column pruning path: user provided a projected C ArrowSchema
+        // where STRUCT types may contain only a subset of sub-fields.
+        // ImportSchema consumes the C schema — that's fine, it's one-shot usage.
+        PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(
+            std::shared_ptr<arrow::Schema> projected_schema,
+            arrow::ImportSchema(context->GetReadSchema()));
+        PAIMON_ASSIGN_OR_RAISE(read_data_fields,
+                               DataField::ConvertArrowSchemaToDataFields(projected_schema));
+        // Validate that every top-level field exists in the table schema by field ID.
+        for (const auto& field : read_data_fields) {
+            if (!SpecialFields::IsSpecialFieldName(field.Name())) {
+                PAIMON_ASSIGN_OR_RAISE([[maybe_unused]] DataField unused,
+                                       table_schema->GetField(field.Id()));
+            }
+        }
+    } else if (!context->GetReadFieldIds().empty()) {
         read_data_fields.reserve(context->GetReadFieldIds().size());
         for (const auto& field_id : context->GetReadFieldIds()) {
             // if enable row tracking or data evolution, check special fields
@@ -64,9 +80,9 @@ Result<std::unique_ptr<InternalReadContext>> InternalReadContext::Create(
             PAIMON_ASSIGN_OR_RAISE(DataField field, table_schema->GetField(field_id));
             read_data_fields.push_back(field);
         }
-    } else if (!context->GetReadSchema().empty()) {
-        read_data_fields.reserve(context->GetReadSchema().size());
-        for (const auto& name : context->GetReadSchema()) {
+    } else if (!context->GetReadFieldNames().empty()) {
+        read_data_fields.reserve(context->GetReadFieldNames().size());
+        for (const auto& name : context->GetReadFieldNames()) {
             // if enable row tracking or data evolution, check special fields
             if (core_options.RowTrackingEnabled() && name == SpecialFields::RowId().Name()) {
                 read_data_fields.push_back(SpecialFields::RowId());
diff --git a/src/paimon/core/operation/internal_read_context_test.cpp b/src/paimon/core/operation/internal_read_context_test.cpp
index e48336b3f..4ef53baa8 100644
--- a/src/paimon/core/operation/internal_read_context_test.cpp
+++ b/src/paimon/core/operation/internal_read_context_test.cpp
@@ -50,7 +50,7 @@ TEST(InternalReadContext, TestReadWithUnspecifiedSchema) {
 TEST(InternalReadContext, TestReadWithSpecifiedSchema) {
     std::string path = paimon::test::GetDataDir() + "/orc/append_09.db/append_09";
     ReadContextBuilder context_builder(path);
-    context_builder.SetReadSchema({"f3", "f0"});
+    context_builder.SetReadFieldNames({"f3", "f0"});
     ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish());
     SchemaManager schema_manager(std::make_shared<LocalFileSystem>(), read_context->GetPath());
     ASSERT_OK_AND_ASSIGN(auto table_schema, schema_manager.ReadSchema(0));
@@ -84,7 +84,7 @@ TEST(InternalReadContext, TestReadWithSpecifiedFieldIdAndSchema) {
     ReadContextBuilder context_builder(path);
     // read schema is specified, read fields in schema
     // will use field ids instead of field names.
-    context_builder.SetReadSchema({"f0"});
+    context_builder.SetReadFieldNames({"f0"});
     context_builder.SetReadFieldIds({3, 0});
     ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish());
     SchemaManager schema_manager(std::make_shared<LocalFileSystem>(), read_context->GetPath());
@@ -103,7 +103,7 @@ TEST(InternalReadContext, TestReadWithRowTrackingAndScoreFields) {
         // test simple
         std::string path = paimon::test::GetDataDir() + "/orc/append_09.db/append_09";
         ReadContextBuilder context_builder(path);
-        context_builder.SetReadSchema({"f3", "f0", "_ROW_ID", "_SEQUENCE_NUMBER", "_INDEX_SCORE"});
+        context_builder.SetReadFieldNames({"f3", "f0", "_ROW_ID", "_SEQUENCE_NUMBER", "_INDEX_SCORE"});
         ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish());
         SchemaManager schema_manager(std::make_shared<LocalFileSystem>(), read_context->GetPath());
         ASSERT_OK_AND_ASSIGN(auto table_schema, schema_manager.ReadSchema(0));
@@ -124,7 +124,7 @@ TEST(InternalReadContext, TestReadWithRowTrackingAndScoreFields) {
         // test invalid case: disable row tracking while read row tracking fields
         std::string path = paimon::test::GetDataDir() + "/orc/append_09.db/append_09";
         ReadContextBuilder context_builder(path);
-        context_builder.SetReadSchema({"f3", "f0", "_ROW_ID", "_SEQUENCE_NUMBER"});
+        context_builder.SetReadFieldNames({"f3", "f0", "_ROW_ID", "_SEQUENCE_NUMBER"});
         ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish());
         SchemaManager schema_manager(std::make_shared<LocalFileSystem>(), read_context->GetPath());
         ASSERT_OK_AND_ASSIGN(auto table_schema, schema_manager.ReadSchema(0));
@@ -136,7 +136,7 @@ TEST(InternalReadContext, TestReadWithRowTrackingAndScoreFields) {
         // test invalid case: disable data evolution while read score fields
         std::string path = paimon::test::GetDataDir() + "/orc/append_09.db/append_09";
         ReadContextBuilder context_builder(path);
-        context_builder.SetReadSchema({"f3", "f0", "_INDEX_SCORE"});
+        context_builder.SetReadFieldNames({"f3", "f0", "_INDEX_SCORE"});
         ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish());
         SchemaManager schema_manager(std::make_shared<LocalFileSystem>(), read_context->GetPath());
         ASSERT_OK_AND_ASSIGN(auto table_schema, schema_manager.ReadSchema(0));
@@ -149,7 +149,7 @@ TEST(InternalReadContext, TestReadWithRowTrackingAndScoreFields) {
 TEST(InternalReadContext, TestReadWithValueKindField) {
     std::string path = paimon::test::GetDataDir() + "/orc/append_09.db/append_09";
     ReadContextBuilder context_builder(path);
-    context_builder.SetReadSchema({"f3", "_VALUE_KIND", "f0"});
+    context_builder.SetReadFieldNames({"f3", "_VALUE_KIND", "f0"});
     ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish());
     SchemaManager schema_manager(std::make_shared<LocalFileSystem>(), read_context->GetPath());
     ASSERT_OK_AND_ASSIGN(auto table_schema, schema_manager.ReadSchema(0));
diff --git a/src/paimon/core/operation/merge_file_split_read_test.cpp b/src/paimon/core/operation/merge_file_split_read_test.cpp
index ec5c28f0e..72e4bb05a 100644
--- a/src/paimon/core/operation/merge_file_split_read_test.cpp
+++ b/src/paimon/core/operation/merge_file_split_read_test.cpp
@@ -611,7 +611,7 @@ TEST_P(MergeFileSplitReadTest, TestSimple) {
     auto read_schema = DataField::ConvertDataFieldsToArrowSchema(raw_read_fields);
     ASSERT_TRUE(read_schema);
 
-    context_builder.SetReadSchema({"k1", "p1", "s1", "v0", "v1"});
+    context_builder.SetReadFieldNames({"k1", "p1", "s1", "v0", "v1"});
     context_builder.SetOptions({{Options::SEQUENCE_FIELD, "s0,s1"},
                                 {Options::MERGE_ENGINE, "deduplicate"},
                                 {Options::IGNORE_DELETE, "true"}});
@@ -677,7 +677,7 @@ TEST_P(MergeFileSplitReadTest, TestLookUp) {
     auto read_schema = DataField::ConvertDataFieldsToArrowSchema(raw_read_fields);
     ASSERT_TRUE(read_schema);
 
-    context_builder.SetReadSchema({"k1", "p1", "s1", "v0", "v1"});
+    context_builder.SetReadFieldNames({"k1", "p1", "s1", "v0", "v1"});
     context_builder.SetOptions({{Options::SEQUENCE_FIELD, "s0,s1"},
                                 {Options::MERGE_ENGINE, "deduplicate"},
                                 {Options::IGNORE_DELETE, "true"},
@@ -751,7 +751,7 @@ TEST_P(MergeFileSplitReadTest, TestDeduplicateMergeEngineWithDeleteMsg) {
     auto read_schema = DataField::ConvertDataFieldsToArrowSchema(raw_read_fields);
     ASSERT_TRUE(read_schema);
 
-    context_builder.SetReadSchema({"k0", "k1", "v0", "v1", "v2"});
+    context_builder.SetReadFieldNames({"k0", "k1", "v0", "v1", "v2"});
     context_builder.SetOptions({{Options::MERGE_ENGINE, "deduplicate"}});
     AddOptions(&context_builder);
     ASSERT_OK_AND_ASSIGN(std::shared_ptr<ReadContext> read_context, context_builder.Finish());
@@ -792,7 +792,7 @@ TEST_P(MergeFileSplitReadTest, TestReadWithPredicate) {
     auto read_schema = DataField::ConvertDataFieldsToArrowSchema(raw_read_fields);
     ASSERT_TRUE(read_schema);
 
-    context_builder.SetReadSchema({"k1", "p1", "s1", "s0", "v0", "v1"});
+    context_builder.SetReadFieldNames({"k1", "p1", "s1", "s0", "v0", "v1"});
     context_builder.SetOptions({{Options::SEQUENCE_FIELD, "s0,s1"},
                                 {Options::MERGE_ENGINE, "deduplicate"},
                                 {Options::IGNORE_DELETE, "true"}});
@@ -857,7 +857,7 @@ TEST_P(MergeFileSplitReadTest, TestReadWithAlterTable) {
     auto read_schema = DataField::ConvertDataFieldsToArrowSchema(raw_read_fields);
     ASSERT_TRUE(read_schema);
 
-    context_builder.SetReadSchema({"k1", "k0", "p0", "p1", "s1", "s0", "v0", "v1", "v2"});
+    context_builder.SetReadFieldNames({"k1", "k0", "p0", "p1", "s1", "s0", "v0", "v1", "v2"});
     context_builder.SetOptions({{Options::SEQUENCE_FIELD, "s0,s1"},
                                 {Options::MERGE_ENGINE, "deduplicate"},
                                 {Options::IGNORE_DELETE, "true"}});
@@ -906,7 +906,7 @@ TEST_P(MergeFileSplitReadTest, TestReadWithAlterTableWithReverseSequence) {
     auto read_schema = DataField::ConvertDataFieldsToArrowSchema(raw_read_fields);
     ASSERT_TRUE(read_schema);
 
-    context_builder.SetReadSchema({"v2", "p1", "k0", "p0", "s0", "v0"});
+    context_builder.SetReadFieldNames({"v2", "p1", "k0", "p0", "s0", "v0"});
     context_builder.SetOptions({{Options::SEQUENCE_FIELD, "s0,s1"},
                                 {Options::MERGE_ENGINE, "deduplicate"},
                                 {Options::IGNORE_DELETE, "true"}});
@@ -954,7 +954,7 @@ TEST_P(MergeFileSplitReadTest, TestAggregateMergeEngine) {
     auto read_schema = DataField::ConvertDataFieldsToArrowSchema(raw_read_fields);
     ASSERT_TRUE(read_schema);
 
-    context_builder.SetReadSchema({"k1", "p1", "s1", "v0", "v1"});
+    context_builder.SetReadFieldNames({"k1", "p1", "s1", "v0", "v1"});
     context_builder.SetOptions({{Options::SEQUENCE_FIELD, "s0,s1"},
                                 {Options::MERGE_ENGINE, "aggregation"},
                                 {"fields.v1.aggregate-function", "bool_and"},
@@ -1001,7 +1001,7 @@ TEST_P(MergeFileSplitReadTest, TestPartialUpdateMergeEngine) {
     auto read_schema = DataField::ConvertDataFieldsToArrowSchema(raw_read_fields);
     ASSERT_TRUE(read_schema);
 
-    context_builder.SetReadSchema({"k1", "p1", "s1", "v0"});
+    context_builder.SetReadFieldNames({"k1", "p1", "s1", "v0"});
     context_builder.SetOptions({{Options::SEQUENCE_FIELD, "s0,s1"},
                                 {Options::MERGE_ENGINE, "partial-update"},
                                 {"fields.v1.sequence-group", "v0"},
@@ -1049,7 +1049,7 @@ TEST_P(MergeFileSplitReadTest, TestPartialUpdateMergeEngineWithIgnoreDelete) {
     auto read_schema = DataField::ConvertDataFieldsToArrowSchema(raw_read_fields);
     ASSERT_TRUE(read_schema);
 
-    context_builder.SetReadSchema({"k0", "k1", "v0", "v1", "v2"});
+    context_builder.SetReadFieldNames({"k0", "k1", "v0", "v1", "v2"});
     context_builder.SetOptions(
         {{Options::MERGE_ENGINE, "partial-update"}, {Options::IGNORE_DELETE, "true"}});
     AddOptions(&context_builder);
@@ -1089,7 +1089,7 @@ TEST_P(MergeFileSplitReadTest, TestPartialUpdateMergeEngineWithRemoveRecordOnDel
     auto read_schema = DataField::ConvertDataFieldsToArrowSchema(raw_read_fields);
     ASSERT_TRUE(read_schema);
 
-    context_builder.SetReadSchema({"k0", "k1", "v0", "v1", "v2"});
+    context_builder.SetReadFieldNames({"k0", "k1", "v0", "v1", "v2"});
     context_builder.SetOptions({{Options::MERGE_ENGINE, "partial-update"},
                                 {Options::PARTIAL_UPDATE_REMOVE_RECORD_ON_DELETE, "true"}});
     AddOptions(&context_builder);
@@ -1129,7 +1129,7 @@ TEST_P(MergeFileSplitReadTest, TestEmptyPlan) {
     auto read_schema = DataField::ConvertDataFieldsToArrowSchema(raw_read_fields);
     ASSERT_TRUE(read_schema);
 
-    context_builder.SetReadSchema({"k0", "k1", "v0", "v1", "v2"});
+    context_builder.SetReadFieldNames({"k0", "k1", "v0", "v1", "v2"});
     context_builder.SetOptions({{Options::MERGE_ENGINE, "partial-update"},
                                 {Options::PARTIAL_UPDATE_REMOVE_RECORD_ON_DELETE, "true"}});
     AddOptions(&context_builder);
@@ -1156,7 +1156,7 @@ TEST_P(MergeFileSplitReadTest, TestIOException) {
     auto read_schema = DataField::ConvertDataFieldsToArrowSchema(raw_read_fields);
     ASSERT_TRUE(read_schema);
 
-    context_builder.SetReadSchema({"k1", "p1", "s1", "v0", "v1"});
+    context_builder.SetReadFieldNames({"k1", "p1", "s1", "v0", "v1"});
     context_builder.SetOptions({{Options::SEQUENCE_FIELD, "s0,s1"},
                                 {Options::MERGE_ENGINE, "deduplicate"},
                                 {Options::IGNORE_DELETE, "true"}});
@@ -1210,7 +1210,7 @@ TEST_P(MergeFileSplitReadTest, Test09VersionWithoutInlineFieldId) {
                                               DataField(1, arrow::field("f1", arrow::int32()))};
     auto read_schema = DataField::ConvertDataFieldsToArrowSchema(raw_read_fields);
     ASSERT_TRUE(read_schema);
-    context_builder.SetReadSchema({"f3", "f2", "f0", "f1"});
+    context_builder.SetReadFieldNames({"f3", "f2", "f0", "f1"});
     context_builder.SetOptions({{Options::FILE_FORMAT, "orc"},
                                 {Options::MERGE_ENGINE, "deduplicate"},
                                 {"orc.read.enable-metrics", "true"}});
diff --git a/src/paimon/core/operation/raw_file_split_read_test.cpp b/src/paimon/core/operation/raw_file_split_read_test.cpp
index c97a007a4..96243d9f5 100644
--- a/src/paimon/core/operation/raw_file_split_read_test.cpp
+++ b/src/paimon/core/operation/raw_file_split_read_test.cpp
@@ -133,7 +133,7 @@ class RawFileSplitReadTest : public ::testing::Test {
                            "/orc/multi_partition_append_table.db/"
                            "multi_partition_append_table";
         ReadContextBuilder context_builder(path);
-        context_builder.SetReadSchema(read_schema->field_names());
+        context_builder.SetReadFieldNames(read_schema->field_names());
         ASSERT_OK_AND_ASSIGN(std::unique_ptr<ReadContext> read_context, context_builder.Finish());
         SchemaManager schema_manager(std::make_shared<LocalFileSystem>(), read_context->GetPath());
         ASSERT_OK_AND_ASSIGN(auto table_schema, schema_manager.ReadSchema(0));
@@ -428,7 +428,7 @@ TEST_F(RawFileSplitReadTest, TestMatch) {
     std::string path = paimon::test::GetDataDir() +
                        "/orc/pk_table_with_total_buckets.db/pk_table_with_total_buckets";
     ReadContextBuilder context_builder(path);
-    context_builder.SetReadSchema({"f0", "f1", "f2", "f3"});
+    context_builder.SetReadFieldNames({"f0", "f1", "f2", "f3"});
     ASSERT_OK_AND_ASSIGN(std::unique_ptr<ReadContext> read_context, context_builder.Finish());
     SchemaManager schema_manager(std::make_shared<LocalFileSystem>(), read_context->GetPath());
     ASSERT_OK_AND_ASSIGN(auto table_schema, schema_manager.ReadSchema(0));
diff --git a/src/paimon/core/operation/read_context.cpp b/src/paimon/core/operation/read_context.cpp
index 4ccefb8b0..c2c7306a5 100644
--- a/src/paimon/core/operation/read_context.cpp
+++ b/src/paimon/core/operation/read_context.cpp
@@ -18,6 +18,8 @@
 
 #include <utility>
 
+#include "arrow/c/abi.h"
+#include "arrow/c/bridge.h"
 #include "paimon/common/utils/path_util.h"
 #include "paimon/core/utils/branch_manager.h"
 #include "paimon/executor.h"
@@ -59,7 +61,20 @@ ReadContext::ReadContext(
       cache_config_(cache_config),
       cache_(cache) {}
 
-ReadContext::~ReadContext() = default;
+ReadContext::~ReadContext() {
+    if (has_read_schema_ && read_schema_c_.release) {  // a schema was adopted and is still live
+        read_schema_c_.release(&read_schema_c_);  // give it back through its own release callback
+    }
+}
+
+void ReadContext::SetReadSchema(ArrowSchema* schema) {
+    if (!schema || !schema->release) return;  // null or already-released input: nothing to adopt
+    // Free a schema adopted by an earlier call; overwriting it below would leak its resources.
+    if (has_read_schema_ && read_schema_c_.release) read_schema_c_.release(&read_schema_c_);
+    read_schema_c_ = *schema;   // take over the struct's contents (ownership moves to *this)
+    schema->release = nullptr;  // mark the caller's struct as moved-from
+    has_read_schema_ = true;
+}
 
 class ReadContextBuilder::Impl {
  public:
@@ -68,6 +83,10 @@ class ReadContextBuilder::Impl {
         branch_ = BranchManager::DEFAULT_MAIN_BRANCH;
         read_field_names_.clear();
         read_field_ids_.clear();
+        if (projected_c_schema_.release) {
+            projected_c_schema_.release(&projected_c_schema_);
+        }
+        projected_c_schema_ = {};
         fs_scheme_to_identifier_map_.clear();
         options_.clear();
         predicate_.reset();
@@ -91,6 +110,7 @@ class ReadContextBuilder::Impl {
     std::string branch_ = BranchManager::DEFAULT_MAIN_BRANCH;
     std::vector<std::string> read_field_names_;
     std::vector<int32_t> read_field_ids_;
+    ArrowSchema projected_c_schema_{};
     std::map<std::string, std::string> fs_scheme_to_identifier_map_;
     std::map<std::string, std::string> options_;
     std::shared_ptr<Predicate> predicate_;
@@ -130,7 +150,7 @@ ReadContextBuilder& ReadContextBuilder::SetOptions(const std::map<std::string, s
     return *this;
 }
 
-ReadContextBuilder& ReadContextBuilder::SetReadSchema(
+ReadContextBuilder& ReadContextBuilder::SetReadFieldNames(
     const std::vector<std::string>& read_field_names) {
     impl_->read_field_names_ = read_field_names;
     return *this;
@@ -142,6 +162,22 @@ ReadContextBuilder& ReadContextBuilder::SetReadFieldIds(
     return *this;
 }
 
+ReadContextBuilder& ReadContextBuilder::SetReadSchema(ArrowSchema* projected_schema) {
+    if (projected_schema && projected_schema->release) {  // null/released input is a no-op
+        // Import consumes the input C schema, then export a fresh copy into our member.
+        auto import_result = arrow::ImportSchema(projected_schema);
+        if (import_result.ok()) {  // NOTE(review): an import error is dropped without a report
+            // Release any previously held schema.
+            if (impl_->projected_c_schema_.release) {
+                impl_->projected_c_schema_.release(&impl_->projected_c_schema_);
+            }
+            impl_->projected_c_schema_ = {};  // NOTE(review): export status below is discarded
+            (void)arrow::ExportSchema(*import_result.ValueUnsafe(), &impl_->projected_c_schema_);
+        }
+    }
+    return *this;
+}
+
 ReadContextBuilder& ReadContextBuilder::SetPredicate(const std::shared_ptr<Predicate>& predicate) {
     impl_->predicate_ = predicate;
     return *this;
@@ -262,6 +298,9 @@ Result<std::unique_ptr<ReadContext>> ReadContextBuilder::Finish() {
         impl_->table_schema_, impl_->memory_pool_, impl_->executor_, impl_->specific_file_system_,
         impl_->fs_scheme_to_identifier_map_, impl_->options_, impl_->prefetch_cache_mode_,
         impl_->cache_config_, impl_->cache_);
+    if (impl_->projected_c_schema_.release) {
+        ctx->SetReadSchema(&impl_->projected_c_schema_);
+    }
     impl_->Reset();
     return ctx;
 }
diff --git a/src/paimon/core/operation/read_context_test.cpp b/src/paimon/core/operation/read_context_test.cpp
index 33df00338..20e825b66 100644
--- a/src/paimon/core/operation/read_context_test.cpp
+++ b/src/paimon/core/operation/read_context_test.cpp
@@ -35,7 +35,7 @@ TEST(ReadContextTest, TestDefaultValue) {
     ASSERT_EQ(ctx->GetPath(), "table_root_path");
     ASSERT_TRUE(ctx->GetMemoryPool());
     ASSERT_TRUE(ctx->GetExecutor());
-    ASSERT_TRUE(ctx->GetReadSchema().empty());
+    ASSERT_TRUE(ctx->GetReadFieldNames().empty());
     ASSERT_TRUE(ctx->GetReadFieldIds().empty());
     ASSERT_TRUE(ctx->GetOptions().empty());
     ASSERT_FALSE(ctx->GetPredicate());
@@ -59,7 +59,7 @@ TEST(ReadContextTest, TestSetContent) {
                              /*hole_size_limit=*/128, /*pre_buffer_limit=*/2048);
 
     builder.AddOption("key", "value");
-    builder.SetReadSchema({"f1", "f2"});
+    builder.SetReadFieldNames({"f1", "f2"});
     builder.SetReadFieldIds({0, 1});
     auto predicate =
         PredicateBuilder::IsNull(/*field_index=*/0, /*field_name=*/"f1", FieldType::INT);
@@ -86,7 +86,7 @@ TEST(ReadContextTest, TestSetContent) {
     ASSERT_EQ(ctx->GetPath(), "table_root_path");
     ASSERT_TRUE(ctx->GetMemoryPool());
     ASSERT_TRUE(ctx->GetExecutor());
-    ASSERT_EQ(ctx->GetReadSchema(), std::vector<std::string>({"f1", "f2"}));
+    ASSERT_EQ(ctx->GetReadFieldNames(), std::vector<std::string>({"f1", "f2"}));
     ASSERT_EQ(ctx->GetReadFieldIds(), std::vector<int32_t>({0, 1}));
     ASSERT_EQ(*predicate, *(ctx->GetPredicate()));
     ASSERT_TRUE(ctx->EnablePredicateFilter());
diff --git a/src/paimon/core/table/source/table_read_test.cpp b/src/paimon/core/table/source/table_read_test.cpp
index 762e9362c..03c3c82c1 100644
--- a/src/paimon/core/table/source/table_read_test.cpp
+++ b/src/paimon/core/table/source/table_read_test.cpp
@@ -41,7 +41,7 @@ TEST(TableReadTest, TestReadWithInvalidContext) {
     {
         // read with non-exist field
         ReadContextBuilder context_builder(path);
-        context_builder.SetReadSchema({"f0", "f1", "non-exist"});
+        context_builder.SetReadFieldNames({"f0", "f1", "non-exist"});
         ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish());
         ASSERT_NOK_WITH_MSG(TableRead::Create(std::move(read_context)),
                             "Get field non-exist failed: not exist in table schema");
@@ -72,7 +72,7 @@ TEST(TableReadTest, TestReadWithInvalidContext) {
         auto predicate = PredicateBuilder::Equal(/*field_index=*/2, /*field_name=*/"f3",
                                                  FieldType::DOUBLE, Literal(15.0));
         ReadContextBuilder context_builder(path);
-        context_builder.SetReadSchema({"f3", "f0", "f1"});
+        context_builder.SetReadFieldNames({"f3", "f0", "f1"});
         context_builder.SetPredicate(predicate);
         ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish());
         ASSERT_NOK_WITH_MSG(
@@ -92,7 +92,7 @@ TEST(TableReadTest, TestReadWithInvalidContext) {
     {
         // schema with duplicate field f3
         ReadContextBuilder context_builder(path);
-        context_builder.SetReadSchema({"f3", "f1", "f3"});
+        context_builder.SetReadFieldNames({"f3", "f1", "f3"});
         ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish());
         ASSERT_NOK_WITH_MSG(TableRead::Create(std::move(read_context)),
                             "validate schema failed: read schema has duplicate field f3");
@@ -102,7 +102,7 @@ TEST(TableReadTest, TestReadWithInvalidContext) {
 TEST(TableReadTest, TestReadWithSpecifiedInvalidSchema) {
     std::string path = paimon::test::GetDataDir() + "/orc/append_09.db/append_09";
     ReadContextBuilder context_builder(path);
-    context_builder.SetReadSchema({"field_no_exist"});
+    context_builder.SetReadFieldNames({"field_no_exist"});
     ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish());
     ASSERT_NOK_WITH_MSG(TableRead::Create(std::move(read_context)),
                         "Get field field_no_exist failed: not exist in table schema");
@@ -112,7 +112,7 @@ TEST(TableReadTest, TestCreateKeyValueTableRead) {
     std::string path = paimon::test::GetDataDir() +
                        "/orc/pk_table_with_dv_cardinality.db/pk_table_with_dv_cardinality/";
     ReadContextBuilder context_builder(path);
-    context_builder.SetReadSchema({"f0", "f1", "f2", "f3"});
+    context_builder.SetReadFieldNames({"f0", "f1", "f2", "f3"});
     context_builder.AddOption("read.batch-size", "2");
     context_builder.AddOption("orc.read.enable-lazy-decoding", "true");
     ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish());
@@ -124,7 +124,7 @@ TEST(TableReadTest, TestCreateKeyValueTableRead) {
 TEST(TableReadTest, TestCreateAppendOnlyTableRead) {
     std::string path = paimon::test::GetDataDir() + "/orc/append_09.db/append_09";
     ReadContextBuilder context_builder(path);
-    context_builder.SetReadSchema({"f0", "f1", "f2", "f3"});
+    context_builder.SetReadFieldNames({"f0", "f1", "f2", "f3"});
     context_builder.AddOption("read.batch-size", "2");
     context_builder.AddOption("orc.read.enable-lazy-decoding", "true");
     ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish());
@@ -136,7 +136,7 @@ TEST(TableReadTest, TestCreateAppendOnlyTableRead) {
 TEST(TableReadTest, TestMergeOptions) {
     std::string path = paimon::test::GetDataDir() + "/orc/append_09.db/append_09";
     ReadContextBuilder context_builder(path);
-    context_builder.SetReadSchema({"f0", "f1", "f2", "f3"});
+    context_builder.SetReadFieldNames({"f0", "f1", "f2", "f3"});
     context_builder.AddOption("read.batch-size", "2");
     context_builder.AddOption("orc.read.enable-lazy-decoding", "true");
     context_builder.AddOption("bucket", "10");
diff --git a/src/paimon/core/table/system/audit_log_system_table.cpp b/src/paimon/core/table/system/audit_log_system_table.cpp
index 668436430..488ec4796 100644
--- a/src/paimon/core/table/system/audit_log_system_table.cpp
+++ b/src/paimon/core/table/system/audit_log_system_table.cpp
@@ -284,7 +284,7 @@ Result<std::unique_ptr<TableRead>> AuditLogSystemTable::NewChangelogRead(
     PAIMON_ASSIGN_OR_RAISE(StringMap read_options, ReadOptions());
     PAIMON_ASSIGN_OR_RAISE(CoreOptions core_options, CoreOptions::FromMap(read_options));
     builder.SetOptions(read_options)
-        .SetReadSchema(base_read_schema->field_names())
+        .SetReadFieldNames(base_read_schema->field_names())
         .WithBranch(core_options.GetBranch())
         .WithMemoryPool(context->GetMemoryPool())
         .WithExecutor(context->GetExecutor())
diff --git a/src/paimon/core/utils/field_mapping.cpp b/src/paimon/core/utils/field_mapping.cpp
index e24ee7277..38bf4b68d 100644
--- a/src/paimon/core/utils/field_mapping.cpp
+++ b/src/paimon/core/utils/field_mapping.cpp
@@ -27,6 +27,7 @@
 #include "paimon/common/utils/object_utils.h"
 #include "paimon/core/casting/cast_executor_factory.h"
 #include "paimon/core/casting/casting_utils.h"
+#include "paimon/core/utils/nested_projection_utils.h"
 #include "paimon/defs.h"
 #include "paimon/predicate/literal.h"
 #include "paimon/predicate/predicate_builder.h"
@@ -72,8 +73,8 @@ Result<std::unique_ptr<FieldMapping>> FieldMappingBuilder::CreateFieldMapping(
     // generate non-exist field info
     std::optional<NonExistFieldInfo> non_exist_field_info = CreateNonExistFieldInfo(data_fields);
 
-    // generate exist field info
-    ExistFieldInfo exist_field_info = CreateExistFieldInfo(data_fields);
+    // generate exist field info (includes nested type pruning)
+    PAIMON_ASSIGN_OR_RAISE(ExistFieldInfo exist_field_info, CreateExistFieldInfo(data_fields));
 
     // key: partition key, value: partition idx
     std::map<std::string, int32_t> partition_key_to_idx =
@@ -87,7 +88,7 @@ Result<std::unique_ptr<FieldMapping>> FieldMappingBuilder::CreateFieldMapping(
     return std::make_unique<FieldMapping>(partition_info, non_partition_info, non_exist_field_info);
 }
 
-ExistFieldInfo FieldMappingBuilder::CreateExistFieldInfo(
+Result<ExistFieldInfo> FieldMappingBuilder::CreateExistFieldInfo(
     const std::vector<DataField>& data_fields) const {
     // key:field id, value: {target_idx, read field}
     std::map<int32_t, std::pair<int32_t, DataField>> field_id_to_read_fields;
@@ -101,8 +102,22 @@ ExistFieldInfo FieldMappingBuilder::CreateExistFieldInfo(
         auto iter = field_id_to_read_fields.find(data_field.Id());
         if (iter != field_id_to_read_fields.end()) {
             const auto& [target_idx, read_field] = iter->second;
+
+            // Recursively prune nested types in data_field to match read_field's
+            // projection. For atomic types this is a no-op.
+            PAIMON_ASSIGN_OR_RAISE(
+                std::optional<std::shared_ptr<arrow::DataType>> pruned_type,
+                PruneDataType(read_field.Type(), data_field.Type()));
+            if (!pruned_type.has_value()) {
+                // All sub-fields pruned away — treat as non-existent.
+                continue;
+            }
+
+            DataField pruned_data_field(data_field.Id(),
+                                        data_field.ArrowField()->WithType(pruned_type.value()),
+                                        data_field.Description());
             exist_field_info.exist_read_schema.push_back(read_field);
-            exist_field_info.exist_data_schema.push_back(data_field);
+            exist_field_info.exist_data_schema.push_back(pruned_data_field);
             exist_field_info.idx_in_target_read_schema.push_back(target_idx);
         }
     }
@@ -146,7 +161,11 @@ Result<std::vector<std::shared_ptr<CastExecutor>>> FieldMappingBuilder::CreateDa
         if (!read_fields[i].Type()->Equals(data_fields[i].Type())) {
             if (read_type == FieldType::MAP || read_type == FieldType::ARRAY ||
                 read_type == FieldType::STRUCT) {
-                return Status::Invalid("Only support column type evolution in atomic data type.");
+                // Nested types may differ due to nested column pruning (different
+                // number of sub-fields). No cast is needed — pruning is handled
+                // separately by PruneDataType / PruneArray.
+                cast_executors.push_back(nullptr);
+                continue;
             }
             auto executor_factory = CastExecutorFactory::GetCastExecutorFactory();
             auto cast_executor =
diff --git a/src/paimon/core/utils/field_mapping.h b/src/paimon/core/utils/field_mapping.h
index 0c0abc04b..4b1d3912f 100644
--- a/src/paimon/core/utils/field_mapping.h
+++ b/src/paimon/core/utils/field_mapping.h
@@ -80,7 +80,7 @@ class FieldMappingBuilder {
 
     std::optional<NonExistFieldInfo> CreateNonExistFieldInfo(
         const std::vector<DataField>& data_fields) const;
-    ExistFieldInfo CreateExistFieldInfo(const std::vector<DataField>& data_fields) const;
+    Result<ExistFieldInfo> CreateExistFieldInfo(const std::vector<DataField>& data_fields) const;
 
     Result<NonPartitionInfo> CreateNonPartitionInfo(
         const std::vector<DataField>& data_fields, const ExistFieldInfo& exist_field_info,
diff --git a/src/paimon/core/utils/nested_projection_utils.cpp b/src/paimon/core/utils/nested_projection_utils.cpp
new file mode 100644
index 000000000..f06886611
--- /dev/null
+++ b/src/paimon/core/utils/nested_projection_utils.cpp
@@ -0,0 +1,175 @@
+/*
+ * Copyright 2024-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "paimon/core/utils/nested_projection_utils.h"
+
+#include <string>
+#include <utility>
+
+#include "arrow/array/array_nested.h"
+#include "arrow/type.h"
+#include "fmt/format.h"
+#include "paimon/status.h"
+
+namespace paimon {
+
+Result<std::optional<std::shared_ptr<arrow::DataType>>> PruneDataType(
+    const std::shared_ptr<arrow::DataType>& read_type,
+    const std::shared_ptr<arrow::DataType>& data_type) {
+    // Fast path: identical types need no pruning, so data_type is returned untouched.
+    if (read_type->Equals(data_type)) {
+        return std::optional<std::shared_ptr<arrow::DataType>>(data_type);
+    }
+
+    switch (read_type->id()) {
+        case arrow::Type::STRUCT: {
+            arrow::FieldVector pruned_fields;
+            for (const auto& read_child : read_type->fields()) {
+                int32_t read_child_id = GetPaimonFieldId(read_child);
+                std::shared_ptr<arrow::Field> data_child =
+                    FindFieldByPaimonId(data_type, read_child_id);
+                if (!data_child) {
+                    // Schema Evolution: field not present in data, skip.
+                    continue;
+                }
+                PAIMON_ASSIGN_OR_RAISE(
+                    std::optional<std::shared_ptr<arrow::DataType>> pruned_child_type,
+                    PruneDataType(read_child->type(), data_child->type()));
+                if (!pruned_child_type.has_value()) {
+                    // All sub-fields of this child were pruned away; skip it.
+                    continue;
+                }
+                pruned_fields.push_back(data_child->WithType(pruned_child_type.value()));
+            }
+            if (pruned_fields.empty()) {
+                // All fields pruned — return nullopt so the caller can skip this field.
+                return std::optional<std::shared_ptr<arrow::DataType>>(std::nullopt);
+            }
+            return std::optional<std::shared_ptr<arrow::DataType>>(arrow::struct_(pruned_fields));
+        }
+
+        case arrow::Type::LIST: {
+            const auto& read_list = static_cast<const arrow::ListType&>(*read_type);
+            const auto& data_list = static_cast<const arrow::ListType&>(*data_type);
+            PAIMON_ASSIGN_OR_RAISE(
+                std::optional<std::shared_ptr<arrow::DataType>> pruned_elem,
+                PruneDataType(read_list.value_type(), data_list.value_type()));
+            if (!pruned_elem.has_value()) {
+                return std::optional<std::shared_ptr<arrow::DataType>>(std::nullopt);
+            }
+            std::shared_ptr<arrow::DataType> result_type = arrow::list(
+                arrow::field(data_list.value_field()->name(), pruned_elem.value(),
+                             data_list.value_field()->nullable(),
+                             data_list.value_field()->metadata()));
+            return std::optional<std::shared_ptr<arrow::DataType>>(std::move(result_type));
+        }
+
+        case arrow::Type::MAP: {
+            const auto& read_map = static_cast<const arrow::MapType&>(*read_type);
+            const auto& data_map = static_cast<const arrow::MapType&>(*data_type);
+            PAIMON_ASSIGN_OR_RAISE(
+                std::optional<std::shared_ptr<arrow::DataType>> pruned_key,
+                PruneDataType(read_map.key_type(), data_map.key_type()));
+            PAIMON_ASSIGN_OR_RAISE(
+                std::optional<std::shared_ptr<arrow::DataType>> pruned_value,
+                PruneDataType(read_map.item_type(), data_map.item_type()));
+            if (!pruned_key.has_value() || !pruned_value.has_value()) {
+                return std::optional<std::shared_ptr<arrow::DataType>>(std::nullopt);
+            }
+            // arrow::map's 3rd argument is keys_sorted; the item field is carried over like LIST's.
+            std::shared_ptr<arrow::DataType> result_type = arrow::map(
+                pruned_key.value(), data_map.item_field()->WithType(pruned_value.value()),
+                data_map.keys_sorted());
+            return std::optional<std::shared_ptr<arrow::DataType>>(std::move(result_type));
+        }
+
+        default:
+            // Atomic type: data_type is kept as-is; type evolution is CastExecutor's job.
+            return std::optional<std::shared_ptr<arrow::DataType>>(data_type);
+    }
+}
+
+// PruneArray: fallback for format readers that return extra nested columns. Rebuilds `array`
+// keeping only the struct children named in `target_type` (matched by name), recursing
+// through LIST values and MAP keys/items; arrays of any other target type pass through.
+
+Result<std::shared_ptr<arrow::Array>> PruneArray(
+    const std::shared_ptr<arrow::Array>& array,
+    const std::shared_ptr<arrow::DataType>& target_type) {
+    if (!array || array->type()->Equals(target_type)) {
+        return array;
+    }
+
+    switch (target_type->id()) {
+        case arrow::Type::STRUCT: {
+            auto struct_array = std::static_pointer_cast<arrow::StructArray>(array);
+            arrow::ArrayVector pruned_children;
+            arrow::FieldVector pruned_fields;
+            for (const auto& target_field : target_type->fields()) {
+                std::shared_ptr<arrow::Array> child =
+                    struct_array->GetFieldByName(target_field->name());
+                if (!child) {
+                    return Status::Invalid(fmt::format(
+                        "PruneArray: field '{}' not found in struct array", target_field->name()));
+                }
+                PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> pruned_child,
+                                       PruneArray(child, target_field->type()));
+                pruned_children.push_back(std::move(pruned_child));
+                pruned_fields.push_back(target_field);
+            }
+            PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(
+                std::shared_ptr<arrow::StructArray> result_struct,
+                arrow::StructArray::Make(pruned_children, pruned_fields,
+                                         struct_array->null_bitmap(),
+                                         struct_array->null_count(), struct_array->offset()));
+            return std::static_pointer_cast<arrow::Array>(result_struct);
+        }
+
+        case arrow::Type::LIST: {
+            auto list_array = std::static_pointer_cast<arrow::ListArray>(array);
+            const auto& target_elem_type =
+                static_cast<const arrow::ListType&>(*target_type).value_type();
+            PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> pruned_values,
+                                   PruneArray(list_array->values(), target_elem_type));
+            PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(
+                std::shared_ptr<arrow::ListArray> result_list,
+                arrow::ListArray::FromArrays(
+                    *list_array->offsets(), *pruned_values, arrow::default_memory_pool(),
+                    list_array->null_bitmap(), list_array->null_count()));
+            return std::static_pointer_cast<arrow::Array>(result_list);
+        }
+
+        case arrow::Type::MAP: {
+            auto map_array = std::static_pointer_cast<arrow::MapArray>(array);
+            const auto& target_map_type = static_cast<const arrow::MapType&>(*target_type);
+            PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> pruned_keys,
+                                   PruneArray(map_array->keys(), target_map_type.key_type()));
+            PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> pruned_items,
+                                   PruneArray(map_array->items(), target_map_type.item_type()));
+            PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(
+                std::shared_ptr<arrow::Array> result_map,
+                arrow::MapArray::FromArrays(map_array->offsets(), pruned_keys, pruned_items,
+                                            arrow::default_memory_pool(),
+                                            map_array->null_bitmap()));  // keep null maps null
+            return result_map;
+        }
+
+        default:  // Atomic type — no pruning needed.
+            return array;
+    }
+}
+
+}  // namespace paimon
diff --git a/src/paimon/core/utils/nested_projection_utils.h b/src/paimon/core/utils/nested_projection_utils.h
new file mode 100644
index 000000000..7bc798a6b
--- /dev/null
+++ b/src/paimon/core/utils/nested_projection_utils.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright 2024-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <optional>
+#include <string>
+#include <vector>
+
+#include "arrow/type.h"
+#include "paimon/common/types/data_field.h"
+#include "paimon/common/utils/string_utils.h"
+#include "paimon/result.h"
+
+namespace paimon {
+
+/// Look up the paimon field ID kept under DataField::FIELD_ID in an Arrow field's metadata.
+/// Yields -1 for a null field, absent metadata or key, or when the value gives no int32.
+inline int32_t GetPaimonFieldId(const std::shared_ptr<arrow::Field>& field) {
+    constexpr int32_t kNoFieldId = -1;
+    const bool has_metadata = field && field->HasMetadata() && field->metadata();
+    if (!has_metadata) return kNoFieldId;
+    // The ID is stored as text, so it is fetched first and converted afterwards.
+    auto id_text = field->metadata()->Get(DataField::FIELD_ID);
+    if (!id_text.ok()) return kNoFieldId;
+    // An empty optional (presumably an unparsable value) maps to the sentinel as well.
+    std::optional<int32_t> parsed = StringUtils::StringToValue<int32_t>(id_text.ValueUnsafe());
+    return parsed.value_or(kNoFieldId);
+}
+
+/// Locate the child of a STRUCT DataType whose paimon field ID equals `field_id`.
+/// Yields nullptr for a null or non-STRUCT type, or when no child carries that ID.
+inline std::shared_ptr<arrow::Field> FindFieldByPaimonId(
+    const std::shared_ptr<arrow::DataType>& struct_type, int32_t field_id) {
+    const bool is_struct = struct_type && struct_type->id() == arrow::Type::STRUCT;
+    if (is_struct) {
+        for (int idx = 0; idx < struct_type->num_fields(); ++idx) {
+            if (GetPaimonFieldId(struct_type->field(idx)) == field_id) {
+                return struct_type->field(idx);  // first child with a matching ID wins
+            }
+        }
+    }
+    return nullptr;
+}
+
+/// Recursively prune `data_type` so that only the sub-fields requested by
+/// `read_type` are retained. STRUCT children are matched by paimon field ID to
+/// support schema evolution (field renames).
+///
+/// Supported nesting: STRUCT, LIST (element recurse), MAP (key/value recurse).
+/// Identical types and atomic types return `data_type` as-is.
+///
+/// Returns std::nullopt when a STRUCT keeps no sub-field; an enclosing LIST or MAP
+/// passes that on (caller should skip the field, mirroring Java's null return).
+Result<std::optional<std::shared_ptr<arrow::DataType>>> PruneDataType(
+    const std::shared_ptr<arrow::DataType>& read_type,
+    const std::shared_ptr<arrow::DataType>& data_type);
+
+/// Rebuild `array` so nested structs keep only the children listed in `target_type`
+/// (looked up by name), recursing through LIST values and MAP keys/items. Used as a
+/// fallback when the format reader returns more columns than requested.
+Result<std::shared_ptr<arrow::Array>> PruneArray(
+    const std::shared_ptr<arrow::Array>& array,
+    const std::shared_ptr<arrow::DataType>& target_type);
+
+}  // namespace paimon
diff --git a/src/paimon/core/utils/nested_projection_utils_test.cpp b/src/paimon/core/utils/nested_projection_utils_test.cpp
new file mode 100644
index 000000000..f66932885
--- /dev/null
+++ b/src/paimon/core/utils/nested_projection_utils_test.cpp
@@ -0,0 +1,212 @@
+/*
+ * Copyright 2024-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "paimon/core/utils/nested_projection_utils.h"
+
+#include "arrow/array/array_nested.h"
+#include "arrow/array/builder_nested.h"
+#include "arrow/array/builder_primitive.h"
+#include "arrow/type.h"
+#include "gtest/gtest.h"
+#include "paimon/common/types/data_field.h"
+#include "paimon/testing/utils/testharness.h"
+
+namespace paimon::test {
+
+// Test helper: builds an arrow::Field that carries `paimon_id` as its paimon field ID.
+static std::shared_ptr<arrow::Field> MakeField(const std::string& name,
+                                                const std::shared_ptr<arrow::DataType>& type,
+                                                int32_t paimon_id) {
+    // Wrap the plain field in a DataField so the conversion attaches the ID metadata.
+    return DataField::ConvertDataFieldToArrowField(DataField(paimon_id, arrow::field(name, type)));
+}
+
+// ============== GetPaimonFieldId ==============
+
+TEST(NestedProjectionUtilsTest, GetPaimonFieldId_Present) {
+    const auto tagged_field = MakeField("col", arrow::int32(), 42);  // carries ID 42
+    ASSERT_EQ(GetPaimonFieldId(tagged_field), 42);
+}
+
+TEST(NestedProjectionUtilsTest, GetPaimonFieldId_Missing) {
+    const auto untagged_field = arrow::field("col", arrow::int32());  // no ID metadata
+    ASSERT_EQ(GetPaimonFieldId(untagged_field), -1);
+}
+
+TEST(NestedProjectionUtilsTest, GetPaimonFieldId_Nullptr) {
+    ASSERT_EQ(GetPaimonFieldId(nullptr), -1);  // A null field reports the "no ID" value -1.
+}
+
+// ============== FindFieldByPaimonId ==============
+
+TEST(NestedProjectionUtilsTest, FindFieldByPaimonId_Found) {
+    const auto x_field = MakeField("x", arrow::int32(), 1);
+    const auto y_field = MakeField("y", arrow::utf8(), 2);
+    const auto match = FindFieldByPaimonId(arrow::struct_({x_field, y_field}), 2);
+    ASSERT_NE(match, nullptr);
+    ASSERT_EQ(match->name(), "y");
+}
+
+TEST(NestedProjectionUtilsTest, FindFieldByPaimonId_NotFound) {
+    const auto only_x = arrow::struct_({MakeField("x", arrow::int32(), 1)});
+    ASSERT_EQ(FindFieldByPaimonId(only_x, 99), nullptr);  // no child carries ID 99
+}
+
+TEST(NestedProjectionUtilsTest, FindFieldByPaimonId_NonStruct) {
+    ASSERT_EQ(FindFieldByPaimonId(arrow::int32(), 1), nullptr);  // Non-STRUCT input: nullptr.
+}
+
+// ============== PruneDataType ==============
+
+TEST(NestedProjectionUtilsTest, PruneDataType_IdenticalTypes) {
+    const auto same_type = arrow::int32();  // used as both read type and data type
+    ASSERT_OK_AND_ASSIGN(auto pruned, PruneDataType(same_type, same_type));
+    ASSERT_TRUE(pruned.has_value());
+    ASSERT_TRUE(pruned.value()->Equals(same_type));
+}
+
+TEST(NestedProjectionUtilsTest, PruneDataType_AtomicType) {
+    // Mismatched atomic types: the data side wins and is handed back untouched.
+    const auto stored = arrow::int32();
+    const auto requested = arrow::int64();
+    ASSERT_OK_AND_ASSIGN(auto pruned, PruneDataType(requested, stored));
+    ASSERT_TRUE(pruned.has_value());
+    ASSERT_TRUE(pruned.value()->Equals(stored));
+}
+
+TEST(NestedProjectionUtilsTest, PruneDataType_StructPruneSubset) {
+    // stored:    STRUCT<x:INT(id=1), y:STRING(id=2), z:DOUBLE(id=3)>
+    // requested: STRUCT<x:INT(id=1)>
+    // pruned:    STRUCT<x:INT(id=1)>
+    const auto requested = arrow::struct_({MakeField("x", arrow::int32(), 1)});
+    const auto stored = arrow::struct_({MakeField("x", arrow::int32(), 1),
+                                        MakeField("y", arrow::utf8(), 2),
+                                        MakeField("z", arrow::float64(), 3)});
+
+    ASSERT_OK_AND_ASSIGN(auto pruned, PruneDataType(requested, stored));
+    ASSERT_TRUE(pruned.has_value());
+    ASSERT_EQ(pruned.value()->num_fields(), 1);
+    ASSERT_EQ(pruned.value()->field(0)->name(), "x");
+}
+
+TEST(NestedProjectionUtilsTest, PruneDataType_StructAllFieldsPruned) {
+    // stored:    STRUCT<x:INT(id=1)>
+    // requested: STRUCT<y:INT(id=99)>, which matches no stored child
+    // pruned:    nullopt
+    const auto requested = arrow::struct_({MakeField("y", arrow::int32(), 99)});
+    const auto stored = arrow::struct_({MakeField("x", arrow::int32(), 1)});
+
+    ASSERT_OK_AND_ASSIGN(auto pruned, PruneDataType(requested, stored));
+    ASSERT_FALSE(pruned.has_value());
+}
+
+TEST(NestedProjectionUtilsTest, PruneDataType_NestedStruct) {
+    // stored:    STRUCT<inner:STRUCT<a:INT(id=10), b:STRING(id=11)>(id=1)>
+    // requested: STRUCT<inner:STRUCT<a:INT(id=10)>(id=1)>
+    // pruned:    STRUCT<inner:STRUCT<a:INT(id=10)>(id=1)>
+    const auto stored_inner = arrow::struct_({MakeField("a", arrow::int32(), 10),
+                                              MakeField("b", arrow::utf8(), 11)});
+    const auto requested_inner = arrow::struct_({MakeField("a", arrow::int32(), 10)});
+
+    const auto stored = arrow::struct_({MakeField("inner", stored_inner, 1)});
+    const auto requested = arrow::struct_({MakeField("inner", requested_inner, 1)});
+
+    ASSERT_OK_AND_ASSIGN(auto pruned, PruneDataType(requested, stored));
+    ASSERT_TRUE(pruned.has_value());
+    ASSERT_EQ(pruned.value()->num_fields(), 1);
+    const auto inner_type = pruned.value()->field(0)->type();
+    ASSERT_EQ(inner_type->num_fields(), 1);
+    ASSERT_EQ(inner_type->field(0)->name(), "a");
+}
+
+TEST(NestedProjectionUtilsTest, PruneDataType_ListWithStructElement) {
+    // stored:    LIST<STRUCT<a:INT(id=10), b:STRING(id=11)>>
+    // requested: LIST<STRUCT<a:INT(id=10)>>
+    const auto stored_element = arrow::struct_({MakeField("a", arrow::int32(), 10),
+                                                MakeField("b", arrow::utf8(), 11)});
+    const auto requested_element = arrow::struct_({MakeField("a", arrow::int32(), 10)});
+
+    const auto stored = arrow::list(arrow::field("item", stored_element));
+    const auto requested = arrow::list(arrow::field("item", requested_element));
+
+    ASSERT_OK_AND_ASSIGN(auto pruned, PruneDataType(requested, stored));
+    ASSERT_TRUE(pruned.has_value());
+    const auto as_list = std::dynamic_pointer_cast<arrow::ListType>(pruned.value());
+    ASSERT_NE(as_list, nullptr);
+    ASSERT_EQ(as_list->value_type()->num_fields(), 1);
+    ASSERT_EQ(as_list->value_type()->field(0)->name(), "a");
+}
+
+TEST(NestedProjectionUtilsTest, PruneDataType_MapWithStructValue) {
+    // stored:    MAP<STRING, STRUCT<a:INT(id=10), b:STRING(id=11)>>
+    // requested: MAP<STRING, STRUCT<a:INT(id=10)>>
+    const auto stored_value = arrow::struct_({MakeField("a", arrow::int32(), 10),
+                                              MakeField("b", arrow::utf8(), 11)});
+    const auto requested_value = arrow::struct_({MakeField("a", arrow::int32(), 10)});
+
+    const auto stored = arrow::map(arrow::utf8(), stored_value);
+    const auto requested = arrow::map(arrow::utf8(), requested_value);
+
+    ASSERT_OK_AND_ASSIGN(auto pruned, PruneDataType(requested, stored));
+    ASSERT_TRUE(pruned.has_value());
+    const auto as_map = std::dynamic_pointer_cast<arrow::MapType>(pruned.value());
+    ASSERT_NE(as_map, nullptr);
+    ASSERT_TRUE(as_map->key_type()->Equals(arrow::utf8()));
+    ASSERT_EQ(as_map->item_type()->num_fields(), 1);
+    ASSERT_EQ(as_map->item_type()->field(0)->name(), "a");
+}
+
+// ============== PruneArray ==============
+
+TEST(NestedProjectionUtilsTest, PruneArray_StructPrune) {
+    // Source data: a StructArray with children x:INT and y:STRING, three rows each.
+    arrow::Int32Builder int_builder;
+    ASSERT_TRUE(int_builder.AppendValues({1, 2, 3}).ok());
+    std::shared_ptr<arrow::Array> x_values;
+    ASSERT_TRUE(int_builder.Finish(&x_values).ok());
+
+    arrow::StringBuilder string_builder;
+    ASSERT_TRUE(string_builder.AppendValues({"a", "b", "c"}).ok());
+    std::shared_ptr<arrow::Array> y_values;
+    ASSERT_TRUE(string_builder.Finish(&y_values).ok());
+
+    const auto source_type = arrow::struct_({arrow::field("x", arrow::int32()),
+                                             arrow::field("y", arrow::utf8())});
+    // Assemble the parent struct from the two child arrays.
+    auto made = arrow::StructArray::Make({x_values, y_values}, source_type->fields());
+    ASSERT_TRUE(made.ok());
+    const auto source_array = made.ValueUnsafe();
+
+    // Request only "x"; the "y" child must be dropped.
+    const auto wanted_type = arrow::struct_({arrow::field("x", arrow::int32())});
+    ASSERT_OK_AND_ASSIGN(auto pruned, PruneArray(source_array, wanted_type));
+
+    ASSERT_EQ(pruned->type()->num_fields(), 1);
+    ASSERT_EQ(pruned->type()->field(0)->name(), "x");
+    ASSERT_EQ(pruned->length(), 3);
+}
+
+TEST(NestedProjectionUtilsTest, PruneArray_IdenticalType) {
+    arrow::Int32Builder int_builder;
+    ASSERT_TRUE(int_builder.AppendValues({10, 20}).ok());
+    std::shared_ptr<arrow::Array> original;
+    ASSERT_TRUE(int_builder.Finish(&original).ok());
+    // Target type equals the array's own type, so the very same object must come back.
+    ASSERT_OK_AND_ASSIGN(auto pruned, PruneArray(original, arrow::int32()));
+    ASSERT_EQ(pruned.get(), original.get());
+}
+
+}  // namespace paimon::test
diff --git a/src/paimon/format/parquet/file_reader_wrapper.cpp b/src/paimon/format/parquet/file_reader_wrapper.cpp
index e7d6bf606..118476181 100644
--- a/src/paimon/format/parquet/file_reader_wrapper.cpp
+++ b/src/paimon/format/parquet/file_reader_wrapper.cpp
@@ -408,6 +408,137 @@ void FileReaderWrapper::DispatchPreBuffer(std::vector<::arrow::io::ReadRange> ra
 Status FileReaderWrapper::PrepareForReading(const std::vector<TargetRowGroup>& target_row_groups,
                                             const std::vector<int32_t>& column_indices) {
     try {
+        std::vector<std::pair<uint64_t, uint64_t>> target_row_groups;
+        PAIMON_ASSIGN_OR_RAISE(target_row_groups, GetRowGroupRanges(target_row_group_indices));
+
+        // Build position map: rg_index -> position in target_row_groups (O(1) lookup)
+        std::map<int32_t, uint64_t> rg_idx_to_position;
+        {
+            uint64_t pos = 0;
+            for (int32_t rg_idx : target_row_group_indices) {
+                rg_idx_to_position[rg_idx] = pos++;
+            }
+        }
+
+        // Separate row groups into fully matched (Arrow's standard reader) and partially
+        // matched (page-filtered, per-RG reader constructed on demand in Next()).
+        // Per-RG metadata for the page-filtered path is NOT cached on the wrapper — it's
+        // recomputed on demand in Next() from row_group_row_ranges_ + target_column_indices_,
+        // mirroring how the fully-matched path lets Arrow's FileReader own all metadata.
+        std::vector<int32_t> fully_matched_row_groups;
+        page_filtered_indices_.clear();
+        page_filtered_read_schema_.reset();
+
+        // Page-level byte ranges collected here only for the bulk PreBuffer call below;
+        // discarded once PreBuffer is dispatched.
+        std::vector<::arrow::io::ReadRange> page_filtered_byte_ranges;
+
+        for (int32_t rg_idx : target_row_group_indices) {
+            auto range_it = row_group_row_ranges_.find(rg_idx);
+            if (range_it != row_group_row_ranges_.end()) {
+                uint64_t pos = rg_idx_to_position[rg_idx];
+                page_filtered_indices_.insert(pos);
+
+                // Build the page-filter read_schema once on first encounter — it's identical
+                // across all page-filtered RGs in this session.
+                if (!page_filtered_read_schema_) {
+                    if (external_read_schema_) {
+                        // Use externally provided read schema (handles nested column pruning
+                        // correctly where leaf-column-name inference would fail).
+                        page_filtered_read_schema_ = external_read_schema_;
+                    } else {
+                        std::shared_ptr<arrow::Schema> schema;
+                        PAIMON_RETURN_NOT_OK_FROM_ARROW(file_reader_->GetSchema(&schema));
+                        std::vector<std::shared_ptr<arrow::Field>> fields;
+                        auto parquet_schema = file_reader_->parquet_reader()->metadata()->schema();
+                        for (int32_t col_idx : column_indices) {
+                            const std::string& col_name = parquet_schema->Column(col_idx)->name();
+                            auto field = schema->GetFieldByName(col_name);
+                            if (!field) {
+                                return Status::Invalid(fmt::format(
+                                    "PrepareForReading: Parquet column {} ('{}') has no "
+                                    "matching Arrow field in file schema",
+                                    col_idx, col_name));
+                            }
+                            fields.push_back(field);
+                        }
+                        page_filtered_read_schema_ = arrow::schema(fields);
+                    }
+                }
+
+                auto page_ranges = PageFilteredRowGroupReader::ComputePageRanges(
+                    file_reader_->parquet_reader(), rg_idx, range_it->second, column_indices);
+                page_filtered_byte_ranges.insert(page_filtered_byte_ranges.end(),
+                                                 std::make_move_iterator(page_ranges.begin()),
+                                                 std::make_move_iterator(page_ranges.end()));
+            } else {
+                fully_matched_row_groups.push_back(rg_idx);
+            }
+        }
+
+        // Wait for any previously pre-buffered data before starting new pre-buffer.
+        WaitForPendingPreBuffer();
+
+        // Create standard reader for fully matched row groups FIRST.
+        // GetRecordBatchReader internally calls PreBuffer, but we'll override it below
+        // with a single PreBuffer covering ALL row groups (page-filtered + fully-matched)
+        // so that async I/O for all files starts in parallel.
+        std::unique_ptr<arrow::RecordBatchReader> batch_reader;
+        if (!fully_matched_row_groups.empty()) {
+            PAIMON_RETURN_NOT_OK_FROM_ARROW(file_reader_->GetRecordBatchReader(
+                fully_matched_row_groups, column_indices, &batch_reader));
+        }
+
+        // Collect all byte ranges for a single PreBufferRanges call.
+        // Page-filtered RGs: only matching page ranges (from ComputePageRanges).
+        // Fully-matched RGs: entire column chunk ranges.
+        //
+        // When there are no page-filtered RGs, skip the manual PreBufferRanges entirely:
+        // GetRecordBatchReader has already issued PreBuffer internally (driven by
+        // ArrowReaderProperties::pre_buffer=true), and a second PreBufferRanges call here
+        // would tear down and rebuild cached_source_, redundantly re-issuing the same IO
+        // on remote filesystems. The manual path is only needed to merge page-level ranges
+        // with column-chunk ranges into a single PreBuffer covering both kinds of RGs.
+        if (!page_filtered_indices_.empty()) {
+            std::vector<::arrow::io::ReadRange> all_ranges = std::move(page_filtered_byte_ranges);
+
+            // Fully-matched row groups: add entire column chunk ranges
+            // The correct calculation follows Arrow's ColumnChunkMetaData::file_range():
+            // - col_start = data_page_offset (or dictionary_page_offset if present and lower)
+            // - col_length = total_compressed_size (includes all pages: dictionary + data)
+            auto file_metadata = file_reader_->parquet_reader()->metadata();
+            for (int32_t rg_idx : fully_matched_row_groups) {
+                auto rg_metadata = file_metadata->RowGroup(rg_idx);
+                for (int32_t col_idx : column_indices) {
+                    auto col_chunk = rg_metadata->ColumnChunk(col_idx);
+                    int64_t offset = col_chunk->data_page_offset();
+                    if (col_chunk->has_dictionary_page() &&
+                        col_chunk->dictionary_page_offset() > 0 &&
+                        offset > col_chunk->dictionary_page_offset()) {
+                        offset = col_chunk->dictionary_page_offset();
+                    }
+                    int64_t size = col_chunk->total_compressed_size();
+                    all_ranges.push_back({offset, size});
+                }
+            }
+
+            const auto& cache_opts = file_reader_->properties().cache_options();
+            ::arrow::io::IOContext io_ctx(pool_);
+            // Merge overlapping ranges before calling PreBufferRanges, which rejects overlapping
+            // ranges.
+            auto merged_ranges = MergeOverlappingRanges(std::move(all_ranges));
+            // PreBuffer is an optimization - if it fails (e.g., IO error during testing),
+            // continue without pre-buffering. Subsequent reads will fetch data on-demand.
+            try {
+                file_reader_->parquet_reader()->PreBufferRanges(merged_ranges, io_ctx, cache_opts);
+                // Track for cleanup on destruction
+                prebuffered_ranges_ = std::move(merged_ranges);
+            } catch (const std::exception& e) {
+                // Pre-buffering failed, clear ranges to indicate no pre-buffered data available.
+                // Reading will fall back to on-demand I/O.
+                prebuffered_ranges_.clear();
+            }
+        }
         target_row_groups_ = target_row_groups;
         target_column_indices_ = column_indices;
         page_filtered_read_schema_.reset();
diff --git a/src/paimon/format/parquet/file_reader_wrapper.h b/src/paimon/format/parquet/file_reader_wrapper.h
index 748d4052f..7ed7bba43 100644
--- a/src/paimon/format/parquet/file_reader_wrapper.h
+++ b/src/paimon/format/parquet/file_reader_wrapper.h
@@ -122,6 +122,14 @@ class FileReaderWrapper {
     /// Resets reader state so that the next Next() call will re-initialize.
     Status ApplyReadRanges(const std::vector<std::pair<uint64_t, uint64_t>>& read_ranges);
 
+    /// Supply the read schema to use for page-filtered row groups. With nested column
+    /// pruning, PrepareForReading cannot rebuild nested types from leaf column names,
+    /// so the caller hands over the pruned schema instead. Takes effect on the next
+    /// PrepareForReading call; while unset (null), the name-based inference is used.
+    void SetReadSchemaForPageFilter(const std::shared_ptr<arrow::Schema>& schema) {
+        external_read_schema_ = schema;
+    }
+
     /// Get the page index reader for the file.
     /// Returns nullptr if page index is not available.
     std::shared_ptr<::parquet::PageIndexReader> GetPageIndexReader();
@@ -194,6 +202,10 @@ class FileReaderWrapper {
     // all page-filtered RGs in a session.
     std::shared_ptr<arrow::Schema> page_filtered_read_schema_;
 
+    // Externally provided read schema for page-filtered reading.
+    // When set, PrepareForReading uses this instead of inferring from leaf column names.
+    std::shared_ptr<arrow::Schema> external_read_schema_;
+
     // Track pre-buffered ranges so we can wait on destruction
     std::vector<::arrow::io::ReadRange> prebuffered_ranges_;
 };
diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.cpp b/src/paimon/format/parquet/parquet_file_batch_reader.cpp
index 7533cb99a..2e91424ea 100644
--- a/src/paimon/format/parquet/parquet_file_batch_reader.cpp
+++ b/src/paimon/format/parquet/parquet_file_batch_reader.cpp
@@ -39,6 +39,7 @@
 #include "paimon/common/metrics/metrics_impl.h"
 #include "paimon/common/utils/arrow/status_utils.h"
 #include "paimon/common/utils/options_utils.h"
+#include "paimon/core/utils/nested_projection_utils.h"
 #include "paimon/format/parquet/parquet_field_id_converter.h"
 #include "paimon/format/parquet/parquet_format_defs.h"
 #include "paimon/format/parquet/parquet_timestamp_converter.h"
@@ -127,23 +128,32 @@ Status ParquetFileBatchReader::SetReadSchema(
         PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr<arrow::Schema> read_schema,
                                           arrow::ImportSchema(schema));
 
-        PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Schema> file_schema, reader_->GetSchema());
+        PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Schema> raw_file_schema,
+                               reader_->GetSchema());
+        // Convert PARQUET:field_id to paimon.id so that nested column matching works.
+        PAIMON_ASSIGN_OR_RAISE(
+            std::shared_ptr<arrow::Schema> file_schema,
+            ParquetFieldIdConverter::GetPaimonIdsFromParquetIds(raw_file_schema));
+
+        // Recursively match read_schema against file_schema using paimon field IDs.
+        // For STRUCT fields with nested projection, only the requested sub-fields'
+        // leaf columns are collected.
+        PAIMON_ASSIGN_OR_RAISE(std::vector<int32_t> column_indices,
+                               ComputeNestedColumnIndices(read_schema, file_schema));
+
+        // Build column name to index map for page-level filtering.
+        // We still need the full per-top-level-field leaf indices for predicate pushdown.
         std::unordered_map<std::string, std::vector<int32_t>> field_index_map;
-        int32_t i = 0;
+        int32_t flat_idx = 0;
         for (const auto& field : file_schema->fields()) {
-            std::vector<int32_t> v;
-            FlattenSchema(field->type(), &i, &v);
-            field_index_map[field->name()] = v;
+            std::vector<int32_t> leaf_indices;
+            FlattenSchema(field->type(), &flat_idx, &leaf_indices);
+            field_index_map[field->name()] = leaf_indices;
         }
-
-        std::vector<int32_t> column_indices;
-        for (const auto& field : read_schema->field_names()) {
-            if (field_index_map.find(field) != field_index_map.end()) {
-                for (int32_t index : field_index_map[field]) {
-                    column_indices.push_back(index);
-                }
-            } else {
-                return Status::Invalid(fmt::format("Field {} is not found in schema.", field));
+        std::map<std::string, int32_t> column_name_to_index;
+        for (const auto& [name, indices] : field_index_map) {
+            if (!indices.empty()) {
+                column_name_to_index[name] = indices[0];
             }
         }
 
@@ -190,6 +200,12 @@ Status ParquetFileBatchReader::SetReadSchema(
 
         read_data_type_ = arrow::struct_(read_schema->fields());
 
+        // Provide the read schema to FileReaderWrapper for page-filtered reading.
+        // This is needed because nested column pruning produces leaf column indices
+        // whose names don't correspond to top-level Arrow fields, so the wrapper
+        // cannot infer the correct schema from leaf column names alone.
+        reader_->SetReadSchemaForPageFilter(read_schema);
+
         metrics_->SetCounter(ParquetMetrics::READ_ROW_GROUPS_TOTAL,
                              reader_->GetNumberOfRowGroups());
         metrics_->SetCounter(ParquetMetrics::READ_ROW_GROUPS_AFTER_FILTER, row_groups.size());
@@ -428,4 +444,78 @@ Result<::parquet::ArrowReaderProperties> ParquetFileBatchReader::CreateArrowRead
     return arrow_reader_props;
 }
 
+// ---------------------------------------------------------------------------
+// Nested column index computation
+// ---------------------------------------------------------------------------
+
+void ParquetFileBatchReader::CollectLeafIndices(const std::shared_ptr<arrow::DataType>& read_type,
+                                                const std::shared_ptr<arrow::DataType>& file_type,
+                                                int32_t* leaf_index,
+                                                std::vector<int32_t>* indices) {
+    // *leaf_index advances once per file leaf, collected or not, so it follows the file's leaves.
+    switch (file_type->id()) {
+        case arrow::Type::STRUCT:  // Children are paired by paimon field ID.
+            for (const auto& file_child : file_type->fields()) {
+                const auto wanted = FindFieldByPaimonId(read_type, GetPaimonFieldId(file_child));
+                if (!wanted) {
+                    SkipLeafIndices(file_child->type(), leaf_index);
+                    continue;
+                }
+                CollectLeafIndices(wanted->type(), file_child->type(), leaf_index, indices);
+            }
+            return;
+        case arrow::Type::LIST:
+        case arrow::Type::MAP:  // Children are paired by position.
+            for (int pos = 0; pos < file_type->num_fields(); ++pos) {
+                if (pos >= read_type->num_fields()) {
+                    SkipLeafIndices(file_type->field(pos)->type(), leaf_index);
+                    continue;
+                }
+                CollectLeafIndices(read_type->field(pos)->type(), file_type->field(pos)->type(),
+                                   leaf_index, indices);
+            }
+            return;
+        default:  // Leaf column: record its index.
+            indices->push_back((*leaf_index)++);
+    }
+}
+
+void ParquetFileBatchReader::SkipLeafIndices(const std::shared_ptr<arrow::DataType>& file_type,
+                                             int32_t* leaf_index) {
+    const auto kind = file_type->id();
+    if (kind != arrow::Type::STRUCT && kind != arrow::Type::LIST && kind != arrow::Type::MAP) {
+        ++(*leaf_index);  // Non-nested type: exactly one leaf column.
+        return;
+    }
+    for (const auto& child : file_type->fields()) {
+        SkipLeafIndices(child->type(), leaf_index);
+    }
+}
+
+Result<std::vector<int32_t>> ParquetFileBatchReader::ComputeNestedColumnIndices(
+    const std::shared_ptr<arrow::Schema>& read_schema,
+    const std::shared_ptr<arrow::Schema>& file_schema) {
+    // paimon field ID -> (type, first Parquet leaf index) of each top-level file field.
+    std::unordered_map<int32_t, std::pair<std::shared_ptr<arrow::DataType>, int32_t>> by_id;
+    int32_t next_leaf = 0;
+    for (const auto& file_field : file_schema->fields()) {
+        by_id.try_emplace(GetPaimonFieldId(file_field), file_field->type(), next_leaf);
+        SkipLeafIndices(file_field->type(), &next_leaf);
+    }
+    // Walk read_schema rather than file_schema: the Parquet reader orders its output columns
+    // by first appearance in `indices`, so this keeps them in the requested order instead of
+    // file order. A requested field with no match in the file is reported, not dropped.
+    std::vector<int32_t> indices;
+    for (const auto& read_field : read_schema->fields()) {
+        const auto found = by_id.find(GetPaimonFieldId(read_field));
+        if (found == by_id.end()) {
+            return Status::Invalid(
+                fmt::format("Field {} is not found in schema.", read_field->name()));
+        }
+        int32_t leaf_index = found->second.second;
+        CollectLeafIndices(read_field->type(), found->second.first, &leaf_index, &indices);
+    }
+    return indices;
+}
+
 }  // namespace paimon::parquet
diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.h b/src/paimon/format/parquet/parquet_file_batch_reader.h
index 8dc412c30..63b70ace0 100644
--- a/src/paimon/format/parquet/parquet_file_batch_reader.h
+++ b/src/paimon/format/parquet/parquet_file_batch_reader.h
@@ -150,6 +150,24 @@ class ParquetFileBatchReader : public PrefetchFileBatchReader {
         }
     }
 
+    /// Walk file_type recursively and append to `indices` the leaf column index of every
+    /// leaf kept by read_type. STRUCT children are matched by paimon field ID; `*leaf_index`
+    /// is advanced for every file leaf, including those of unmatched sub-fields.
+    static void CollectLeafIndices(const std::shared_ptr<arrow::DataType>& read_type,
+                                   const std::shared_ptr<arrow::DataType>& file_type,
+                                   int32_t* leaf_index, std::vector<int32_t>* indices);
+
+    /// Advance `*leaf_index` past every leaf column under file_type, collecting none.
+    static void SkipLeafIndices(const std::shared_ptr<arrow::DataType>& file_type,
+                                int32_t* leaf_index);
+
+    /// Match the top-level fields of read_schema and file_schema by paimon field ID and
+    /// return the Parquet leaf column indices to read. Inside STRUCT fields only the
+    /// requested sub-fields contribute leaves; the leaves of other sub-fields are skipped.
+    static Result<std::vector<int32_t>> ComputeNestedColumnIndices(
+        const std::shared_ptr<arrow::Schema>& read_schema,
+        const std::shared_ptr<arrow::Schema>& file_schema);
+
     // precondition: predicate supposed not be empty
     Result<std::vector<int32_t>> FilterRowGroupsByPredicate(
         const std::shared_ptr<Predicate>& predicate,
diff --git a/test/inte/CMakeLists.txt b/test/inte/CMakeLists.txt
index ae8c9f749..535394809 100644
--- a/test/inte/CMakeLists.txt
+++ b/test/inte/CMakeLists.txt
@@ -97,4 +97,11 @@ if(PAIMON_BUILD_TESTS)
                     test_utils_static
                     ${GTEST_LINK_TOOLCHAIN})
 
+    add_paimon_test(nested_column_pruning_inte_test
+                    STATIC_LINK_LIBS
+                    paimon_shared
+                    ${TEST_STATIC_LINK_LIBS}
+                    test_utils_static
+                    ${GTEST_LINK_TOOLCHAIN})
+
 endif()
diff --git a/test/inte/blob_table_inte_test.cpp b/test/inte/blob_table_inte_test.cpp
index 4cd6ad053..27f3d0fe5 100644
--- a/test/inte/blob_table_inte_test.cpp
+++ b/test/inte/blob_table_inte_test.cpp
@@ -203,7 +203,7 @@ class BlobTableInteTest : public testing::Test, public ::testing::WithParamInter
                                  const std::map<std::string, std::string>& options = {}) const {
         auto splits = plan->Splits();
         ReadContextBuilder read_context_builder(table_path);
-        read_context_builder.SetReadSchema(read_schema).SetPredicate(predicate);
+        read_context_builder.SetReadFieldNames(read_schema).SetPredicate(predicate);
         if (!options.empty()) {
             read_context_builder.SetOptions(options);
         }
diff --git a/test/inte/data_evolution_table_test.cpp b/test/inte/data_evolution_table_test.cpp
index 095e9fa2a..4d0cbaf7e 100644
--- a/test/inte/data_evolution_table_test.cpp
+++ b/test/inte/data_evolution_table_test.cpp
@@ -147,7 +147,7 @@ class DataEvolutionTableTest : public ::testing::Test,
         // read
         auto splits = result_plan->Splits();
         ReadContextBuilder read_context_builder(table_path);
-        read_context_builder.SetReadSchema(read_schema).SetPredicate(predicate);
+        read_context_builder.SetReadFieldNames(read_schema).SetPredicate(predicate);
         PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<ReadContext> read_context,
                                read_context_builder.Finish());
         PAIMON_ASSIGN_OR_RAISE(auto table_read, TableRead::Create(std::move(read_context)));
diff --git a/test/inte/global_index_test.cpp b/test/inte/global_index_test.cpp
index e10add7c8..51afdc2de 100644
--- a/test/inte/global_index_test.cpp
+++ b/test/inte/global_index_test.cpp
@@ -195,7 +195,7 @@ class GlobalIndexTest : public ::testing::Test, public ::testing::WithParamInter
                     const std::shared_ptr<Plan>& result_plan) const {
         auto splits = result_plan->Splits();
         ReadContextBuilder read_context_builder(table_path);
-        read_context_builder.SetReadSchema(read_schema).SetPredicate(predicate).WithFileSystem(fs_);
+        read_context_builder.SetReadFieldNames(read_schema).SetPredicate(predicate).WithFileSystem(fs_);
         PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<ReadContext> read_context,
                                read_context_builder.Finish());
         PAIMON_ASSIGN_OR_RAISE(auto table_read, TableRead::Create(std::move(read_context)));
diff --git a/test/inte/nested_column_pruning_inte_test.cpp b/test/inte/nested_column_pruning_inte_test.cpp
new file mode 100644
index 000000000..857867e62
--- /dev/null
+++ b/test/inte/nested_column_pruning_inte_test.cpp
@@ -0,0 +1,379 @@
+/*
+ * Copyright 2024-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/api.h"
+#include "arrow/c/abi.h"
+#include "arrow/c/bridge.h"
+#include "arrow/ipc/json_simple.h"
+#include "gtest/gtest.h"
+#include "paimon/common/types/data_field.h"
+#include "paimon/common/utils/path_util.h"
+#include "paimon/common/utils/string_utils.h"
+#include "paimon/defs.h"
+#include "paimon/read_context.h"
+#include "paimon/reader/batch_reader.h"
+#include "paimon/result.h"
+#include "paimon/scan_context.h"
+#include "paimon/status.h"
+#include "paimon/table/source/startup_mode.h"
+#include "paimon/table/source/table_read.h"
+#include "paimon/table/source/table_scan.h"
+#include "paimon/testing/utils/read_result_collector.h"
+#include "paimon/testing/utils/test_helper.h"
+#include "paimon/testing/utils/testharness.h"
+
+namespace paimon {
+class DataSplit;
+class RecordBatch;
+}  // namespace paimon
+
+namespace paimon::test {
+
+class NestedColumnPruningInteTest
+    : public ::testing::Test,
+      public ::testing::WithParamInterface<std::string> {
+    void SetUp() override {
+        file_format_ = GetParam();  // file format under test, e.g. "parquet"
+        dir_ = UniqueTestDirectory::Create("local");
+        test_dir_ = dir_->Str();
+        table_path_ = PathUtil::JoinPath(test_dir_, "foo.db/bar");  // <test_dir>/foo.db/bar
+    }
+    void TearDown() override { dir_.reset(); }  // drop the test directory handle
+
+ protected:
+    static std::shared_ptr<arrow::Field> AnnotateField(
+        const std::shared_ptr<arrow::Field>& field, int32_t paimon_id) {
+        auto metadata = arrow::KeyValueMetadata::Make(
+            {DataField::FIELD_ID}, {std::to_string(paimon_id)});  // FIELD_ID -> decimal id
+        if (field->metadata()) {  // merge the id entry with metadata already on the field
+            auto merged = field->metadata()->Merge(*metadata);
+            return field->WithMetadata(merged);
+        }
+        return field->WithMetadata(metadata);  // field carried no metadata
+    }
+
+    std::string file_format_;
+    std::string test_dir_;
+    std::string table_path_;
+    std::unique_ptr<UniqueTestDirectory> dir_;
+};
+
+// Test: a struct column has 3 sub-fields; read f0 plus only sub-field f1.a via SetReadSchema.
+TEST_P(NestedColumnPruningInteTest, PruneStructSubFields) {
+    // Table schema: f0 (int32), f1 (struct{a: int32, b: utf8, c: float64})
+    auto struct_type = arrow::struct_({
+        arrow::field("a", arrow::int32()),
+        arrow::field("b", arrow::utf8()),
+        arrow::field("c", arrow::float64()),
+    });
+    arrow::FieldVector table_fields = {
+        arrow::field("f0", arrow::int32()),
+        arrow::field("f1", struct_type),
+    };
+    auto table_schema = arrow::schema(table_fields);
+
+    std::map<std::string, std::string> options = {
+        {Options::MANIFEST_FORMAT, "AVRO"},
+        {Options::FILE_FORMAT, StringUtils::ToUpperCase(file_format_)},
+        {Options::TARGET_FILE_SIZE, "1024"},
+        {Options::BUCKET, "-1"},
+    };
+
+    ASSERT_OK_AND_ASSIGN(
+        auto helper,
+        TestHelper::Create(test_dir_, table_schema, /*partition_keys=*/{},
+                           /*primary_keys=*/{}, options, /*is_streaming_mode=*/false));
+
+    // Write data
+    std::string data = R"([
+        [1, [10, "hello", 1.1]],
+        [2, [20, "world", 2.2]],
+        [3, [30, "foo", 3.3]],
+        [4, [40, "bar", 4.4]]
+    ])";
+    ASSERT_OK_AND_ASSIGN(
+        auto batch,
+        TestHelper::MakeRecordBatch(arrow::struct_(table_fields), data,
+                                    /*partition_map=*/{}, /*bucket=*/0, {}));
+    int64_t commit_identifier = 0;
+    ASSERT_OK_AND_ASSIGN(
+        auto commit_msgs,
+        helper->WriteAndCommit(std::move(batch), commit_identifier++,
+                               /*expected_commit_messages=*/std::nullopt));
+
+    // Scan to get splits
+    ASSERT_OK_AND_ASSIGN(
+        auto data_splits,
+        helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt));
+    ASSERT_FALSE(data_splits.empty());
+
+    // Build projected schema: only read f0 (full) and f1.a (sub-field of struct)
+    // Catalog assigns IDs: f0->0, f1->1, f1.a->2, f1.b->3, f1.c->4; AnnotateField mirrors them.
+    auto pruned_struct_type = arrow::struct_({
+        AnnotateField(arrow::field("a", arrow::int32()), 2),
+    });
+    arrow::FieldVector projected_fields = {
+        AnnotateField(arrow::field("f0", arrow::int32()), 0),
+        AnnotateField(arrow::field("f1", pruned_struct_type), 1),
+    };
+    auto projected_schema = arrow::schema(projected_fields);
+
+    // Export to C ArrowSchema. NOTE(review): confirm SetReadSchema releases c_schema.
+    ArrowSchema c_schema;
+    ASSERT_TRUE(arrow::ExportSchema(*projected_schema, &c_schema).ok());
+
+    // Read with projected schema
+    ReadContextBuilder read_context_builder(table_path_);
+    read_context_builder.SetOptions(options).SetReadSchema(&c_schema);
+    ASSERT_OK_AND_ASSIGN(auto read_context, read_context_builder.Finish());
+    ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context)));
+    ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(data_splits));
+    ASSERT_OK_AND_ASSIGN(auto read_result,
+                         ReadResultCollector::CollectResult(batch_reader.get()));
+
+    // Expected: struct with _VALUE_KIND (0 in every row), f0, f1{a}
+    arrow::FieldVector expected_fields = {
+        arrow::field("_VALUE_KIND", arrow::int8()),
+        arrow::field("f0", arrow::int32()),
+        arrow::field("f1", arrow::struct_({arrow::field("a", arrow::int32())})),
+    };
+    auto expected_type = arrow::struct_(expected_fields);
+    std::string expected_data = R"([
+        [0, 1, [10]],
+        [0, 2, [20]],
+        [0, 3, [30]],
+        [0, 4, [40]]
+    ])";
+    auto expected_array =
+        arrow::ipc::internal::json::ArrayFromJSON(expected_type, expected_data).ValueOrDie();
+    auto expected_chunked = std::make_shared<arrow::ChunkedArray>(expected_array);
+
+    arrow::EqualOptions equal_options = arrow::EqualOptions::Defaults();
+    bool is_equal = expected_chunked->Equals(read_result, equal_options.diff_sink(&std::cout));
+    if (!is_equal) {
+        std::cout << "[expected_type] " << expected_chunked->type()->ToString() << std::endl;
+        std::cout << "[actual_type]   " << read_result->type()->ToString() << std::endl;
+        std::cout << "[expected] " << expected_chunked->ToString() << std::endl;
+        std::cout << "[actual]   " << read_result->ToString() << std::endl;
+    }
+    ASSERT_TRUE(is_equal);
+}
+
+// Test: Read only top-level fields f0 and f2; struct column f1 is left out of the read schema.
+TEST_P(NestedColumnPruningInteTest, PruneEntireStructField) {
+    auto struct_type = arrow::struct_({
+        arrow::field("x", arrow::int64()),
+        arrow::field("y", arrow::utf8()),
+    });
+    arrow::FieldVector table_fields = {
+        arrow::field("f0", arrow::int32()),
+        arrow::field("f1", struct_type),
+        arrow::field("f2", arrow::float64()),
+    };
+    auto table_schema = arrow::schema(table_fields);
+
+    std::map<std::string, std::string> options = {
+        {Options::MANIFEST_FORMAT, "AVRO"},
+        {Options::FILE_FORMAT, StringUtils::ToUpperCase(file_format_)},
+        {Options::TARGET_FILE_SIZE, "1024"},
+        {Options::BUCKET, "-1"},
+    };
+
+    ASSERT_OK_AND_ASSIGN(
+        auto helper,
+        TestHelper::Create(test_dir_, table_schema, /*partition_keys=*/{},
+                           /*primary_keys=*/{}, options, /*is_streaming_mode=*/false));
+
+    std::string data = R"([
+        [100, [1, "aa"], 0.1],
+        [200, [2, "bb"], 0.2],
+        [300, [3, "cc"], 0.3]
+    ])";
+    ASSERT_OK_AND_ASSIGN(
+        auto batch,
+        TestHelper::MakeRecordBatch(arrow::struct_(table_fields), data,
+                                    /*partition_map=*/{}, /*bucket=*/0, {}));
+    int64_t commit_identifier = 0;
+    ASSERT_OK_AND_ASSIGN(
+        auto commit_msgs,
+        helper->WriteAndCommit(std::move(batch), commit_identifier++,
+                               /*expected_commit_messages=*/std::nullopt));
+
+    ASSERT_OK_AND_ASSIGN(
+        auto data_splits,
+        helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt));
+
+    // Only read f0 and f2, skip f1 entirely.
+    // IDs: f0->0, f1->1, f1.x->2, f1.y->3, f2->4, so f2 is annotated with 4 below.
+    arrow::FieldVector projected_fields = {
+        AnnotateField(arrow::field("f0", arrow::int32()), 0),
+        AnnotateField(arrow::field("f2", arrow::float64()), 4),
+    };
+    auto projected_schema = arrow::schema(projected_fields);
+
+    ArrowSchema c_schema;
+    ASSERT_TRUE(arrow::ExportSchema(*projected_schema, &c_schema).ok());
+
+    ReadContextBuilder read_context_builder(table_path_);
+    read_context_builder.SetOptions(options).SetReadSchema(&c_schema);
+    ASSERT_OK_AND_ASSIGN(auto read_context, read_context_builder.Finish());
+    ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context)));
+    ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(data_splits));
+    ASSERT_OK_AND_ASSIGN(auto read_result,
+                         ReadResultCollector::CollectResult(batch_reader.get()));
+
+    arrow::FieldVector expected_fields = {
+        arrow::field("_VALUE_KIND", arrow::int8()),
+        arrow::field("f0", arrow::int32()),
+        arrow::field("f2", arrow::float64()),
+    };
+    auto expected_type = arrow::struct_(expected_fields);
+    std::string expected_data = R"([
+        [0, 100, 0.1],
+        [0, 200, 0.2],
+        [0, 300, 0.3]
+    ])";
+    auto expected_array =
+        arrow::ipc::internal::json::ArrayFromJSON(expected_type, expected_data).ValueOrDie();
+    auto expected_chunked = std::make_shared<arrow::ChunkedArray>(expected_array);
+
+    arrow::EqualOptions equal_options = arrow::EqualOptions::Defaults();
+    bool is_equal = expected_chunked->Equals(read_result, equal_options.diff_sink(&std::cout));
+    if (!is_equal) {
+        std::cout << "[expected_type] " << expected_chunked->type()->ToString() << std::endl;
+        std::cout << "[actual_type]   " << read_result->type()->ToString() << std::endl;
+        std::cout << "[expected] " << expected_chunked->ToString() << std::endl;
+        std::cout << "[actual]   " << read_result->ToString() << std::endl;
+    }
+    ASSERT_TRUE(is_equal);
+}
+
+// Test: Nested struct — prune sub-fields of a struct inside another struct.
+TEST_P(NestedColumnPruningInteTest, PruneDeepNestedStruct) {
+    // Table schema: f0 (int32), f1 (struct{a: int32, inner: struct{x: int64, y: utf8}})
+    auto inner_struct = arrow::struct_({
+        arrow::field("x", arrow::int64()),
+        arrow::field("y", arrow::utf8()),
+    });
+    auto outer_struct = arrow::struct_({
+        arrow::field("a", arrow::int32()),
+        arrow::field("inner", inner_struct),
+    });
+    arrow::FieldVector table_fields = {
+        arrow::field("f0", arrow::int32()),
+        arrow::field("f1", outer_struct),
+    };
+    auto table_schema = arrow::schema(table_fields);
+
+    std::map<std::string, std::string> options = {
+        {Options::MANIFEST_FORMAT, "AVRO"},
+        {Options::FILE_FORMAT, StringUtils::ToUpperCase(file_format_)},
+        {Options::TARGET_FILE_SIZE, "1024"},
+        {Options::BUCKET, "-1"},
+    };
+
+    ASSERT_OK_AND_ASSIGN(
+        auto helper,
+        TestHelper::Create(test_dir_, table_schema, /*partition_keys=*/{},
+                           /*primary_keys=*/{}, options, /*is_streaming_mode=*/false));
+
+    std::string data = R"([
+        [1, [10, [100, "aaa"]]],
+        [2, [20, [200, "bbb"]]],
+        [3, [30, [300, "ccc"]]]
+    ])";
+    ASSERT_OK_AND_ASSIGN(
+        auto batch,
+        TestHelper::MakeRecordBatch(arrow::struct_(table_fields), data,
+                                    /*partition_map=*/{}, /*bucket=*/0, {}));
+    int64_t commit_identifier = 0;
+    ASSERT_OK_AND_ASSIGN(
+        auto commit_msgs,
+        helper->WriteAndCommit(std::move(batch), commit_identifier++,
+                               /*expected_commit_messages=*/std::nullopt));
+
+    ASSERT_OK_AND_ASSIGN(
+        auto data_splits,
+        helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt));
+
+    // Field IDs (assigned sequentially by catalog), mirrored by the AnnotateField calls below:
+    // f0->0, f1->1, f1.a->2, f1.inner->3, f1.inner.x->4, f1.inner.y->5
+    //
+    // Projected: f0, f1{inner{x}} — skip f1.a and f1.inner.y
+    auto pruned_inner = arrow::struct_({
+        AnnotateField(arrow::field("x", arrow::int64()), 4),
+    });
+    auto pruned_outer = arrow::struct_({
+        AnnotateField(arrow::field("inner", pruned_inner), 3),
+    });
+    arrow::FieldVector projected_fields = {
+        AnnotateField(arrow::field("f0", arrow::int32()), 0),
+        AnnotateField(arrow::field("f1", pruned_outer), 1),
+    };
+    auto projected_schema = arrow::schema(projected_fields);
+
+    ArrowSchema c_schema;
+    ASSERT_TRUE(arrow::ExportSchema(*projected_schema, &c_schema).ok());
+
+    ReadContextBuilder read_context_builder(table_path_);
+    read_context_builder.SetOptions(options).SetReadSchema(&c_schema);
+    ASSERT_OK_AND_ASSIGN(auto read_context, read_context_builder.Finish());
+    ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context)));
+    ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(data_splits));
+    ASSERT_OK_AND_ASSIGN(auto read_result,
+                         ReadResultCollector::CollectResult(batch_reader.get()));
+
+    arrow::FieldVector expected_fields = {
+        arrow::field("_VALUE_KIND", arrow::int8()),
+        arrow::field("f0", arrow::int32()),
+        arrow::field("f1", arrow::struct_({
+            arrow::field("inner", arrow::struct_({
+                arrow::field("x", arrow::int64()),
+            })),
+        })),
+    };
+    auto expected_type = arrow::struct_(expected_fields);
+    std::string expected_data = R"([
+        [0, 1, [[100]]],
+        [0, 2, [[200]]],
+        [0, 3, [[300]]]
+    ])";
+    auto expected_array =
+        arrow::ipc::internal::json::ArrayFromJSON(expected_type, expected_data).ValueOrDie();
+    auto expected_chunked = std::make_shared<arrow::ChunkedArray>(expected_array);
+
+    arrow::EqualOptions equal_options = arrow::EqualOptions::Defaults();
+    bool is_equal = expected_chunked->Equals(read_result, equal_options.diff_sink(&std::cout));
+    if (!is_equal) {
+        std::cout << "[expected_type] " << expected_chunked->type()->ToString() << std::endl;
+        std::cout << "[actual_type]   " << read_result->type()->ToString() << std::endl;
+        std::cout << "[expected] " << expected_chunked->ToString() << std::endl;
+        std::cout << "[actual]   " << read_result->ToString() << std::endl;
+    }
+    ASSERT_TRUE(is_equal);
+}
+
+INSTANTIATE_TEST_SUITE_P(FileFormats, NestedColumnPruningInteTest,
+                         ::testing::Values("parquet"));
+
+}  // namespace paimon::test
diff --git a/test/inte/read_inte_test.cpp b/test/inte/read_inte_test.cpp
index d0b71cb39..a267bfca2 100644
--- a/test/inte/read_inte_test.cpp
+++ b/test/inte/read_inte_test.cpp
@@ -506,7 +506,7 @@ TEST_P(ReadInteTest, TestReadOnlyPartitionField) {
 
     ReadContextBuilder context_builder(path);
     context_builder.AddOption(Options::FILE_FORMAT, param.file_format);
-    context_builder.SetReadSchema({"dt"});
+    context_builder.SetReadFieldNames({"dt"});
     context_builder.SetPrefetchCacheMode(param.cache_mode);
     context_builder.EnablePrefetch(param.enable_prefetch)
         .AddOption(Options::FILE_FORMAT, param.file_format)
@@ -1367,7 +1367,7 @@ TEST_P(ReadInteTest, TestAppendReadWithMultipleBuckets) {
     std::string path =
         paimon::test::GetDataDir() + "/" + param.file_format + "/append_09.db/append_09";
     ReadContextBuilder context_builder(path);
-    context_builder.SetReadSchema({"f3", "f0", "f1"});
+    context_builder.SetReadFieldNames({"f3", "f0", "f1"});
     context_builder.SetPrefetchCacheMode(param.cache_mode);
     context_builder.AddOption(Options::FILE_FORMAT, param.file_format)
         .AddOption("read.batch-size", "2")
@@ -1447,7 +1447,7 @@ TEST_P(ReadInteTest, TestAppendReadWithPredicate) {
         paimon::test::GetDataDir() + "/" + param.file_format + "/append_09.db/append_09";
 
     ReadContextBuilder context_builder(path);
-    context_builder.SetReadSchema({"f3", "f0", "f1"});
+    context_builder.SetReadFieldNames({"f3", "f0", "f1"});
     context_builder.SetPrefetchCacheMode(param.cache_mode);
     context_builder.AddOption(Options::FILE_FORMAT, param.file_format)
         .SetPredicate(predicate)
@@ -1551,7 +1551,7 @@ TEST_P(ReadInteTest, TestAppendReadWithComplexTypePredicate) {
                        "/append_complex_data.db/append_complex_data";
     ReadContextBuilder context_builder(path);
     context_builder.SetPrefetchCacheMode(param.cache_mode);
-    context_builder.SetReadSchema({"f6", "f2", "f4", "f3", "f5"});
+    context_builder.SetReadFieldNames({"f6", "f2", "f4", "f3", "f5"});
     context_builder.AddOption(Options::FILE_FORMAT, param.file_format)
         .AddOption("read.batch-size", "2");
     context_builder.SetPredicate(predicate);
@@ -1624,7 +1624,7 @@ TEST_P(ReadInteTest, TestAppendReadWithPredicateOnlyPushdown) {
 
     ReadContextBuilder context_builder(path);
     context_builder.SetPrefetchCacheMode(param.cache_mode);
-    context_builder.SetReadSchema({"f3", "f0", "f1"});
+    context_builder.SetReadFieldNames({"f3", "f0", "f1"});
     context_builder.AddOption(Options::FILE_FORMAT, param.file_format)
         .AddOption("read.batch-size", "2")
         .AddOption("test.enable-adaptive-prefetch-strategy",
@@ -1700,7 +1700,7 @@ TEST_P(ReadInteTest, TestAppendReadWithPredicateAllFiltered) {
 
     ReadContextBuilder context_builder(path);
     context_builder.SetPrefetchCacheMode(param.cache_mode);
-    context_builder.SetReadSchema({"f3", "f0", "f1"});
+    context_builder.SetReadFieldNames({"f3", "f0", "f1"});
     context_builder.AddOption(Options::FILE_FORMAT, param.file_format)
         .AddOption("read.batch-size", "2")
         .AddOption("test.enable-adaptive-prefetch-strategy",
@@ -1785,7 +1785,7 @@ TEST_P(ReadInteTest, TestAppendReadIOException) {
         io_hook->Reset(i, IOHook::Mode::RETURN_ERROR);
         ReadContextBuilder context_builder(paimon::test::GetDataDir() + "/" + param.file_format +
                                            "/append_09.db/append_09/");
-        context_builder.SetReadSchema({"f3", "f0", "f1"});
+        context_builder.SetReadFieldNames({"f3", "f0", "f1"});
         context_builder.SetPrefetchCacheMode(param.cache_mode);
         context_builder.AddOption(Options::FILE_FORMAT, param.file_format)
             .AddOption("read.batch-size", "2")
@@ -2029,7 +2029,7 @@ TEST_P(ReadInteTest, TestPkTableWithSnapshot8) {
     std::string path = paimon::test::GetDataDir() + "/" + param.file_format + "/pk_09.db/pk_09";
     ReadContextBuilder context_builder(path);
     context_builder.SetPrefetchCacheMode(param.cache_mode);
-    context_builder.SetReadSchema({"f0", "f3", "f1"});
+    context_builder.SetReadFieldNames({"f0", "f3", "f1"});
     context_builder.AddOption(Options::FILE_FORMAT, param.file_format)
         .AddOption("read.batch-size", "2");
     context_builder.EnablePrefetch(param.enable_prefetch)
@@ -2203,7 +2203,7 @@ TEST_P(ReadInteTest, TestAppendReadWithSchemaEvolutionWithPredicateFilter) {
                        "/append_table_with_alter_table.db/append_table_with_alter_table/";
     ReadContextBuilder context_builder(path);
     context_builder.SetPrefetchCacheMode(param.cache_mode);
-    context_builder.SetReadSchema({"a", "k", "key1", "d", "key0", "c"});
+    context_builder.SetReadFieldNames({"a", "k", "key1", "d", "key0", "c"});
     context_builder.AddOption(Options::FILE_FORMAT, param.file_format)
         .AddOption("read.batch-size", "2");
     context_builder.SetPredicate(predicate);
@@ -2282,7 +2282,7 @@ TEST_P(ReadInteTest, TestAppendReadWithSchemaEvolutionWithPredicateOnlyPushDown)
                        "append_table_with_alter_table/";
     ReadContextBuilder context_builder(path);
     context_builder.SetPrefetchCacheMode(param.cache_mode);
-    context_builder.SetReadSchema({"a", "k", "key1", "d", "key0", "c"});
+    context_builder.SetReadFieldNames({"a", "k", "key1", "d", "key0", "c"});
     context_builder.AddOption(Options::FILE_FORMAT, param.file_format)
         .AddOption("read.batch-size", "2");
     context_builder.SetPredicate(predicate);
@@ -2355,7 +2355,7 @@ TEST_P(ReadInteTest, TestPkReadSnapshot5WithSchemaEvolution) {
                        "/pk_table_with_alter_table.db/pk_table_with_alter_table/";
     ReadContextBuilder context_builder(path);
     context_builder.SetPrefetchCacheMode(param.cache_mode);
-    context_builder.SetReadSchema({"key1", "k", "key_2", "c", "d", "a", "key0", "e"});
+    context_builder.SetReadFieldNames({"key1", "k", "key_2", "c", "d", "a", "key0", "e"});
     context_builder.AddOption(Options::FILE_FORMAT, param.file_format)
         .AddOption("read.batch-size", "2");
     context_builder.EnablePrefetch(param.enable_prefetch)
@@ -2440,7 +2440,7 @@ TEST_P(ReadInteTest, TestPkReadSnapshot6WithSchemaEvolution) {
                        "/pk_table_with_alter_table.db/pk_table_with_alter_table/";
     ReadContextBuilder context_builder(path);
     context_builder.SetPrefetchCacheMode(param.cache_mode);
-    context_builder.SetReadSchema({"key1", "k", "key_2", "c", "d", "a", "key0", "e"});
+    context_builder.SetReadFieldNames({"key1", "k", "key_2", "c", "d", "a", "key0", "e"});
     context_builder.AddOption(Options::FILE_FORMAT, param.file_format)
         .AddOption("read.batch-size", "2");
     context_builder.EnablePrefetch(param.enable_prefetch)
@@ -2524,7 +2524,7 @@ TEST_P(ReadInteTest, TestPkReadSnapshot6WithSchemaEvolutionWithPredicateOnlyPush
     ASSERT_OK_AND_ASSIGN(auto predicate, PredicateBuilder::And({equal, less_than}));
 
     ReadContextBuilder context_builder(path);
-    context_builder.SetReadSchema({{"key1", "k", "key_2", "c", "d", "a", "key0", "e"}});
+    context_builder.SetReadFieldNames({{"key1", "k", "key_2", "c", "d", "a", "key0", "e"}});
     context_builder.AddOption(Options::FILE_FORMAT, param.file_format)
         .AddOption("read.batch-size", "2");
     context_builder.SetPrefetchCacheMode(param.cache_mode);
@@ -2607,7 +2607,7 @@ TEST_P(ReadInteTest, TestPkReadSnapshot6WithSchemaEvolutionWithPredicateFilter)
 
     ReadContextBuilder context_builder(path);
     context_builder.SetPrefetchCacheMode(param.cache_mode);
-    context_builder.SetReadSchema({"key1", "k", "key_2", "c", "d", "a", "key0", "e"});
+    context_builder.SetReadFieldNames({"key1", "k", "key_2", "c", "d", "a", "key0", "e"});
     context_builder.AddOption(Options::FILE_FORMAT, param.file_format)
         .AddOption("read.batch-size", "2");
     context_builder.SetPredicate(predicate);
@@ -2699,7 +2699,7 @@ TEST_P(ReadInteTest, TestAppendReadWithSchemaEvolutionWithBuildInFieldId) {
 
     ReadContextBuilder context_builder(path);
     context_builder.SetPrefetchCacheMode(param.cache_mode);
-    context_builder.SetReadSchema({"key0", "key1", "k", "c", "d", "a", "e"});
+    context_builder.SetReadFieldNames({"key0", "key1", "k", "c", "d", "a", "e"});
     context_builder.AddOption(Options::FILE_FORMAT, param.file_format)
         .AddOption("read.batch-size", "2");
     context_builder.EnablePrefetch(param.enable_prefetch)
@@ -2817,7 +2817,7 @@ TEST_P(ReadInteTest, TestAppendReadWithSchemaEvolutionWithCast) {
                        "append_table_alter_table_with_cast/";
     ReadContextBuilder context_builder(path);
     context_builder.SetPrefetchCacheMode(param.cache_mode);
-    context_builder.SetReadSchema({"f4", "key0", "key1", "f3", "f1", "f2", "f0", "f6"});
+    context_builder.SetReadFieldNames({"f4", "key0", "key1", "f3", "f1", "f2", "f0", "f6"});
     context_builder.AddOption(Options::FILE_FORMAT, param.file_format)
         .AddOption("read.batch-size", "2");
     context_builder.EnablePrefetch(param.enable_prefetch)
@@ -2898,7 +2898,7 @@ TEST_P(ReadInteTest, TestAppendReadWithSchemaEvolutionWithCastWithPredicatePushD
                        "/append_table_alter_table_with_cast.db/"
                        "append_table_alter_table_with_cast/";
     ReadContextBuilder context_builder(path);
-    context_builder.SetReadSchema({"f4", "key0", "key1", "f3", "f1", "f2", "f0", "f6"});
+    context_builder.SetReadFieldNames({"f4", "key0", "key1", "f3", "f1", "f2", "f0", "f6"});
     context_builder.SetPrefetchCacheMode(param.cache_mode);
     context_builder.AddOption(Options::FILE_FORMAT, param.file_format)
         .AddOption("read.batch-size", "2");
diff --git a/test/inte/scan_and_read_inte_test.cpp b/test/inte/scan_and_read_inte_test.cpp
index b68800963..aafb9c59c 100644
--- a/test/inte/scan_and_read_inte_test.cpp
+++ b/test/inte/scan_and_read_inte_test.cpp
@@ -1025,7 +1025,7 @@ TEST_P(ScanAndReadInteTest, TestWithPKWithNestedType) {
     AddReadOptionsForPrefetch(&read_context_builder);
     ASSERT_OK_AND_ASSIGN(
         auto read_context,
-        read_context_builder.SetReadSchema({"shopId", "dt", "hr", "col0", "col1", "col2"})
+        read_context_builder.SetReadFieldNames({"shopId", "dt", "hr", "col0", "col1", "col2"})
             .Finish());
 
     ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context)));
@@ -2177,7 +2177,7 @@ TEST_P(ScanAndReadInteTest, TestScanWithPredicateAndReadWithUnorderedFieldForPar
 
     ReadContextBuilder read_context_builder(table_path);
     AddReadOptionsForPrefetch(&read_context_builder);
-    read_context_builder.SetReadSchema({"f10", "f8", "f4", "f13"});
+    read_context_builder.SetReadFieldNames({"f10", "f8", "f4", "f13"});
     ASSERT_OK_AND_ASSIGN(auto read_context, read_context_builder.Finish());
     ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context)));
     ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(result_plan->Splits()));
@@ -2298,7 +2298,7 @@ TEST_F(ScanAndReadInteTest, TestScanWithPredicateAndReadWithUnorderedFieldForLan
     ASSERT_EQ(result_plan->SnapshotId().value(), 1);
 
     ReadContextBuilder read_context_builder(table_path);
-    read_context_builder.SetReadSchema({"f2", "f0"});
+    read_context_builder.SetReadFieldNames({"f2", "f0"});
     ASSERT_OK_AND_ASSIGN(auto read_context, read_context_builder.Finish());
     ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context)));
     ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(result_plan->Splits()));

From b9a0184ae6f14efd694307bcb8d4d04ab4d77562 Mon Sep 17 00:00:00 2001
From: "yonghao.fyh" <yonghao.fyh@alibaba-inc.com>
Date: Thu, 11 Jun 2026 15:47:41 +0800
Subject: [PATCH 02/24] support filter map with keys

---
 src/paimon/common/types/data_field.h          |   3 +
 src/paimon/core/io/field_mapping_reader.cpp   |  27 ++-
 src/paimon/core/utils/field_mapping.cpp       |   2 +-
 .../core/utils/nested_projection_utils.cpp    | 147 +++++++++++++++-
 .../core/utils/nested_projection_utils.h      | 105 ++++++-----
 .../utils/nested_projection_utils_test.cpp    | 165 ++++++++++++++++--
 .../parquet/parquet_file_batch_reader.cpp     |   8 +-
 test/inte/nested_column_pruning_inte_test.cpp |  95 ++++++++++
 8 files changed, 487 insertions(+), 65 deletions(-)

diff --git a/src/paimon/common/types/data_field.h b/src/paimon/common/types/data_field.h
index 7aa339944..27fedda35 100644
--- a/src/paimon/common/types/data_field.h
+++ b/src/paimon/common/types/data_field.h
@@ -41,6 +41,9 @@ class DataField : public Jsonizable<DataField> {
 
     static constexpr char FIELD_ID[] = "paimon.id";
     static constexpr char DESCRIPTION[] = "paimon.description";
+    /// Metadata key for map field selected keys. The value is a JSON array of
+    /// string keys, e.g. '["key1","key2"]'. Only string-keyed maps are supported.
+    static constexpr char MAP_SELECTED_KEYS[] = "paimon.map.selected-keys";
 
  public:
     static std::shared_ptr<arrow::Field> ConvertDataFieldToArrowField(const DataField& field);
diff --git a/src/paimon/core/io/field_mapping_reader.cpp b/src/paimon/core/io/field_mapping_reader.cpp
index 97f733cf5..3f1ab2e65 100644
--- a/src/paimon/core/io/field_mapping_reader.cpp
+++ b/src/paimon/core/io/field_mapping_reader.cpp
@@ -17,6 +17,7 @@
 
 #include <cassert>
 #include <cstddef>
+#include <set>
 #include <utility>
 
 #include "arrow/api.h"
@@ -79,6 +80,16 @@ FieldMappingReader::FieldMappingReader(int32_t field_count,
                 non_partition_info_.non_partition_read_schema[i].Type())) {
             need_mapping_ = true;
         }
+        // Map selected-keys metadata also requires mapping so that
+        // FilterMapArrayBySelectedKeys can filter out unwanted entries.
+        if (!need_mapping_ &&
+            non_partition_info_.non_partition_read_schema[i].Type()->id() == arrow::Type::MAP) {
+            std::set<std::string> selected_keys = NestedProjectionUtils::GetMapSelectedKeys(
+                non_partition_info_.non_partition_read_schema[i].ArrowField());
+            if (!selected_keys.empty()) {
+                need_mapping_ = true;
+            }
+        }
     }
 }
 
@@ -306,7 +317,21 @@ Status FieldMappingReader::MappingFields(const std::shared_ptr<arrow::Array>& da
         // sub-fields than requested, prune the excess here.
         const std::shared_ptr<arrow::DataType>& target_type = read_fields_of_data_array[i].Type();
         if (!field_array->type()->Equals(target_type)) {
-            PAIMON_ASSIGN_OR_RAISE(field_array, PruneArray(field_array, target_type));
+            PAIMON_ASSIGN_OR_RAISE(field_array,
+                                   NestedProjectionUtils::PruneArray(field_array, target_type));
+        }
+
+        // Filter map entries by selected keys if metadata is present.
+        if (field_array->type()->id() == arrow::Type::MAP) {
+            std::set<std::string> selected_keys =
+                NestedProjectionUtils::GetMapSelectedKeys(
+                    read_fields_of_data_array[i].ArrowField());
+            if (!selected_keys.empty()) {
+                PAIMON_ASSIGN_OR_RAISE(
+                    field_array,
+                    NestedProjectionUtils::FilterMapArrayBySelectedKeys(
+                        field_array, selected_keys));
+            }
         }
 
         (*target_array)[idx_in_target_schema[i]] = std::move(field_array);
diff --git a/src/paimon/core/utils/field_mapping.cpp b/src/paimon/core/utils/field_mapping.cpp
index 38bf4b68d..d06ea793a 100644
--- a/src/paimon/core/utils/field_mapping.cpp
+++ b/src/paimon/core/utils/field_mapping.cpp
@@ -107,7 +107,7 @@ Result<ExistFieldInfo> FieldMappingBuilder::CreateExistFieldInfo(
             // projection. For atomic types this is a no-op.
             PAIMON_ASSIGN_OR_RAISE(
                 std::optional<std::shared_ptr<arrow::DataType>> pruned_type,
-                PruneDataType(read_field.Type(), data_field.Type()));
+                NestedProjectionUtils::PruneDataType(read_field.Type(), data_field.Type()));
             if (!pruned_type.has_value()) {
                 // All sub-fields pruned away — treat as non-existent.
                 continue;
diff --git a/src/paimon/core/utils/nested_projection_utils.cpp b/src/paimon/core/utils/nested_projection_utils.cpp
index f06886611..c1babc2cc 100644
--- a/src/paimon/core/utils/nested_projection_utils.cpp
+++ b/src/paimon/core/utils/nested_projection_utils.cpp
@@ -16,17 +16,23 @@
 
 #include "paimon/core/utils/nested_projection_utils.h"
 
+#include <set>
 #include <string>
 #include <utility>
+#include <vector>
 
 #include "arrow/array/array_nested.h"
+#include "arrow/array/array_primitive.h"
+#include "arrow/array/builder_primitive.h"
+#include "arrow/array/concatenate.h"
 #include "arrow/type.h"
 #include "fmt/format.h"
 #include "paimon/status.h"
+#include "rapidjson/document.h"
 
 namespace paimon {
 
-Result<std::optional<std::shared_ptr<arrow::DataType>>> PruneDataType(
+Result<std::optional<std::shared_ptr<arrow::DataType>>> NestedProjectionUtils::PruneDataType(
     const std::shared_ptr<arrow::DataType>& read_type,
     const std::shared_ptr<arrow::DataType>& data_type) {
     // Identical types need no pruning.
@@ -106,7 +112,7 @@ Result<std::optional<std::shared_ptr<arrow::DataType>>> PruneDataType(
 // PruneArray — fallback for format readers that return extra nested columns
 // ---------------------------------------------------------------------------
 
-Result<std::shared_ptr<arrow::Array>> PruneArray(
+Result<std::shared_ptr<arrow::Array>> NestedProjectionUtils::PruneArray(
     const std::shared_ptr<arrow::Array>& array,
     const std::shared_ptr<arrow::DataType>& target_type) {
     if (!array || array->type()->Equals(target_type)) {
@@ -172,4 +178,141 @@ Result<std::shared_ptr<arrow::Array>> PruneArray(
     }
 }
 
+// ---------------------------------------------------------------------------
+// Map selected-keys support
+// ---------------------------------------------------------------------------
+
+// Parses MAP_SELECTED_KEYS metadata (a JSON string array); absent/malformed => empty set.
+std::set<std::string> NestedProjectionUtils::GetMapSelectedKeys(
+    const std::shared_ptr<arrow::Field>& field) {
+    std::set<std::string> selected;
+    if (!(field && field->HasMetadata() && field->metadata())) {
+        return selected;
+    }
+    auto lookup = field->metadata()->Get(DataField::MAP_SELECTED_KEYS);
+    if (!lookup.ok()) {
+        return selected;
+    }
+    rapidjson::Document parsed;
+    parsed.Parse(lookup.ValueUnsafe().c_str());
+    if (parsed.HasParseError() || !parsed.IsArray()) {
+        return selected;
+    }
+    for (auto it = parsed.Begin(); it != parsed.End(); ++it) {
+        if (it->IsString()) {  // elements that are not JSON strings are skipped
+            selected.emplace(it->GetString(), it->GetStringLength());
+        }
+    }
+    return selected;
+}
+
+// Keeps only the map entries whose key is in `selected_keys`. Row validity is re-encoded as
+// null offsets so sliced inputs stay aligned, and the input's own map type is preserved.
+Result<std::shared_ptr<arrow::Array>> NestedProjectionUtils::FilterMapArrayBySelectedKeys(
+    const std::shared_ptr<arrow::Array>& array,
+    const std::set<std::string>& selected_keys) {
+    if (selected_keys.empty() || !array || array->length() == 0) {
+        return array;
+    }
+    // Guard the static casts below; casting a non-MAP array would be undefined behaviour.
+    if (array->type_id() != arrow::Type::MAP) {
+        return Status::Invalid(fmt::format(
+            "FilterMapArrayBySelectedKeys expects a map array, got {}",
+            array->type()->ToString()));
+    }
+    auto map_array = std::static_pointer_cast<arrow::MapArray>(array);
+    auto map_type = std::static_pointer_cast<arrow::MapType>(array->type());
+
+    if (map_type->key_type()->id() != arrow::Type::STRING) {
+        return Status::Invalid(fmt::format(
+            "FilterMapArrayBySelectedKeys only supports string keys, got {}",
+            map_type->key_type()->ToString()));
+    }
+
+    auto keys_array = std::static_pointer_cast<arrow::StringArray>(map_array->keys());
+    auto values_array = map_array->items();
+    int64_t total_entries = keys_array->length();
+    int64_t num_maps = map_array->length();
+
+    // Mark which flat entries to keep. keys()/items() are the unsliced child arrays, so
+    // the flags live in the same absolute index space as value_offset().
+    std::vector<bool> keep(total_entries, false);
+    int64_t kept_count = 0;
+    for (int64_t i = 0; i < total_entries; ++i) {
+        if (!keys_array->IsNull(i) &&
+            selected_keys.count(std::string(keys_array->GetView(i))) > 0) {
+            keep[i] = true;
+            ++kept_count;
+        }
+    }
+
+    if (kept_count == total_entries) {
+        return array;
+    }
+
+    // Collect kept slices as contiguous runs to build filtered key/value arrays
+    // via Slice + Concatenate (avoids arrow::compute::Take dependency).
+    arrow::ArrayVector key_slices;
+    arrow::ArrayVector value_slices;
+    key_slices.reserve(kept_count);
+    value_slices.reserve(kept_count);
+
+    // Offsets go straight into the builder. A null map is appended as a null offset, which
+    // MapArray::FromArrays converts into a null row; unlike reusing the input bitmap this
+    // stays correct when the input array has a non-zero offset.
+    arrow::Int32Builder offset_builder;
+    PAIMON_RETURN_NOT_OK_FROM_ARROW(offset_builder.Reserve(num_maps + 1));
+    int32_t running_offset = 0;
+
+    for (int64_t map_idx = 0; map_idx < num_maps; ++map_idx) {
+        if (map_array->IsNull(map_idx)) {
+            offset_builder.UnsafeAppendNull();
+            continue;
+        }
+        offset_builder.UnsafeAppend(running_offset);
+        int64_t start = map_array->value_offset(map_idx);
+        int64_t end = map_array->value_offset(map_idx + 1);
+        // Scan one past `end` (never kept) so a run touching the last entry is flushed.
+        int64_t run_start = -1;
+        for (int64_t entry_idx = start; entry_idx <= end; ++entry_idx) {
+            bool should_keep = (entry_idx < end) && keep[entry_idx];
+            if (should_keep && run_start < 0) {
+                run_start = entry_idx;
+            } else if (!should_keep && run_start >= 0) {
+                int64_t run_len = entry_idx - run_start;
+                key_slices.push_back(keys_array->Slice(run_start, run_len));
+                value_slices.push_back(values_array->Slice(run_start, run_len));
+                running_offset += static_cast<int32_t>(run_len);
+                run_start = -1;
+            }
+        }
+    }
+    offset_builder.UnsafeAppend(running_offset);  // closing offset, must be non-null
+
+    // Build filtered key/value arrays
+    std::shared_ptr<arrow::Array> filtered_keys;
+    std::shared_ptr<arrow::Array> filtered_values;
+    if (key_slices.empty()) {
+        // All entries filtered out — create empty arrays
+        filtered_keys = keys_array->Slice(0, 0);
+        filtered_values = values_array->Slice(0, 0);
+    } else if (key_slices.size() == 1) {
+        filtered_keys = key_slices[0];
+        filtered_values = value_slices[0];
+    } else {
+        PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(filtered_keys, arrow::Concatenate(key_slices));
+        PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(filtered_values, arrow::Concatenate(value_slices));
+    }
+
+    std::shared_ptr<arrow::Array> new_offsets_array;
+    PAIMON_RETURN_NOT_OK_FROM_ARROW(offset_builder.Finish(&new_offsets_array));
+
+    // Passing the input type keeps field names, item nullability and keys_sorted.
+    PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(
+        std::shared_ptr<arrow::Array> result_map,
+        arrow::MapArray::FromArrays(array->type(), new_offsets_array, filtered_keys,
+                                    filtered_values, arrow::default_memory_pool()));
+    return result_map;
+}
+
 }  // namespace paimon
diff --git a/src/paimon/core/utils/nested_projection_utils.h b/src/paimon/core/utils/nested_projection_utils.h
index 7bc798a6b..a8ba0fe77 100644
--- a/src/paimon/core/utils/nested_projection_utils.h
+++ b/src/paimon/core/utils/nested_projection_utils.h
@@ -19,6 +19,7 @@
 #include <cstdint>
 #include <memory>
 #include <optional>
+#include <set>
 #include <string>
 #include <vector>
 
@@ -29,53 +30,73 @@
 
 namespace paimon {
 
-/// Extract the paimon field ID from an Arrow field's metadata ("paimon.id").
-/// Returns -1 if the metadata key is not present.
-inline int32_t GetPaimonFieldId(const std::shared_ptr<arrow::Field>& field) {
-    if (!field || !field->HasMetadata() || !field->metadata()) {
-        return -1;
-    }
-    auto result = field->metadata()->Get(DataField::FIELD_ID);
-    if (!result.ok()) {
-        return -1;
-    }
-    std::optional<int32_t> field_id = StringUtils::StringToValue<int32_t>(result.ValueUnsafe());
-    return field_id.value_or(-1);
-}
+/// Utility class for nested column pruning and map key selection.
+class NestedProjectionUtils {
+ public:
+    NestedProjectionUtils() = delete;
 
-/// Find a child field in a STRUCT DataType by paimon field ID.
-/// Returns nullptr if no child has the given ID.
-inline std::shared_ptr<arrow::Field> FindFieldByPaimonId(
-    const std::shared_ptr<arrow::DataType>& struct_type, int32_t field_id) {
-    if (!struct_type || struct_type->id() != arrow::Type::STRUCT) {
-        return nullptr;
+    /// Extract the paimon field ID from an Arrow field's metadata ("paimon.id").
+    /// Returns -1 if the metadata key is not present.
+    static int32_t GetPaimonFieldId(const std::shared_ptr<arrow::Field>& field) {
+        if (!field || !field->HasMetadata() || !field->metadata()) {
+            return -1;
+        }
+        auto result = field->metadata()->Get(DataField::FIELD_ID);
+        if (!result.ok()) {
+            return -1;
+        }
+        std::optional<int32_t> field_id =
+            StringUtils::StringToValue<int32_t>(result.ValueUnsafe());
+        return field_id.value_or(-1);
     }
-    for (const auto& child : struct_type->fields()) {
-        if (GetPaimonFieldId(child) == field_id) {
-            return child;
+
+    /// Find a child field in a STRUCT DataType by paimon field ID.
+    /// Returns nullptr if no child has the given ID.
+    static std::shared_ptr<arrow::Field> FindFieldByPaimonId(
+        const std::shared_ptr<arrow::DataType>& struct_type, int32_t field_id) {
+        if (!struct_type || struct_type->id() != arrow::Type::STRUCT) {
+            return nullptr;
+        }
+        for (const auto& child : struct_type->fields()) {
+            if (GetPaimonFieldId(child) == field_id) {
+                return child;
+            }
         }
+        return nullptr;
     }
-    return nullptr;
-}
 
-/// Recursively prune `data_type` so that only the sub-fields requested by
-/// `read_type` are retained. Matching is done by paimon field ID to support
-/// schema evolution (field renames).
-///
-/// Supported nesting: STRUCT, LIST (element recurse), MAP (key/value recurse).
-/// For atomic types, `data_type` is returned as-is.
-///
-/// Returns std::nullopt when all sub-fields of a STRUCT are pruned away
-/// (caller should skip this field entirely, mirroring Java's null return).
-Result<std::optional<std::shared_ptr<arrow::DataType>>> PruneDataType(
-    const std::shared_ptr<arrow::DataType>& read_type,
-    const std::shared_ptr<arrow::DataType>& data_type);
+    /// Recursively prune `data_type` so that only the sub-fields requested by
+    /// `read_type` are retained. Matching is done by paimon field ID to support
+    /// schema evolution (field renames).
+    ///
+    /// Supported nesting: STRUCT, LIST (element recurse), MAP (key/value recurse).
+    /// For atomic types, `data_type` is returned as-is.
+    ///
+    /// Returns std::nullopt when all sub-fields of a STRUCT are pruned away
+    /// (caller should skip this field entirely, mirroring Java's null return).
+    static Result<std::optional<std::shared_ptr<arrow::DataType>>> PruneDataType(
+        const std::shared_ptr<arrow::DataType>& read_type,
+        const std::shared_ptr<arrow::DataType>& data_type);
+
+    /// Prune a StructArray so that only the sub-fields present in `target_type`
+    /// are kept. Used as a fallback when the format reader returns more columns
+    /// than requested.
+    static Result<std::shared_ptr<arrow::Array>> PruneArray(
+        const std::shared_ptr<arrow::Array>& array,
+        const std::shared_ptr<arrow::DataType>& target_type);
+
+    /// Parse the DataField::MAP_SELECTED_KEYS metadata entry of an Arrow field.
+    /// Returns an empty set if the entry is absent or is not a valid JSON array; the field
+    /// type is not checked. The value must be a JSON array of strings, e.g. '["key1","key2"]'.
+    static std::set<std::string> GetMapSelectedKeys(
+        const std::shared_ptr<arrow::Field>& field);
 
-/// Prune a StructArray so that only the sub-fields present in `target_type`
-/// are kept. Used as a fallback when the format reader returns more columns
-/// than requested.
-Result<std::shared_ptr<arrow::Array>> PruneArray(
-    const std::shared_ptr<arrow::Array>& array,
-    const std::shared_ptr<arrow::DataType>& target_type);
+    /// Filter a MapArray so that only entries whose key is in `selected_keys` are kept.
+    /// Only string-keyed maps are supported; other key types yield an Invalid status.
+    /// The input array is returned as-is when `selected_keys` is empty or no entry is dropped.
+    static Result<std::shared_ptr<arrow::Array>> FilterMapArrayBySelectedKeys(
+        const std::shared_ptr<arrow::Array>& map_array,
+        const std::set<std::string>& selected_keys);
+};
 
 }  // namespace paimon
diff --git a/src/paimon/core/utils/nested_projection_utils_test.cpp b/src/paimon/core/utils/nested_projection_utils_test.cpp
index f66932885..c7add426a 100644
--- a/src/paimon/core/utils/nested_projection_utils_test.cpp
+++ b/src/paimon/core/utils/nested_projection_utils_test.cpp
@@ -38,16 +38,16 @@ static std::shared_ptr<arrow::Field> MakeField(const std::string& name,
 
 TEST(NestedProjectionUtilsTest, GetPaimonFieldId_Present) {
     auto field = MakeField("col", arrow::int32(), 42);
-    ASSERT_EQ(GetPaimonFieldId(field), 42);
+    ASSERT_EQ(NestedProjectionUtils::GetPaimonFieldId(field), 42);
 }
 
 TEST(NestedProjectionUtilsTest, GetPaimonFieldId_Missing) {
     auto field = arrow::field("col", arrow::int32());
-    ASSERT_EQ(GetPaimonFieldId(field), -1);
+    ASSERT_EQ(NestedProjectionUtils::GetPaimonFieldId(field), -1);
 }
 
 TEST(NestedProjectionUtilsTest, GetPaimonFieldId_Nullptr) {
-    ASSERT_EQ(GetPaimonFieldId(nullptr), -1);
+    ASSERT_EQ(NestedProjectionUtils::GetPaimonFieldId(nullptr), -1);
 }
 
 // ============== FindFieldByPaimonId ==============
@@ -55,25 +55,25 @@ TEST(NestedProjectionUtilsTest, GetPaimonFieldId_Nullptr) {
 TEST(NestedProjectionUtilsTest, FindFieldByPaimonId_Found) {
     auto struct_type = arrow::struct_({MakeField("x", arrow::int32(), 1),
                                        MakeField("y", arrow::utf8(), 2)});
-    auto found = FindFieldByPaimonId(struct_type, 2);
+    auto found = NestedProjectionUtils::FindFieldByPaimonId(struct_type, 2);
     ASSERT_NE(found, nullptr);
     ASSERT_EQ(found->name(), "y");
 }
 
 TEST(NestedProjectionUtilsTest, FindFieldByPaimonId_NotFound) {
     auto struct_type = arrow::struct_({MakeField("x", arrow::int32(), 1)});
-    ASSERT_EQ(FindFieldByPaimonId(struct_type, 99), nullptr);
+    ASSERT_EQ(NestedProjectionUtils::FindFieldByPaimonId(struct_type, 99), nullptr);
 }
 
 TEST(NestedProjectionUtilsTest, FindFieldByPaimonId_NonStruct) {
-    ASSERT_EQ(FindFieldByPaimonId(arrow::int32(), 1), nullptr);
+    ASSERT_EQ(NestedProjectionUtils::FindFieldByPaimonId(arrow::int32(), 1), nullptr);
 }
 
 // ============== PruneDataType ==============
 
 TEST(NestedProjectionUtilsTest, PruneDataType_IdenticalTypes) {
     auto type = arrow::int32();
-    ASSERT_OK_AND_ASSIGN(auto result, PruneDataType(type, type));
+    ASSERT_OK_AND_ASSIGN(auto result, NestedProjectionUtils::PruneDataType(type, type));
     ASSERT_TRUE(result.has_value());
     ASSERT_TRUE(result.value()->Equals(type));
 }
@@ -82,7 +82,7 @@ TEST(NestedProjectionUtilsTest, PruneDataType_AtomicType) {
     // Different atomic types: return data_type
     auto read_type = arrow::int64();
     auto data_type = arrow::int32();
-    ASSERT_OK_AND_ASSIGN(auto result, PruneDataType(read_type, data_type));
+    ASSERT_OK_AND_ASSIGN(auto result, NestedProjectionUtils::PruneDataType(read_type, data_type));
     ASSERT_TRUE(result.has_value());
     ASSERT_TRUE(result.value()->Equals(data_type));
 }
@@ -96,7 +96,7 @@ TEST(NestedProjectionUtilsTest, PruneDataType_StructPruneSubset) {
                                      MakeField("z", arrow::float64(), 3)});
     auto read_type = arrow::struct_({MakeField("x", arrow::int32(), 1)});
 
-    ASSERT_OK_AND_ASSIGN(auto result, PruneDataType(read_type, data_type));
+    ASSERT_OK_AND_ASSIGN(auto result, NestedProjectionUtils::PruneDataType(read_type, data_type));
     ASSERT_TRUE(result.has_value());
     ASSERT_EQ(result.value()->num_fields(), 1);
     ASSERT_EQ(result.value()->field(0)->name(), "x");
@@ -109,7 +109,7 @@ TEST(NestedProjectionUtilsTest, PruneDataType_StructAllFieldsPruned) {
     auto data_type = arrow::struct_({MakeField("x", arrow::int32(), 1)});
     auto read_type = arrow::struct_({MakeField("y", arrow::int32(), 99)});
 
-    ASSERT_OK_AND_ASSIGN(auto result, PruneDataType(read_type, data_type));
+    ASSERT_OK_AND_ASSIGN(auto result, NestedProjectionUtils::PruneDataType(read_type, data_type));
     ASSERT_FALSE(result.has_value());
 }
 
@@ -124,7 +124,7 @@ TEST(NestedProjectionUtilsTest, PruneDataType_NestedStruct) {
     auto inner_read = arrow::struct_({MakeField("a", arrow::int32(), 10)});
     auto read_type = arrow::struct_({MakeField("inner", inner_read, 1)});
 
-    ASSERT_OK_AND_ASSIGN(auto result, PruneDataType(read_type, data_type));
+    ASSERT_OK_AND_ASSIGN(auto result, NestedProjectionUtils::PruneDataType(read_type, data_type));
     ASSERT_TRUE(result.has_value());
     ASSERT_EQ(result.value()->num_fields(), 1);
     auto pruned_inner = result.value()->field(0)->type();
@@ -142,7 +142,7 @@ TEST(NestedProjectionUtilsTest, PruneDataType_ListWithStructElement) {
     auto inner_read = arrow::struct_({MakeField("a", arrow::int32(), 10)});
     auto read_type = arrow::list(arrow::field("item", inner_read));
 
-    ASSERT_OK_AND_ASSIGN(auto result, PruneDataType(read_type, data_type));
+    ASSERT_OK_AND_ASSIGN(auto result, NestedProjectionUtils::PruneDataType(read_type, data_type));
     ASSERT_TRUE(result.has_value());
     auto list_type = std::dynamic_pointer_cast<arrow::ListType>(result.value());
     ASSERT_NE(list_type, nullptr);
@@ -160,7 +160,7 @@ TEST(NestedProjectionUtilsTest, PruneDataType_MapWithStructValue) {
     auto inner_read = arrow::struct_({MakeField("a", arrow::int32(), 10)});
     auto read_type = arrow::map(arrow::utf8(), inner_read);
 
-    ASSERT_OK_AND_ASSIGN(auto result, PruneDataType(read_type, data_type));
+    ASSERT_OK_AND_ASSIGN(auto result, NestedProjectionUtils::PruneDataType(read_type, data_type));
     ASSERT_TRUE(result.has_value());
     auto map_type = std::dynamic_pointer_cast<arrow::MapType>(result.value());
     ASSERT_NE(map_type, nullptr);
@@ -192,7 +192,7 @@ TEST(NestedProjectionUtilsTest, PruneArray_StructPrune) {
 
     // Prune to only keep "x"
     auto target_type = arrow::struct_({arrow::field("x", arrow::int32())});
-    ASSERT_OK_AND_ASSIGN(auto pruned, PruneArray(struct_array, target_type));
+    ASSERT_OK_AND_ASSIGN(auto pruned, NestedProjectionUtils::PruneArray(struct_array, target_type));
 
     ASSERT_EQ(pruned->type()->num_fields(), 1);
     ASSERT_EQ(pruned->type()->field(0)->name(), "x");
@@ -205,8 +205,143 @@ TEST(NestedProjectionUtilsTest, PruneArray_IdenticalType) {
     std::shared_ptr<arrow::Array> array;
     ASSERT_TRUE(builder.Finish(&array).ok());
 
-    ASSERT_OK_AND_ASSIGN(auto pruned, PruneArray(array, arrow::int32()));
+    ASSERT_OK_AND_ASSIGN(auto pruned, NestedProjectionUtils::PruneArray(array, arrow::int32()));
     ASSERT_EQ(pruned.get(), array.get());  // Same pointer — no copy.
 }
 
+// ============== GetMapSelectedKeys ==============
+
+TEST(NestedProjectionUtilsTest, GetMapSelectedKeys_Present) {
+    auto metadata = arrow::KeyValueMetadata::Make(
+        {DataField::MAP_SELECTED_KEYS}, {R"(["key1","key2","key3"])"});
+    auto field = arrow::field("m", arrow::map(arrow::utf8(), arrow::int32()), /*nullable=*/true,
+                              metadata);
+    auto keys = NestedProjectionUtils::GetMapSelectedKeys(field);
+    ASSERT_EQ(keys.size(), 3);
+    ASSERT_TRUE(keys.count("key1"));
+    ASSERT_TRUE(keys.count("key2"));
+    ASSERT_TRUE(keys.count("key3"));
+}
+
+TEST(NestedProjectionUtilsTest, GetMapSelectedKeys_Absent) {
+    auto field = arrow::field("m", arrow::map(arrow::utf8(), arrow::int32()));
+    auto keys = NestedProjectionUtils::GetMapSelectedKeys(field);
+    ASSERT_TRUE(keys.empty());
+}
+
+TEST(NestedProjectionUtilsTest, GetMapSelectedKeys_InvalidJson) {
+    auto metadata = arrow::KeyValueMetadata::Make(
+        {DataField::MAP_SELECTED_KEYS}, {"not_json"});
+    auto field = arrow::field("m", arrow::map(arrow::utf8(), arrow::int32()), /*nullable=*/true,
+                              metadata);
+    auto keys = NestedProjectionUtils::GetMapSelectedKeys(field);
+    ASSERT_TRUE(keys.empty());
+}
+
+TEST(NestedProjectionUtilsTest, GetMapSelectedKeys_Nullptr) {
+    auto keys = NestedProjectionUtils::GetMapSelectedKeys(nullptr);
+    ASSERT_TRUE(keys.empty());
+}
+
+// ============== FilterMapArrayBySelectedKeys ==============
+
+// Helper to build a MapArray<string, int32>; row i becomes null when null_mask[i] is false.
+static std::shared_ptr<arrow::Array> BuildStringInt32MapArray(
+    const std::vector<std::vector<std::pair<std::string, int32_t>>>& maps,
+    const std::vector<bool>& null_mask = {}) {
+    auto key_builder = std::make_shared<arrow::StringBuilder>();
+    auto value_builder = std::make_shared<arrow::Int32Builder>();
+    arrow::MapBuilder map_builder(arrow::default_memory_pool(), key_builder, value_builder);
+    for (size_t i = 0; i < maps.size(); ++i) {
+        if (!null_mask.empty() && !null_mask[i]) {
+            EXPECT_TRUE(map_builder.AppendNull().ok());
+            continue;
+        }
+        EXPECT_TRUE(map_builder.Append().ok());
+        for (const auto& [k, v] : maps[i]) {
+            EXPECT_TRUE(key_builder->Append(k).ok());
+            EXPECT_TRUE(value_builder->Append(v).ok());
+        }
+    }
+    std::shared_ptr<arrow::Array> result;
+    EXPECT_TRUE(map_builder.Finish(&result).ok());
+    return result;
+}
+
+TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_Basic) {
+    // Two maps (3 and 2 entries); select only "a" and "c"
+    auto map_array = BuildStringInt32MapArray({
+        {{"a", 1}, {"b", 2}, {"c", 3}},
+        {{"a", 10}, {"d", 40}},
+    });
+
+    std::set<std::string> selected = {"a", "c"};
+    ASSERT_OK_AND_ASSIGN(auto filtered, NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, selected));
+
+    auto result = std::static_pointer_cast<arrow::MapArray>(filtered);
+    ASSERT_EQ(result->length(), 2);
+
+    // First map: should have "a" and "c"
+    ASSERT_EQ(result->value_length(0), 2);
+    auto keys0 = std::static_pointer_cast<arrow::StringArray>(result->keys());
+    ASSERT_EQ(keys0->GetString(result->value_offset(0)), "a");
+    ASSERT_EQ(keys0->GetString(result->value_offset(0) + 1), "c");
+
+    // Second map: should have only "a"
+    ASSERT_EQ(result->value_length(1), 1);
+    ASSERT_EQ(keys0->GetString(result->value_offset(1)), "a");
+}
+
+TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_EmptySelectedKeys) {
+    auto map_array = BuildStringInt32MapArray({{{"a", 1}}});
+    std::set<std::string> empty_keys;
+    ASSERT_OK_AND_ASSIGN(auto filtered, NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, empty_keys));
+    // Should return original array unchanged
+    ASSERT_EQ(filtered.get(), map_array.get());
+}
+
+TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_AllKept) {
+    auto map_array = BuildStringInt32MapArray({{{"a", 1}, {"b", 2}}});
+    std::set<std::string> selected = {"a", "b"};
+    ASSERT_OK_AND_ASSIGN(auto filtered, NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, selected));
+    // All entries match, should return original
+    ASSERT_EQ(filtered.get(), map_array.get());
+}
+
+TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_NoneKept) {
+    auto map_array = BuildStringInt32MapArray({{{"a", 1}, {"b", 2}}});
+    std::set<std::string> selected = {"x", "y"};
+    ASSERT_OK_AND_ASSIGN(auto filtered, NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, selected));
+    auto result = std::static_pointer_cast<arrow::MapArray>(filtered);
+    ASSERT_EQ(result->length(), 1);
+    ASSERT_EQ(result->value_length(0), 0);
+}
+
+TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_WithNull) {
+    // maps[0] = {"a":1}, maps[1] = null, maps[2] = {"b":2,"c":3}
+    auto map_array = BuildStringInt32MapArray(
+        {{{"a", 1}}, {}, {{"b", 2}, {"c", 3}}},
+        {true, false, true});
+
+    std::set<std::string> selected = {"a", "c"};
+    ASSERT_OK_AND_ASSIGN(auto filtered, NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, selected));
+    auto result = std::static_pointer_cast<arrow::MapArray>(filtered);
+    ASSERT_EQ(result->length(), 3);
+    // maps[0] = {"a":1}
+    ASSERT_EQ(result->value_length(0), 1);
+    // maps[1] = null
+    ASSERT_TRUE(result->IsNull(1));
+    // maps[2] = {"c":3}
+    ASSERT_EQ(result->value_length(2), 1);
+    auto keys = std::static_pointer_cast<arrow::StringArray>(result->keys());
+    ASSERT_EQ(keys->GetString(result->value_offset(2)), "c");
+}
+
+TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_EmptyArray) {
+    auto map_array = BuildStringInt32MapArray({});
+    std::set<std::string> selected = {"a"};
+    ASSERT_OK_AND_ASSIGN(auto filtered, NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, selected));
+    ASSERT_EQ(filtered->length(), 0);
+}
+
 }  // namespace paimon::test
diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.cpp b/src/paimon/format/parquet/parquet_file_batch_reader.cpp
index 2e91424ea..c35f70288 100644
--- a/src/paimon/format/parquet/parquet_file_batch_reader.cpp
+++ b/src/paimon/format/parquet/parquet_file_batch_reader.cpp
@@ -454,9 +454,9 @@ void ParquetFileBatchReader::CollectLeafIndices(const std::shared_ptr<arrow::Dat
                                                 std::vector<int32_t>* indices) {
     if (file_type->id() == arrow::Type::STRUCT) {
         for (const auto& file_child : file_type->fields()) {
-            int32_t file_child_id = GetPaimonFieldId(file_child);
+            int32_t file_child_id = NestedProjectionUtils::GetPaimonFieldId(file_child);
             std::shared_ptr<arrow::Field> read_child =
-                FindFieldByPaimonId(read_type, file_child_id);
+                NestedProjectionUtils::FindFieldByPaimonId(read_type, file_child_id);
             if (read_child) {
                 CollectLeafIndices(read_child->type(), file_child->type(), leaf_index, indices);
             } else {
@@ -499,11 +499,11 @@ Result<std::vector<int32_t>> ParquetFileBatchReader::ComputeNestedColumnIndices(
     int32_t leaf_index = 0;
 
     for (const auto& file_field : file_schema->fields()) {
-        int32_t file_field_id = GetPaimonFieldId(file_field);
+        int32_t file_field_id = NestedProjectionUtils::GetPaimonFieldId(file_field);
         // Find matching field in read_schema by paimon field ID.
         std::shared_ptr<arrow::Field> read_field = nullptr;
         for (const auto& candidate : read_schema->fields()) {
-            if (GetPaimonFieldId(candidate) == file_field_id) {
+            if (NestedProjectionUtils::GetPaimonFieldId(candidate) == file_field_id) {
                 read_field = candidate;
                 break;
             }
diff --git a/test/inte/nested_column_pruning_inte_test.cpp b/test/inte/nested_column_pruning_inte_test.cpp
index 857867e62..95c9dc3ae 100644
--- a/test/inte/nested_column_pruning_inte_test.cpp
+++ b/test/inte/nested_column_pruning_inte_test.cpp
@@ -373,6 +373,101 @@ TEST_P(NestedColumnPruningInteTest, PruneDeepNestedStruct) {
     ASSERT_TRUE(is_equal);
 }
 
+// Test: Table has MAP<STRING, INT32> field, read with selected keys filter.
+TEST_P(NestedColumnPruningInteTest, MapSelectedKeys) {
+    // Table schema: f0 (int32), f1 (map<string, int32>)
+    auto map_type = arrow::map(arrow::utf8(), arrow::int32());
+    arrow::FieldVector table_fields = {
+        arrow::field("f0", arrow::int32()),
+        arrow::field("f1", map_type),
+    };
+    auto table_schema = arrow::schema(table_fields);
+
+    std::map<std::string, std::string> options = {
+        {Options::MANIFEST_FORMAT, "AVRO"},
+        {Options::FILE_FORMAT, StringUtils::ToUpperCase(file_format_)},
+        {Options::TARGET_FILE_SIZE, "1024"},
+        {Options::BUCKET, "-1"},
+    };
+
+    ASSERT_OK_AND_ASSIGN(
+        auto helper,
+        TestHelper::Create(test_dir_, table_schema, /*partition_keys=*/{},
+                           /*primary_keys=*/{}, options, /*is_streaming_mode=*/false));
+
+    // Write data: three rows whose maps use different subsets of the keys "a", "b", "c", "d"
+    std::string data = R"([
+        [1, [["a", 10], ["b", 20], ["c", 30]]],
+        [2, [["a", 100], ["c", 300]]],
+        [3, [["b", 200], ["c", 400], ["d", 500]]]
+    ])";
+    ASSERT_OK_AND_ASSIGN(
+        auto batch,
+        TestHelper::MakeRecordBatch(arrow::struct_(table_fields), data,
+                                    /*partition_map=*/{}, /*bucket=*/0, {}));
+    int64_t commit_identifier = 0;
+    ASSERT_OK_AND_ASSIGN(
+        auto commit_msgs,
+        helper->WriteAndCommit(std::move(batch), commit_identifier++,
+                               /*expected_commit_messages=*/std::nullopt));
+
+    // Scan to get splits
+    ASSERT_OK_AND_ASSIGN(
+        auto data_splits,
+        helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt));
+    ASSERT_FALSE(data_splits.empty());
+
+    // Build projected schema: read f0 and f1 with selected keys ["a", "c"]
+    auto selected_keys_metadata = arrow::KeyValueMetadata::Make(
+        {DataField::MAP_SELECTED_KEYS}, {R"(["a","c"])"});
+    arrow::FieldVector projected_fields = {
+        AnnotateField(arrow::field("f0", arrow::int32()), 0),
+        AnnotateField(arrow::field("f1", map_type), 1)->WithMetadata(
+            AnnotateField(arrow::field("f1", map_type), 1)
+                ->metadata()->Merge(*selected_keys_metadata)),
+    };
+    auto projected_schema = arrow::schema(projected_fields);
+
+    // Export to C ArrowSchema
+    ArrowSchema c_schema;
+    ASSERT_TRUE(arrow::ExportSchema(*projected_schema, &c_schema).ok());
+
+    // Read with projected schema
+    ReadContextBuilder read_context_builder(table_path_);
+    read_context_builder.SetOptions(options).SetReadSchema(&c_schema);
+    ASSERT_OK_AND_ASSIGN(auto read_context, read_context_builder.Finish());
+    ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context)));
+    ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(data_splits));
+    ASSERT_OK_AND_ASSIGN(auto read_result,
+                         ReadResultCollector::CollectResult(batch_reader.get()));
+
+    // Expected: only keys "a" and "c" remain in each map
+    arrow::FieldVector expected_fields = {
+        arrow::field("_VALUE_KIND", arrow::int8()),
+        arrow::field("f0", arrow::int32()),
+        arrow::field("f1", arrow::map(arrow::utf8(), arrow::int32())),
+    };
+    auto expected_type = arrow::struct_(expected_fields);
+    std::string expected_data = R"([
+        [0, 1, [["a", 10], ["c", 30]]],
+        [0, 2, [["a", 100], ["c", 300]]],
+        [0, 3, [["c", 400]]]
+    ])";
+    auto expected_array =
+        arrow::ipc::internal::json::ArrayFromJSON(expected_type, expected_data).ValueOrDie();
+    auto expected_chunked = std::make_shared<arrow::ChunkedArray>(expected_array);
+
+    arrow::EqualOptions equal_options = arrow::EqualOptions::Defaults();
+    bool is_equal = expected_chunked->Equals(read_result, equal_options.diff_sink(&std::cout));
+    if (!is_equal) {
+        std::cout << "[expected_type] " << expected_chunked->type()->ToString() << std::endl;
+        std::cout << "[actual_type]   " << read_result->type()->ToString() << std::endl;
+        std::cout << "[expected] " << expected_chunked->ToString() << std::endl;
+        std::cout << "[actual]   " << read_result->ToString() << std::endl;
+    }
+    ASSERT_TRUE(is_equal);
+}
+
 INSTANTIATE_TEST_SUITE_P(FileFormats, NestedColumnPruningInteTest,
                          ::testing::Values("parquet"));
 

From 17fde055ec6a65727a04ed98c09f27c24543b2f1 Mon Sep 17 00:00:00 2001
From: "yonghao.fyh" <yonghao.fyh@alibaba-inc.com>
Date: Fri, 12 Jun 2026 11:50:16 +0800
Subject: [PATCH 03/24] fix

---
 include/paimon/read_context.h                 | 26 +++++-----
 .../common/memory/memory_segment_test.cpp     |  4 +-
 src/paimon/core/operation/read_context.cpp    | 52 ++++++-------------
 .../core/utils/nested_projection_utils.cpp    |  6 +--
 .../core/utils/nested_projection_utils.h      |  2 +-
 .../utils/nested_projection_utils_test.cpp    |  2 +-
 .../parquet/parquet_file_batch_reader.cpp     |  2 -
 test/inte/nested_column_pruning_inte_test.cpp |  2 +-
 8 files changed, 34 insertions(+), 62 deletions(-)

diff --git a/include/paimon/read_context.h b/include/paimon/read_context.h
index 47165e24f..fcaec06ff 100644
--- a/include/paimon/read_context.h
+++ b/include/paimon/read_context.h
@@ -45,7 +45,7 @@ class FileSystem;
 class PAIMON_EXPORT ReadContext {
  public:
     ReadContext(const std::string& path, const std::string& branch,
-                const std::vector<std::string>& read_schema,
+                const std::vector<std::string>& read_field_names,
                 const std::vector<int32_t>& read_field_ids,
                 const std::shared_ptr<Predicate>& predicate, bool enable_predicate_filter,
                 bool enable_prefetch, uint32_t prefetch_batch_count,
@@ -77,7 +77,7 @@ class PAIMON_EXPORT ReadContext {
     }
 
     const std::vector<std::string>& GetReadFieldNames() const {
-        return read_schema_;
+        return read_field_names_;
     }
 
     const std::vector<int32_t>& GetReadFieldIds() const {
@@ -133,24 +133,23 @@ class PAIMON_EXPORT ReadContext {
 
     /// Whether a read schema (C ArrowSchema) for nested column pruning was provided.
     bool HasReadSchema() const {
-        return has_read_schema_;
+        return read_schema_ != nullptr;
     }
 
     /// Get the read schema as a mutable C ArrowSchema pointer.
     /// ImportSchema will consume (release) the schema content.
     ArrowSchema* GetReadSchema() {
-        return &read_schema_c_;
+        return read_schema_;
     }
 
-    /// Set the read schema from a C ArrowSchema. Moves the content into this object.
-    /// The input schema's release will be set to nullptr after the move.
+    /// Borrow a C ArrowSchema pointer as the read schema (not owned; must outlive this object).
     /// Called internally by ReadContextBuilder.
     void SetReadSchema(ArrowSchema* schema);
 
  private:
     std::string path_;
     std::string branch_;
-    std::vector<std::string> read_schema_;
+    std::vector<std::string> read_field_names_;
     std::vector<int32_t> read_field_ids_;
     std::shared_ptr<Predicate> predicate_;
     bool enable_predicate_filter_;
@@ -168,8 +167,7 @@ class PAIMON_EXPORT ReadContext {
     PrefetchCacheMode prefetch_cache_mode_;
     CacheConfig cache_config_;
     std::shared_ptr<Cache> cache_;
-    ArrowSchema read_schema_c_{};   // C ABI schema for nested column pruning
-    bool has_read_schema_ = false;  // whether read_schema_c_ holds valid content
+    ArrowSchema* read_schema_ = nullptr;  // C ABI schema for nested column pruning, not owned
 };
 
 /// `ReadContextBuilder` used to build a `ReadContext`, has input validation.
@@ -209,18 +207,18 @@ class PAIMON_EXPORT ReadContextBuilder {
     ///       Calling both will ignore the read schema set by SetReadFieldNames().
     ReadContextBuilder& SetReadFieldIds(const std::vector<int32_t>& read_field_ids);
 
-    /// Set the projected Arrow Schema for nested column pruning.
+    /// Set the read Arrow Schema for nested column pruning.
     ///
-    /// The projected schema is an Arrow C Data Interface schema where STRUCT types
+    /// The read schema is an Arrow C Data Interface schema where STRUCT types
     /// may contain only a subset of the original sub-fields, enabling nested column
     /// pruning to reduce I/O. Each Arrow field must carry a "paimon.id" metadata
     /// entry for field matching.
     ///
-    /// @param projected_schema Arrow C Schema (consumed/released by this call).
+    /// @param read_schema Arrow C Schema, borrowed: it must outlive the ReadContext built from it.
     /// @return Reference to this builder for method chaining.
-    /// @note Priority: projected_arrow_schema > read_field_ids > read_field_names.
+    /// @note Priority: read_schema > read_field_ids > read_field_names.
     ///       When set, read_field_ids and read_field_names are ignored.
-    ReadContextBuilder& SetReadSchema(ArrowSchema* projected_schema);
+    ReadContextBuilder& SetReadSchema(ArrowSchema* read_schema);
 
     /// Set a configuration options map to set some option entries which are not defined in the
     /// table schema or whose values you want to overwrite.
diff --git a/src/paimon/common/memory/memory_segment_test.cpp b/src/paimon/common/memory/memory_segment_test.cpp
index 95a7a6ed9..c79a26d5c 100644
--- a/src/paimon/common/memory/memory_segment_test.cpp
+++ b/src/paimon/common/memory/memory_segment_test.cpp
@@ -555,9 +555,7 @@ TEST(MemorySegmentTest, TestDoubleAccess) {
     delete[] occupied;
 }
 
-// ------------------------------------------------------------------------
-//  Bulk Byte Movements
-// ------------------------------------------------------------------------
+// Bulk Byte Movements
 
 TEST(MemorySegmentTest, TestBulkByteAccess) {
     auto pool = paimon::GetDefaultPool();
diff --git a/src/paimon/core/operation/read_context.cpp b/src/paimon/core/operation/read_context.cpp
index c2c7306a5..4fe89abe4 100644
--- a/src/paimon/core/operation/read_context.cpp
+++ b/src/paimon/core/operation/read_context.cpp
@@ -30,19 +30,20 @@ namespace paimon {
 class Predicate;
 
 ReadContext::ReadContext(
-    const std::string& path, const std::string& branch, const std::vector<std::string>& read_schema,
-    const std::vector<int32_t>& read_field_ids, const std::shared_ptr<Predicate>& predicate,
-    bool enable_predicate_filter, bool enable_prefetch, uint32_t prefetch_batch_count,
-    uint32_t prefetch_max_parallel_num, bool enable_multi_thread_row_to_batch,
-    uint32_t row_to_batch_thread_number, const std::optional<std::string>& table_schema,
-    const std::shared_ptr<MemoryPool>& memory_pool, const std::shared_ptr<Executor>& executor,
+    const std::string& path, const std::string& branch,
+    const std::vector<std::string>& read_field_names, const std::vector<int32_t>& read_field_ids,
+    const std::shared_ptr<Predicate>& predicate, bool enable_predicate_filter, bool enable_prefetch,
+    uint32_t prefetch_batch_count, uint32_t prefetch_max_parallel_num,
+    bool enable_multi_thread_row_to_batch, uint32_t row_to_batch_thread_number,
+    const std::optional<std::string>& table_schema, const std::shared_ptr<MemoryPool>& memory_pool,
+    const std::shared_ptr<Executor>& executor,
     const std::shared_ptr<FileSystem>& specific_file_system,
     const std::map<std::string, std::string>& fs_scheme_to_identifier_map,
     const std::map<std::string, std::string>& options, PrefetchCacheMode prefetch_cache_mode,
     const CacheConfig& cache_config, const std::shared_ptr<Cache>& cache)
     : path_(path),
       branch_(branch),
-      read_schema_(read_schema),
+      read_field_names_(read_field_names),
       read_field_ids_(read_field_ids),
       predicate_(predicate),
       enable_predicate_filter_(enable_predicate_filter),
@@ -61,18 +62,11 @@ ReadContext::ReadContext(
       cache_config_(cache_config),
       cache_(cache) {}
 
-ReadContext::~ReadContext() {
-    if (has_read_schema_ && read_schema_c_.release) {
-        read_schema_c_.release(&read_schema_c_);
-    }
-}
+ReadContext::~ReadContext() {}
 
 void ReadContext::SetReadSchema(ArrowSchema* schema) {
     if (schema && schema->release) {
-        // Move the C schema content into our member. After move, source's release is nullptr.
-        read_schema_c_ = *schema;
-        schema->release = nullptr;
-        has_read_schema_ = true;
+        read_schema_ = schema;
     }
 }
 
@@ -83,10 +77,7 @@ class ReadContextBuilder::Impl {
         branch_ = BranchManager::DEFAULT_MAIN_BRANCH;
         read_field_names_.clear();
         read_field_ids_.clear();
-        if (projected_c_schema_.release) {
-            projected_c_schema_.release(&projected_c_schema_);
-        }
-        projected_c_schema_ = {};
+        read_schema_ = nullptr;
         fs_scheme_to_identifier_map_.clear();
         options_.clear();
         predicate_.reset();
@@ -110,7 +101,7 @@ class ReadContextBuilder::Impl {
     std::string branch_ = BranchManager::DEFAULT_MAIN_BRANCH;
     std::vector<std::string> read_field_names_;
     std::vector<int32_t> read_field_ids_;
-    ArrowSchema projected_c_schema_{};
+    ArrowSchema* read_schema_ = nullptr;
     std::map<std::string, std::string> fs_scheme_to_identifier_map_;
     std::map<std::string, std::string> options_;
     std::shared_ptr<Predicate> predicate_;
@@ -162,18 +153,9 @@ ReadContextBuilder& ReadContextBuilder::SetReadFieldIds(
     return *this;
 }
 
-ReadContextBuilder& ReadContextBuilder::SetReadSchema(ArrowSchema* projected_schema) {
-    if (projected_schema && projected_schema->release) {
-        // Import consumes the input C schema, then export a fresh copy into our member.
-        auto import_result = arrow::ImportSchema(projected_schema);
-        if (import_result.ok()) {
-            // Release any previously held schema.
-            if (impl_->projected_c_schema_.release) {
-                impl_->projected_c_schema_.release(&impl_->projected_c_schema_);
-            }
-            impl_->projected_c_schema_ = {};
-            (void)arrow::ExportSchema(*import_result.ValueUnsafe(), &impl_->projected_c_schema_);
-        }
+ReadContextBuilder& ReadContextBuilder::SetReadSchema(ArrowSchema* read_schema) {
+    if (read_schema && read_schema->release) {
+        impl_->read_schema_ = read_schema;
     }
     return *this;
 }
@@ -298,8 +280,8 @@ Result<std::unique_ptr<ReadContext>> ReadContextBuilder::Finish() {
         impl_->table_schema_, impl_->memory_pool_, impl_->executor_, impl_->specific_file_system_,
         impl_->fs_scheme_to_identifier_map_, impl_->options_, impl_->prefetch_cache_mode_,
         impl_->cache_config_, impl_->cache_);
-    if (impl_->projected_c_schema_.release) {
-        ctx->SetReadSchema(&impl_->projected_c_schema_);
+    if (impl_->read_schema_ && impl_->read_schema_->release) {
+        ctx->SetReadSchema(impl_->read_schema_);
     }
     impl_->Reset();
     return ctx;
diff --git a/src/paimon/core/utils/nested_projection_utils.cpp b/src/paimon/core/utils/nested_projection_utils.cpp
index c1babc2cc..c451376ae 100644
--- a/src/paimon/core/utils/nested_projection_utils.cpp
+++ b/src/paimon/core/utils/nested_projection_utils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright 2024-present Alibaba Inc.
+ * Copyright 2026-present Alibaba Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -108,9 +108,7 @@ Result<std::optional<std::shared_ptr<arrow::DataType>>> NestedProjectionUtils::P
     }
 }
 
-// ---------------------------------------------------------------------------
 // PruneArray — fallback for format readers that return extra nested columns
-// ---------------------------------------------------------------------------
 
 Result<std::shared_ptr<arrow::Array>> NestedProjectionUtils::PruneArray(
     const std::shared_ptr<arrow::Array>& array,
@@ -178,9 +176,7 @@ Result<std::shared_ptr<arrow::Array>> NestedProjectionUtils::PruneArray(
     }
 }
 
-// ---------------------------------------------------------------------------
 // Map selected-keys support
-// ---------------------------------------------------------------------------
 
 std::set<std::string> NestedProjectionUtils::GetMapSelectedKeys(
     const std::shared_ptr<arrow::Field>& field) {
diff --git a/src/paimon/core/utils/nested_projection_utils.h b/src/paimon/core/utils/nested_projection_utils.h
index a8ba0fe77..3973641b7 100644
--- a/src/paimon/core/utils/nested_projection_utils.h
+++ b/src/paimon/core/utils/nested_projection_utils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright 2024-present Alibaba Inc.
+ * Copyright 2026-present Alibaba Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/src/paimon/core/utils/nested_projection_utils_test.cpp b/src/paimon/core/utils/nested_projection_utils_test.cpp
index c7add426a..6d27ee555 100644
--- a/src/paimon/core/utils/nested_projection_utils_test.cpp
+++ b/src/paimon/core/utils/nested_projection_utils_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright 2024-present Alibaba Inc.
+ * Copyright 2026-present Alibaba Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.cpp b/src/paimon/format/parquet/parquet_file_batch_reader.cpp
index c35f70288..1d8c92f40 100644
--- a/src/paimon/format/parquet/parquet_file_batch_reader.cpp
+++ b/src/paimon/format/parquet/parquet_file_batch_reader.cpp
@@ -444,9 +444,7 @@ Result<::parquet::ArrowReaderProperties> ParquetFileBatchReader::CreateArrowRead
     return arrow_reader_props;
 }
 
-// ---------------------------------------------------------------------------
 // Nested column index computation
-// ---------------------------------------------------------------------------
 
 void ParquetFileBatchReader::CollectLeafIndices(const std::shared_ptr<arrow::DataType>& read_type,
                                                 const std::shared_ptr<arrow::DataType>& file_type,
diff --git a/test/inte/nested_column_pruning_inte_test.cpp b/test/inte/nested_column_pruning_inte_test.cpp
index 95c9dc3ae..2008dfa58 100644
--- a/test/inte/nested_column_pruning_inte_test.cpp
+++ b/test/inte/nested_column_pruning_inte_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright 2024-present Alibaba Inc.
+ * Copyright 2026-present Alibaba Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

From f747cadb61ca75fab5df7e189546a38b4df49379 Mon Sep 17 00:00:00 2001
From: "yonghao.fyh" <yonghao.fyh@alibaba-inc.com>
Date: Fri, 12 Jun 2026 17:12:15 +0800
Subject: [PATCH 04/24] fix

---
 test/inte/nested_column_pruning_inte_test.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/inte/nested_column_pruning_inte_test.cpp b/test/inte/nested_column_pruning_inte_test.cpp
index 2008dfa58..708747a39 100644
--- a/test/inte/nested_column_pruning_inte_test.cpp
+++ b/test/inte/nested_column_pruning_inte_test.cpp
@@ -469,6 +469,6 @@ TEST_P(NestedColumnPruningInteTest, MapSelectedKeys) {
 }
 
 INSTANTIATE_TEST_SUITE_P(FileFormats, NestedColumnPruningInteTest,
-                         ::testing::Values("parquet"));
+                         ::testing::Values("parquet", "orc"));
 
 }  // namespace paimon::test

From b94baf8fa7c541ca58c59cb27264d1c8da024bb6 Mon Sep 17 00:00:00 2001
From: "yonghao.fyh" <yonghao.fyh@alibaba-inc.com>
Date: Mon, 15 Jun 2026 17:12:37 +0800
Subject: [PATCH 05/24] fix

---
 src/paimon/common/types/data_field.h          |   4 +-
 .../core/utils/nested_projection_utils.cpp    |  16 +--
 .../core/utils/nested_projection_utils.h      |   2 +-
 .../utils/nested_projection_utils_test.cpp    |   6 +-
 .../format/parquet/file_reader_wrapper.cpp    | 131 ------------------
 .../parquet/parquet_file_batch_reader.cpp     | 108 +++++++++++++--
 .../parquet_file_batch_reader_test.cpp        |  32 +++++
 test/inte/nested_column_pruning_inte_test.cpp | 118 +++++++++++++++-
 8 files changed, 255 insertions(+), 162 deletions(-)

diff --git a/src/paimon/common/types/data_field.h b/src/paimon/common/types/data_field.h
index 27fedda35..210db42ea 100644
--- a/src/paimon/common/types/data_field.h
+++ b/src/paimon/common/types/data_field.h
@@ -41,8 +41,8 @@ class DataField : public Jsonizable<DataField> {
 
     static constexpr char FIELD_ID[] = "paimon.id";
     static constexpr char DESCRIPTION[] = "paimon.description";
-    /// Metadata key for map field selected keys. The value is a JSON array of
-    /// string keys, e.g. '["key1","key2"]'. Only string-keyed maps are supported.
+    /// Metadata key for map field selected keys. The value is a comma-separated list of key
+    /// names, e.g. 'key1,key2'; a key cannot itself contain ','. Only string-keyed maps supported.
     static constexpr char MAP_SELECTED_KEYS[] = "paimon.map.selected-keys";
 
  public:
diff --git a/src/paimon/core/utils/nested_projection_utils.cpp b/src/paimon/core/utils/nested_projection_utils.cpp
index c451376ae..259ba0f0b 100644
--- a/src/paimon/core/utils/nested_projection_utils.cpp
+++ b/src/paimon/core/utils/nested_projection_utils.cpp
@@ -27,8 +27,8 @@
 #include "arrow/array/concatenate.h"
 #include "arrow/type.h"
 #include "fmt/format.h"
+#include "paimon/common/utils/string_utils.h"
 #include "paimon/status.h"
-#include "rapidjson/document.h"
 
 namespace paimon {
 
@@ -188,17 +188,9 @@ std::set<std::string> NestedProjectionUtils::GetMapSelectedKeys(
     if (!get_result.ok()) {
         return result;
     }
-    const std::string& json_str = get_result.ValueUnsafe();
-    rapidjson::Document doc;
-    doc.Parse(json_str.c_str());
-    if (doc.HasParseError() || !doc.IsArray()) {
-        return result;
-    }
-    for (rapidjson::SizeType i = 0; i < doc.Size(); ++i) {
-        if (doc[i].IsString()) {
-            result.emplace(doc[i].GetString(), doc[i].GetStringLength());
-        }
-    }
+    const std::string& value = get_result.ValueUnsafe();
+    auto tokens = StringUtils::Split(value, ",");
+    result.insert(tokens.begin(), tokens.end());
     return result;
 }
 
diff --git a/src/paimon/core/utils/nested_projection_utils.h b/src/paimon/core/utils/nested_projection_utils.h
index 3973641b7..fe35bdfcf 100644
--- a/src/paimon/core/utils/nested_projection_utils.h
+++ b/src/paimon/core/utils/nested_projection_utils.h
@@ -31,7 +31,7 @@
 namespace paimon {
 
 /// Utility class for nested column pruning and map key selection.
-class NestedProjectionUtils {
+class PAIMON_EXPORT NestedProjectionUtils {
  public:
     NestedProjectionUtils() = delete;
 
diff --git a/src/paimon/core/utils/nested_projection_utils_test.cpp b/src/paimon/core/utils/nested_projection_utils_test.cpp
index 6d27ee555..9f68f28b2 100644
--- a/src/paimon/core/utils/nested_projection_utils_test.cpp
+++ b/src/paimon/core/utils/nested_projection_utils_test.cpp
@@ -213,7 +213,7 @@ TEST(NestedProjectionUtilsTest, PruneArray_IdenticalType) {
 
 TEST(NestedProjectionUtilsTest, GetMapSelectedKeys_Present) {
     auto metadata = arrow::KeyValueMetadata::Make(
-        {DataField::MAP_SELECTED_KEYS}, {R"(["key1","key2","key3"])"});
+        {DataField::MAP_SELECTED_KEYS}, {"key1,key2,key3"});
     auto field = arrow::field("m", arrow::map(arrow::utf8(), arrow::int32()), /*nullable=*/true,
                               metadata);
     auto keys = NestedProjectionUtils::GetMapSelectedKeys(field);
@@ -229,9 +229,9 @@ TEST(NestedProjectionUtilsTest, GetMapSelectedKeys_Absent) {
     ASSERT_TRUE(keys.empty());
 }
 
-TEST(NestedProjectionUtilsTest, GetMapSelectedKeys_InvalidJson) {
+TEST(NestedProjectionUtilsTest, GetMapSelectedKeys_EmptyString) {
     auto metadata = arrow::KeyValueMetadata::Make(
-        {DataField::MAP_SELECTED_KEYS}, {"not_json"});
+        {DataField::MAP_SELECTED_KEYS}, {""});
     auto field = arrow::field("m", arrow::map(arrow::utf8(), arrow::int32()), /*nullable=*/true,
                               metadata);
     auto keys = NestedProjectionUtils::GetMapSelectedKeys(field);
diff --git a/src/paimon/format/parquet/file_reader_wrapper.cpp b/src/paimon/format/parquet/file_reader_wrapper.cpp
index 118476181..e7d6bf606 100644
--- a/src/paimon/format/parquet/file_reader_wrapper.cpp
+++ b/src/paimon/format/parquet/file_reader_wrapper.cpp
@@ -408,137 +408,6 @@ void FileReaderWrapper::DispatchPreBuffer(std::vector<::arrow::io::ReadRange> ra
 Status FileReaderWrapper::PrepareForReading(const std::vector<TargetRowGroup>& target_row_groups,
                                             const std::vector<int32_t>& column_indices) {
     try {
-        std::vector<std::pair<uint64_t, uint64_t>> target_row_groups;
-        PAIMON_ASSIGN_OR_RAISE(target_row_groups, GetRowGroupRanges(target_row_group_indices));
-
-        // Build position map: rg_index -> position in target_row_groups (O(1) lookup)
-        std::map<int32_t, uint64_t> rg_idx_to_position;
-        {
-            uint64_t pos = 0;
-            for (int32_t rg_idx : target_row_group_indices) {
-                rg_idx_to_position[rg_idx] = pos++;
-            }
-        }
-
-        // Separate row groups into fully matched (Arrow's standard reader) and partially
-        // matched (page-filtered, per-RG reader constructed on demand in Next()).
-        // Per-RG metadata for the page-filtered path is NOT cached on the wrapper — it's
-        // recomputed on demand in Next() from row_group_row_ranges_ + target_column_indices_,
-        // mirroring how the fully-matched path lets Arrow's FileReader own all metadata.
-        std::vector<int32_t> fully_matched_row_groups;
-        page_filtered_indices_.clear();
-        page_filtered_read_schema_.reset();
-
-        // Page-level byte ranges collected here only for the bulk PreBuffer call below;
-        // discarded once PreBuffer is dispatched.
-        std::vector<::arrow::io::ReadRange> page_filtered_byte_ranges;
-
-        for (int32_t rg_idx : target_row_group_indices) {
-            auto range_it = row_group_row_ranges_.find(rg_idx);
-            if (range_it != row_group_row_ranges_.end()) {
-                uint64_t pos = rg_idx_to_position[rg_idx];
-                page_filtered_indices_.insert(pos);
-
-                // Build the page-filter read_schema once on first encounter — it's identical
-                // across all page-filtered RGs in this session.
-                if (!page_filtered_read_schema_) {
-                    if (external_read_schema_) {
-                        // Use externally provided read schema (handles nested column pruning
-                        // correctly where leaf-column-name inference would fail).
-                        page_filtered_read_schema_ = external_read_schema_;
-                    } else {
-                        std::shared_ptr<arrow::Schema> schema;
-                        PAIMON_RETURN_NOT_OK_FROM_ARROW(file_reader_->GetSchema(&schema));
-                        std::vector<std::shared_ptr<arrow::Field>> fields;
-                        auto parquet_schema = file_reader_->parquet_reader()->metadata()->schema();
-                        for (int32_t col_idx : column_indices) {
-                            const std::string& col_name = parquet_schema->Column(col_idx)->name();
-                            auto field = schema->GetFieldByName(col_name);
-                            if (!field) {
-                                return Status::Invalid(fmt::format(
-                                    "PrepareForReading: Parquet column {} ('{}') has no "
-                                    "matching Arrow field in file schema",
-                                    col_idx, col_name));
-                            }
-                            fields.push_back(field);
-                        }
-                        page_filtered_read_schema_ = arrow::schema(fields);
-                    }
-                }
-
-                auto page_ranges = PageFilteredRowGroupReader::ComputePageRanges(
-                    file_reader_->parquet_reader(), rg_idx, range_it->second, column_indices);
-                page_filtered_byte_ranges.insert(page_filtered_byte_ranges.end(),
-                                                 std::make_move_iterator(page_ranges.begin()),
-                                                 std::make_move_iterator(page_ranges.end()));
-            } else {
-                fully_matched_row_groups.push_back(rg_idx);
-            }
-        }
-
-        // Wait for any previously pre-buffered data before starting new pre-buffer.
-        WaitForPendingPreBuffer();
-
-        // Create standard reader for fully matched row groups FIRST.
-        // GetRecordBatchReader internally calls PreBuffer, but we'll override it below
-        // with a single PreBuffer covering ALL row groups (page-filtered + fully-matched)
-        // so that async I/O for all files starts in parallel.
-        std::unique_ptr<arrow::RecordBatchReader> batch_reader;
-        if (!fully_matched_row_groups.empty()) {
-            PAIMON_RETURN_NOT_OK_FROM_ARROW(file_reader_->GetRecordBatchReader(
-                fully_matched_row_groups, column_indices, &batch_reader));
-        }
-
-        // Collect all byte ranges for a single PreBufferRanges call.
-        // Page-filtered RGs: only matching page ranges (from ComputePageRanges).
-        // Fully-matched RGs: entire column chunk ranges.
-        //
-        // When there are no page-filtered RGs, skip the manual PreBufferRanges entirely:
-        // GetRecordBatchReader has already issued PreBuffer internally (driven by
-        // ArrowReaderProperties::pre_buffer=true), and a second PreBufferRanges call here
-        // would tear down and rebuild cached_source_, redundantly re-issuing the same IO
-        // on remote filesystems. The manual path is only needed to merge page-level ranges
-        // with column-chunk ranges into a single PreBuffer covering both kinds of RGs.
-        if (!page_filtered_indices_.empty()) {
-            std::vector<::arrow::io::ReadRange> all_ranges = std::move(page_filtered_byte_ranges);
-
-            // Fully-matched row groups: add entire column chunk ranges
-            // The correct calculation follows Arrow's ColumnChunkMetaData::file_range():
-            // - col_start = data_page_offset (or dictionary_page_offset if present and lower)
-            // - col_length = total_compressed_size (includes all pages: dictionary + data)
-            auto file_metadata = file_reader_->parquet_reader()->metadata();
-            for (int32_t rg_idx : fully_matched_row_groups) {
-                auto rg_metadata = file_metadata->RowGroup(rg_idx);
-                for (int32_t col_idx : column_indices) {
-                    auto col_chunk = rg_metadata->ColumnChunk(col_idx);
-                    int64_t offset = col_chunk->data_page_offset();
-                    if (col_chunk->has_dictionary_page() &&
-                        col_chunk->dictionary_page_offset() > 0 &&
-                        offset > col_chunk->dictionary_page_offset()) {
-                        offset = col_chunk->dictionary_page_offset();
-                    }
-                    int64_t size = col_chunk->total_compressed_size();
-                    all_ranges.push_back({offset, size});
-                }
-            }
-
-            const auto& cache_opts = file_reader_->properties().cache_options();
-            ::arrow::io::IOContext io_ctx(pool_);
-            // Merge overlapping ranges before calling PreBufferRanges, which rejects overlapping
-            // ranges.
-            auto merged_ranges = MergeOverlappingRanges(std::move(all_ranges));
-            // PreBuffer is an optimization - if it fails (e.g., IO error during testing),
-            // continue without pre-buffering. Subsequent reads will fetch data on-demand.
-            try {
-                file_reader_->parquet_reader()->PreBufferRanges(merged_ranges, io_ctx, cache_opts);
-                // Track for cleanup on destruction
-                prebuffered_ranges_ = std::move(merged_ranges);
-            } catch (const std::exception& e) {
-                // Pre-buffering failed, clear ranges to indicate no pre-buffered data available.
-                // Reading will fall back to on-demand I/O.
-                prebuffered_ranges_.clear();
-            }
-        }
         target_row_groups_ = target_row_groups;
         target_column_indices_ = column_indices;
         page_filtered_read_schema_.reset();
diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.cpp b/src/paimon/format/parquet/parquet_file_batch_reader.cpp
index 1d8c92f40..2c68a5256 100644
--- a/src/paimon/format/parquet/parquet_file_batch_reader.cpp
+++ b/src/paimon/format/parquet/parquet_file_batch_reader.cpp
@@ -58,6 +58,140 @@ class Predicate;
 
 namespace paimon::parquet {
 
+namespace {
+
+/// Finds the entry of `read_fields` that describes the same column as `file_field`.
+///
+/// When `file_field` carries a paimon field id (GetPaimonFieldId() != -1), the first
+/// candidate with that id wins. Otherwise, or when no candidate has that id, the first
+/// candidate with the same name is returned. Yields nullptr when nothing matches.
+std::shared_ptr<arrow::Field> FindMatchingReadField(
+    const arrow::FieldVector& read_fields, const std::shared_ptr<arrow::Field>& file_field) {
+    int32_t file_field_id = NestedProjectionUtils::GetPaimonFieldId(file_field);
+    if (file_field_id != -1) {
+        for (const auto& candidate : read_fields) {
+            if (NestedProjectionUtils::GetPaimonFieldId(candidate) == file_field_id) {
+                return candidate;
+            }
+        }
+    }
+
+    for (const auto& candidate : read_fields) {
+        if (candidate->name() == file_field->name()) {
+            return candidate;
+        }
+    }
+    return nullptr;
+}
+
+/// Recursively removes nested children of `array` that `target_type` does not request.
+///
+/// STRUCT children are looked up with FindMatchingReadField() and emitted in target
+/// order; LIST values and MAP keys/items are pruned recursively. `array` is returned
+/// unchanged when it is null, already equals `target_type`, is of a different kind than
+/// `target_type`, or when `target_type` is not STRUCT/LIST/MAP.
+///
+/// @return the pruned array, or an error when a requested struct child cannot be resolved
+///         or Arrow fails to assemble the pruned array.
+Result<std::shared_ptr<arrow::Array>> PruneArrayToReadType(
+    const std::shared_ptr<arrow::Array>& array,
+    const std::shared_ptr<arrow::DataType>& target_type) {
+    if (!array || array->type()->Equals(target_type)) {
+        return array;
+    }
+    // The static casts below are only sound when the array has the target's kind.
+    if (array->type_id() != target_type->id()) {
+        return array;
+    }
+
+    switch (target_type->id()) {
+        case arrow::Type::STRUCT: {
+            auto struct_array = std::static_pointer_cast<arrow::StructArray>(array);
+            auto target_struct_type = std::static_pointer_cast<arrow::StructType>(target_type);
+            // GetFieldByName() returns children already sliced to this struct's window, so
+            // the rebuilt struct must start at offset 0; passing offset() again would skip
+            // rows twice. The validity bitmap is reused only when it needs no re-alignment.
+            const bool is_sliced = struct_array->offset() != 0;
+            if (is_sliced && struct_array->null_count() != 0) {
+                return Status::Invalid(
+                    "PruneArrayToReadType: sliced struct array with nulls is not supported");
+            }
+            arrow::ArrayVector pruned_children;
+            arrow::FieldVector pruned_fields;
+            pruned_children.reserve(target_struct_type->num_fields());
+            pruned_fields.reserve(target_struct_type->num_fields());
+            for (const auto& target_field : target_struct_type->fields()) {
+                auto src_field =
+                    FindMatchingReadField(struct_array->type()->fields(), target_field);
+                if (!src_field) {
+                    return Status::Invalid(fmt::format(
+                        "PruneArrayToReadType: field '{}' not found in struct array",
+                        target_field->name()));
+                }
+                auto child = struct_array->GetFieldByName(src_field->name());
+                if (!child) {
+                    // GetFieldByName() yields nullptr when the name is not uniquely resolvable.
+                    return Status::Invalid(fmt::format(
+                        "PruneArrayToReadType: cannot resolve field '{}' in struct array",
+                        src_field->name()));
+                }
+                PAIMON_ASSIGN_OR_RAISE(auto pruned_child,
+                                       PruneArrayToReadType(child, target_field->type()));
+                pruned_children.push_back(std::move(pruned_child));
+                pruned_fields.push_back(src_field->WithType(pruned_children.back()->type()));
+            }
+            std::shared_ptr<arrow::Buffer> validity;
+            if (!is_sliced) {
+                validity = struct_array->null_bitmap();
+            }
+            PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(
+                std::shared_ptr<arrow::StructArray> result_struct,
+                arrow::StructArray::Make(pruned_children, pruned_fields, validity,
+                                         validity ? struct_array->null_count() : 0,
+                                         /*offset=*/0));
+            return std::static_pointer_cast<arrow::Array>(result_struct);
+        }
+
+        case arrow::Type::LIST: {
+            auto list_array = std::static_pointer_cast<arrow::ListArray>(array);
+            const auto& target_elem_type =
+                static_cast<const arrow::ListType&>(*target_type).value_type();
+            PAIMON_ASSIGN_OR_RAISE(auto pruned_values,
+                                   PruneArrayToReadType(list_array->values(), target_elem_type));
+            PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(
+                std::shared_ptr<arrow::ListArray> result_list,
+                arrow::ListArray::FromArrays(*list_array->offsets(), *pruned_values,
+                                             arrow::default_memory_pool(),
+                                             list_array->null_bitmap(), list_array->null_count()));
+            return std::static_pointer_cast<arrow::Array>(result_list);
+        }
+
+        case arrow::Type::MAP: {
+            auto map_array = std::static_pointer_cast<arrow::MapArray>(array);
+            const auto& target_map_type = static_cast<const arrow::MapType&>(*target_type);
+            PAIMON_ASSIGN_OR_RAISE(
+                auto pruned_keys,
+                PruneArrayToReadType(map_array->keys(), target_map_type.key_type()));
+            PAIMON_ASSIGN_OR_RAISE(auto pruned_items,
+                                   PruneArrayToReadType(map_array->items(),
+                                                        target_map_type.item_type()));
+            // Forward the validity bitmap, as the LIST branch does; without it every null
+            // map would come back as a non-null map.
+            PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(
+                std::shared_ptr<arrow::Array> result_map,
+                arrow::MapArray::FromArrays(map_array->offsets(), pruned_keys, pruned_items,
+                                            arrow::default_memory_pool(),
+                                            map_array->null_bitmap()));
+            return result_map;
+        }
+
+        default:
+            return array;
+    }
+}
+
+}  // namespace
+
 ParquetFileBatchReader::ParquetFileBatchReader(
     std::shared_ptr<arrow::io::RandomAccessFile>&& input_stream,
     std::unique_ptr<FileReaderWrapper>&& reader, const std::map<std::string, std::string>& options,
@@ -352,6 +445,7 @@ Result<BatchReader::ReadBatch> ParquetFileBatchReader::NextBatch() {
         }
         PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr<arrow::Array> array,
                                           batch->ToStructArray());
+        PAIMON_ASSIGN_OR_RAISE(array, PruneArrayToReadType(array, read_data_type_));
         PAIMON_ASSIGN_OR_RAISE(bool need_cast, ParquetTimestampConverter::NeedCastArrayForTimestamp(
                                                    array->type(), read_data_type_));
         if (need_cast) {
@@ -452,9 +546,8 @@ void ParquetFileBatchReader::CollectLeafIndices(const std::shared_ptr<arrow::Dat
                                                 std::vector<int32_t>* indices) {
     if (file_type->id() == arrow::Type::STRUCT) {
         for (const auto& file_child : file_type->fields()) {
-            int32_t file_child_id = NestedProjectionUtils::GetPaimonFieldId(file_child);
             std::shared_ptr<arrow::Field> read_child =
-                NestedProjectionUtils::FindFieldByPaimonId(read_type, file_child_id);
+                FindMatchingReadField(read_type->fields(), file_child);
             if (read_child) {
                 CollectLeafIndices(read_child->type(), file_child->type(), leaf_index, indices);
             } else {
@@ -497,15 +590,8 @@ Result<std::vector<int32_t>> ParquetFileBatchReader::ComputeNestedColumnIndices(
     int32_t leaf_index = 0;
 
     for (const auto& file_field : file_schema->fields()) {
-        int32_t file_field_id = NestedProjectionUtils::GetPaimonFieldId(file_field);
-        // Find matching field in read_schema by paimon field ID.
-        std::shared_ptr<arrow::Field> read_field = nullptr;
-        for (const auto& candidate : read_schema->fields()) {
-            if (NestedProjectionUtils::GetPaimonFieldId(candidate) == file_field_id) {
-                read_field = candidate;
-                break;
-            }
-        }
+        std::shared_ptr<arrow::Field> read_field =
+            FindMatchingReadField(read_schema->fields(), file_field);
 
         if (read_field) {
             CollectLeafIndices(read_field->type(), file_field->type(), &leaf_index, &indices);
diff --git a/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp b/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp
index 73501cbfd..21c14d1aa 100644
--- a/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp
+++ b/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp
@@ -281,6 +281,38 @@ TEST_F(ParquetFileBatchReaderTest, TestSetReadSchema) {
     ASSERT_FALSE(result_with_read_schema);
 }
 
+TEST_F(ParquetFileBatchReaderTest, TestSetReadSchemaWithLegacyParquetMissingFieldIds) {
+    std::string file_name = paimon::test::GetDataDir() +
+                            "/parquet/append_09.db/append_09/f1=20/bucket-0/"
+                            "data-b446f78a-2cfb-4b3b-add8-31295d24a277-0.parquet";
+
+    std::vector<DataField> read_fields = {
+        DataField(0, arrow::field("f0", arrow::utf8())),
+        DataField(2, arrow::field("f2", arrow::int32())),
+        DataField(3, arrow::field("f3", arrow::float64())),
+    };
+    auto read_schema = DataField::ConvertDataFieldsToArrowSchema(read_fields);
+
+    auto parquet_batch_reader =
+        PrepareParquetFileBatchReader(file_name, read_schema, /*predicate=*/nullptr,
+                                      /*selection_bitmap=*/std::nullopt, batch_size_);
+
+    ASSERT_OK_AND_ASSIGN(auto result_array, paimon::test::ReadResultCollector::CollectResult(
+                                                parquet_batch_reader.get()));
+
+    std::shared_ptr<arrow::ChunkedArray> expected_array;
+    ASSERT_TRUE(arrow::ipc::internal::json::ChunkedArrayFromJSON(
+                    arrow::struct_(read_schema->fields()),
+                    {R"([
+        ["Lucy", 1, 14.1]
+    ])"},
+                    &expected_array)
+                    .ok());
+    ASSERT_TRUE(result_array->Equals(expected_array))
+        << "expected: " << expected_array->ToString() << "\nactual: "
+        << result_array->ToString();
+}
+
 TEST_F(ParquetFileBatchReaderTest, TestNextBatchSimple) {
     std::string file_name = paimon::test::GetDataDir() +
                             "parquet/parquet_append_table.db/parquet_append_table/bucket-0/"
diff --git a/test/inte/nested_column_pruning_inte_test.cpp b/test/inte/nested_column_pruning_inte_test.cpp
index 708747a39..b6e1cf564 100644
--- a/test/inte/nested_column_pruning_inte_test.cpp
+++ b/test/inte/nested_column_pruning_inte_test.cpp
@@ -417,9 +417,9 @@ TEST_P(NestedColumnPruningInteTest, MapSelectedKeys) {
         helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt));
     ASSERT_FALSE(data_splits.empty());
 
-    // Build projected schema: read f0 and f1 with selected keys ["a", "c"]
+    // Build projected schema: read f0 and f1 with selected keys "a,c"
     auto selected_keys_metadata = arrow::KeyValueMetadata::Make(
-        {DataField::MAP_SELECTED_KEYS}, {R"(["a","c"])"});
+        {DataField::MAP_SELECTED_KEYS}, {"a,c"});
     arrow::FieldVector projected_fields = {
         AnnotateField(arrow::field("f0", arrow::int32()), 0),
         AnnotateField(arrow::field("f1", map_type), 1)->WithMetadata(
@@ -468,6 +468,120 @@ TEST_P(NestedColumnPruningInteTest, MapSelectedKeys) {
     ASSERT_TRUE(is_equal);
 }
 
+// Test: Deeper nested struct — prune sub-fields of a struct inside a struct inside another struct.
+TEST_P(NestedColumnPruningInteTest, PruneDeeperNestedStruct) {
+    // Table schema: f0 (int32), f1 (struct{a: int32, inner1: struct{x: int64, inner2: struct{p: utf8, q: float64}}})
+    auto inner2_struct = arrow::struct_({
+        arrow::field("p", arrow::utf8()),
+        arrow::field("q", arrow::float64()),
+    });
+    auto inner1_struct = arrow::struct_({
+        arrow::field("x", arrow::int64()),
+        arrow::field("inner2", inner2_struct),
+    });
+    auto outer_struct = arrow::struct_({
+        arrow::field("a", arrow::int32()),
+        arrow::field("inner1", inner1_struct),
+    });
+    arrow::FieldVector table_fields = {
+        arrow::field("f0", arrow::int32()),
+        arrow::field("f1", outer_struct),
+    };
+    auto table_schema = arrow::schema(table_fields);
+
+    std::map<std::string, std::string> options = {
+        {Options::MANIFEST_FORMAT, "AVRO"},
+        {Options::FILE_FORMAT, StringUtils::ToUpperCase(file_format_)},
+        {Options::TARGET_FILE_SIZE, "1024"},
+        {Options::BUCKET, "-1"},
+    };
+
+    ASSERT_OK_AND_ASSIGN(
+        auto helper,
+        TestHelper::Create(test_dir_, table_schema, /*partition_keys=*/{},
+                           /*primary_keys=*/{}, options, /*is_streaming_mode=*/false));
+
+    std::string data = R"([
+        [1, [10, [100, ["ppp", 1.1]]]],
+        [2, [20, [200, ["qqq", 2.2]]]],
+        [3, [30, [300, ["rrr", 3.3]]]]
+    ])";
+    ASSERT_OK_AND_ASSIGN(
+        auto batch,
+        TestHelper::MakeRecordBatch(arrow::struct_(table_fields), data,
+                                    /*partition_map=*/{}, /*bucket=*/0, {}));
+    int64_t commit_identifier = 0;
+    ASSERT_OK_AND_ASSIGN(
+        auto commit_msgs,
+        helper->WriteAndCommit(std::move(batch), commit_identifier++,
+                               /*expected_commit_messages=*/std::nullopt));
+
+    ASSERT_OK_AND_ASSIGN(
+        auto data_splits,
+        helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt));
+
+    // Field IDs (assigned sequentially by catalog):
+    // f0->0, f1->1, f1.a->2, f1.inner1->3, f1.inner1.x->4, f1.inner1.inner2->5, f1.inner1.inner2.p->6, f1.inner1.inner2.q->7
+    //
+    // Projected: f0, f1{inner1{inner2{p}}}
+    auto pruned_inner2 = arrow::struct_({
+        AnnotateField(arrow::field("p", arrow::utf8()), 6),
+    });
+    auto pruned_inner1 = arrow::struct_({
+        AnnotateField(arrow::field("inner2", pruned_inner2), 5),
+    });
+    auto pruned_outer = arrow::struct_({
+        AnnotateField(arrow::field("inner1", pruned_inner1), 3),
+    });
+    arrow::FieldVector projected_fields = {
+        AnnotateField(arrow::field("f0", arrow::int32()), 0),
+        AnnotateField(arrow::field("f1", pruned_outer), 1),
+    };
+    auto projected_schema = arrow::schema(projected_fields);
+
+    ArrowSchema c_schema;
+    ASSERT_TRUE(arrow::ExportSchema(*projected_schema, &c_schema).ok());
+
+    ReadContextBuilder read_context_builder(table_path_);
+    read_context_builder.SetOptions(options).SetReadSchema(&c_schema);
+    ASSERT_OK_AND_ASSIGN(auto read_context, read_context_builder.Finish());
+    ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context)));
+    ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(data_splits));
+    ASSERT_OK_AND_ASSIGN(auto read_result,
+                         ReadResultCollector::CollectResult(batch_reader.get()));
+
+    arrow::FieldVector expected_fields = {
+        arrow::field("_VALUE_KIND", arrow::int8()),
+        arrow::field("f0", arrow::int32()),
+        arrow::field("f1", arrow::struct_({
+            arrow::field("inner1", arrow::struct_({
+                arrow::field("inner2", arrow::struct_({
+                    arrow::field("p", arrow::utf8()),
+                })),
+            })),
+        })),
+    };
+    auto expected_type = arrow::struct_(expected_fields);
+    std::string expected_data = R"([
+        [0, 1, [[[ "ppp" ]]]],
+        [0, 2, [[[ "qqq" ]]]],
+        [0, 3, [[[ "rrr" ]]]]
+    ])";
+    auto expected_array =
+        arrow::ipc::internal::json::ArrayFromJSON(expected_type, expected_data).ValueOrDie();
+    auto expected_chunked = std::make_shared<arrow::ChunkedArray>(expected_array);
+
+    arrow::EqualOptions equal_options = arrow::EqualOptions::Defaults();
+    bool is_equal = expected_chunked->Equals(read_result, equal_options.diff_sink(&std::cout));
+    if (!is_equal) {
+        std::cout << "[expected_type] " << expected_chunked->type()->ToString() << std::endl;
+        std::cout << "[actual_type]   " << read_result->type()->ToString() << std::endl;
+        std::cout << "[expected] " << expected_chunked->ToString() << std::endl;
+        std::cout << "[actual]   " << read_result->ToString() << std::endl;
+    }
+    ASSERT_TRUE(is_equal);
+}
+
 INSTANTIATE_TEST_SUITE_P(FileFormats, NestedColumnPruningInteTest,
                          ::testing::Values("parquet", "orc"));
 

From c447e1e659e980cdc05bc039dfe76cafcb469286 Mon Sep 17 00:00:00 2001
From: "yonghao.fyh" <yonghao.fyh@alibaba-inc.com>
Date: Mon, 15 Jun 2026 17:24:01 +0800
Subject: [PATCH 06/24] fix

---
 src/paimon/core/io/field_mapping_reader.cpp   |  42 ++--
 src/paimon/core/io/field_mapping_reader.h     |   8 +-
 .../core/operation/internal_read_context.cpp  |   5 +-
 .../operation/internal_read_context_test.cpp  |   3 +-
 .../core/utils/nested_projection_utils.cpp    |  59 +++---
 .../core/utils/nested_projection_utils.h      |   9 +-
 .../utils/nested_projection_utils_test.cpp    |  71 ++++---
 .../parquet/parquet_file_batch_reader.cpp     |  20 +-
 .../parquet_file_batch_reader_test.cpp        |   6 +-
 test/inte/global_index_test.cpp               |   4 +-
 test/inte/nested_column_pruning_inte_test.cpp | 193 ++++++++----------
 11 files changed, 195 insertions(+), 225 deletions(-)

diff --git a/src/paimon/core/io/field_mapping_reader.cpp b/src/paimon/core/io/field_mapping_reader.cpp
index 3f1ab2e65..933a53e28 100644
--- a/src/paimon/core/io/field_mapping_reader.cpp
+++ b/src/paimon/core/io/field_mapping_reader.cpp
@@ -158,10 +158,9 @@ Result<BatchReader::ReadBatchWithBitmap> FieldMappingReader::NextBatchWithBitmap
     // mapping non-partition array
     PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> casted_non_partition_array,
                            CastNonPartitionArrayIfNeed(non_partition_array));
-    PAIMON_RETURN_NOT_OK(
-        MappingFields(casted_non_partition_array, non_partition_info_.non_partition_read_schema,
-                      non_partition_info_.idx_in_target_read_schema, &target_array,
-                      &target_field_names));
+    PAIMON_RETURN_NOT_OK(MappingFields(
+        casted_non_partition_array, non_partition_info_.non_partition_read_schema,
+        non_partition_info_.idx_in_target_read_schema, &target_array, &target_field_names));
 
     // mapping partition array
     if (partition_info_ != std::nullopt) {
@@ -170,10 +169,9 @@ Result<BatchReader::ReadBatchWithBitmap> FieldMappingReader::NextBatchWithBitmap
                                    GeneratePartitionArray(non_partition_array->length()));
         }
         auto trim_partition_array = partition_array_->Slice(0, non_partition_array->length());
-        PAIMON_RETURN_NOT_OK(
-            MappingFields(trim_partition_array, partition_info_.value().partition_read_schema,
-                          partition_info_.value().idx_in_target_read_schema, &target_array,
-                          &target_field_names));
+        PAIMON_RETURN_NOT_OK(MappingFields(
+            trim_partition_array, partition_info_.value().partition_read_schema,
+            partition_info_.value().idx_in_target_read_schema, &target_array, &target_field_names));
     }
     // mapping non-exist array
     if (non_exist_field_info_ != std::nullopt) {
@@ -182,10 +180,10 @@ Result<BatchReader::ReadBatchWithBitmap> FieldMappingReader::NextBatchWithBitmap
                                    GenerateNonExistArray(non_partition_array->length()));
         }
         auto trim_non_exist_array = non_exist_array_->Slice(0, non_partition_array->length());
-        PAIMON_RETURN_NOT_OK(
-            MappingFields(trim_non_exist_array, non_exist_field_info_.value().non_exist_read_schema,
-                          non_exist_field_info_.value().idx_in_target_read_schema, &target_array,
-                          &target_field_names));
+        PAIMON_RETURN_NOT_OK(MappingFields(trim_non_exist_array,
+                                           non_exist_field_info_.value().non_exist_read_schema,
+                                           non_exist_field_info_.value().idx_in_target_read_schema,
+                                           &target_array, &target_field_names));
     }
 
     // construct target array
@@ -303,10 +301,10 @@ Result<std::shared_ptr<arrow::Array>> FieldMappingReader::GenerateNonExistArray(
 }
 
 Status FieldMappingReader::MappingFields(const std::shared_ptr<arrow::Array>& data_array,
-                                        const std::vector<DataField>& read_fields_of_data_array,
-                                        const std::vector<int32_t>& idx_in_target_schema,
-                                        arrow::ArrayVector* target_array,
-                                        std::vector<std::string>* target_field_names) {
+                                         const std::vector<DataField>& read_fields_of_data_array,
+                                         const std::vector<int32_t>& idx_in_target_schema,
+                                         arrow::ArrayVector* target_array,
+                                         std::vector<std::string>* target_field_names) {
     auto* struct_array = arrow::internal::checked_cast<arrow::StructArray*>(data_array.get());
     assert(struct_array);
     assert(struct_array->fields().size() == idx_in_target_schema.size());
@@ -323,14 +321,12 @@ Status FieldMappingReader::MappingFields(const std::shared_ptr<arrow::Array>& da
 
         // Filter map entries by selected keys if metadata is present.
         if (field_array->type()->id() == arrow::Type::MAP) {
-            std::set<std::string> selected_keys =
-                NestedProjectionUtils::GetMapSelectedKeys(
-                    read_fields_of_data_array[i].ArrowField());
+            std::set<std::string> selected_keys = NestedProjectionUtils::GetMapSelectedKeys(
+                read_fields_of_data_array[i].ArrowField());
             if (!selected_keys.empty()) {
-                PAIMON_ASSIGN_OR_RAISE(
-                    field_array,
-                    NestedProjectionUtils::FilterMapArrayBySelectedKeys(
-                        field_array, selected_keys));
+                PAIMON_ASSIGN_OR_RAISE(field_array,
+                                       NestedProjectionUtils::FilterMapArrayBySelectedKeys(
+                                           field_array, selected_keys));
             }
         }
 
diff --git a/src/paimon/core/io/field_mapping_reader.h b/src/paimon/core/io/field_mapping_reader.h
index 1ab7f41eb..3e7611a74 100644
--- a/src/paimon/core/io/field_mapping_reader.h
+++ b/src/paimon/core/io/field_mapping_reader.h
@@ -97,10 +97,10 @@ class FieldMappingReader : public FileBatchReader {
         const std::shared_ptr<arrow::Array>& src_array) const;
 
     static Status MappingFields(const std::shared_ptr<arrow::Array>& src_array,
-                               const std::vector<DataField>& read_fields_of_data_array,
-                               const std::vector<int32_t>& idx_in_target_schema,
-                               arrow::ArrayVector* target_array,
-                               std::vector<std::string>* target_field_names);
+                                const std::vector<DataField>& read_fields_of_data_array,
+                                const std::vector<int32_t>& idx_in_target_schema,
+                                arrow::ArrayVector* target_array,
+                                std::vector<std::string>* target_field_names);
 
  private:
     bool need_mapping_ = false;
diff --git a/src/paimon/core/operation/internal_read_context.cpp b/src/paimon/core/operation/internal_read_context.cpp
index a1b670827..2e73bf51a 100644
--- a/src/paimon/core/operation/internal_read_context.cpp
+++ b/src/paimon/core/operation/internal_read_context.cpp
@@ -42,9 +42,8 @@ Result<std::unique_ptr<InternalReadContext>> InternalReadContext::Create(
         // Nested column pruning path: user provided a projected C ArrowSchema
         // where STRUCT types may contain only a subset of sub-fields.
         // ImportSchema consumes the C schema — that's fine, it's one-shot usage.
-        PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(
-            std::shared_ptr<arrow::Schema> projected_schema,
-            arrow::ImportSchema(context->GetReadSchema()));
+        PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr<arrow::Schema> projected_schema,
+                                          arrow::ImportSchema(context->GetReadSchema()));
         PAIMON_ASSIGN_OR_RAISE(read_data_fields,
                                DataField::ConvertArrowSchemaToDataFields(projected_schema));
         // Validate that every top-level field exists in the table schema by field ID.
diff --git a/src/paimon/core/operation/internal_read_context_test.cpp b/src/paimon/core/operation/internal_read_context_test.cpp
index 4ef53baa8..35dbade05 100644
--- a/src/paimon/core/operation/internal_read_context_test.cpp
+++ b/src/paimon/core/operation/internal_read_context_test.cpp
@@ -103,7 +103,8 @@ TEST(InternalReadContext, TestReadWithRowTrackingAndScoreFields) {
         // test simple
         std::string path = paimon::test::GetDataDir() + "/orc/append_09.db/append_09";
         ReadContextBuilder context_builder(path);
-        context_builder.SetReadFieldNames({"f3", "f0", "_ROW_ID", "_SEQUENCE_NUMBER", "_INDEX_SCORE"});
+        context_builder.SetReadFieldNames(
+            {"f3", "f0", "_ROW_ID", "_SEQUENCE_NUMBER", "_INDEX_SCORE"});
         ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish());
         SchemaManager schema_manager(std::make_shared<LocalFileSystem>(), read_context->GetPath());
         ASSERT_OK_AND_ASSIGN(auto table_schema, schema_manager.ReadSchema(0));
diff --git a/src/paimon/core/utils/nested_projection_utils.cpp b/src/paimon/core/utils/nested_projection_utils.cpp
index 259ba0f0b..bb25a0477 100644
--- a/src/paimon/core/utils/nested_projection_utils.cpp
+++ b/src/paimon/core/utils/nested_projection_utils.cpp
@@ -70,34 +70,29 @@ Result<std::optional<std::shared_ptr<arrow::DataType>>> NestedProjectionUtils::P
         case arrow::Type::LIST: {
             const auto& read_list = static_cast<const arrow::ListType&>(*read_type);
             const auto& data_list = static_cast<const arrow::ListType&>(*data_type);
-            PAIMON_ASSIGN_OR_RAISE(
-                std::optional<std::shared_ptr<arrow::DataType>> pruned_elem,
-                PruneDataType(read_list.value_type(), data_list.value_type()));
+            PAIMON_ASSIGN_OR_RAISE(std::optional<std::shared_ptr<arrow::DataType>> pruned_elem,
+                                   PruneDataType(read_list.value_type(), data_list.value_type()));
             if (!pruned_elem.has_value()) {
                 return std::optional<std::shared_ptr<arrow::DataType>>(std::nullopt);
             }
-            std::shared_ptr<arrow::DataType> result_type = arrow::list(
-                arrow::field(data_list.value_field()->name(), pruned_elem.value(),
-                             data_list.value_field()->nullable(),
-                             data_list.value_field()->metadata()));
+            std::shared_ptr<arrow::DataType> result_type = arrow::list(arrow::field(
+                data_list.value_field()->name(), pruned_elem.value(),
+                data_list.value_field()->nullable(), data_list.value_field()->metadata()));
             return std::optional<std::shared_ptr<arrow::DataType>>(std::move(result_type));
         }
 
         case arrow::Type::MAP: {
             const auto& read_map = static_cast<const arrow::MapType&>(*read_type);
             const auto& data_map = static_cast<const arrow::MapType&>(*data_type);
-            PAIMON_ASSIGN_OR_RAISE(
-                std::optional<std::shared_ptr<arrow::DataType>> pruned_key,
-                PruneDataType(read_map.key_type(), data_map.key_type()));
-            PAIMON_ASSIGN_OR_RAISE(
-                std::optional<std::shared_ptr<arrow::DataType>> pruned_value,
-                PruneDataType(read_map.item_type(), data_map.item_type()));
+            PAIMON_ASSIGN_OR_RAISE(std::optional<std::shared_ptr<arrow::DataType>> pruned_key,
+                                   PruneDataType(read_map.key_type(), data_map.key_type()));
+            PAIMON_ASSIGN_OR_RAISE(std::optional<std::shared_ptr<arrow::DataType>> pruned_value,
+                                   PruneDataType(read_map.item_type(), data_map.item_type()));
             if (!pruned_key.has_value() || !pruned_value.has_value()) {
                 return std::optional<std::shared_ptr<arrow::DataType>>(std::nullopt);
             }
-            std::shared_ptr<arrow::DataType> result_type =
-                arrow::map(pruned_key.value(), pruned_value.value(),
-                           data_map.key_field()->nullable());
+            std::shared_ptr<arrow::DataType> result_type = arrow::map(
+                pruned_key.value(), pruned_value.value(), data_map.key_field()->nullable());
             return std::optional<std::shared_ptr<arrow::DataType>>(std::move(result_type));
         }
 
@@ -137,8 +132,8 @@ Result<std::shared_ptr<arrow::Array>> NestedProjectionUtils::PruneArray(
             PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(
                 std::shared_ptr<arrow::StructArray> result_struct,
                 arrow::StructArray::Make(pruned_children, pruned_fields,
-                                         struct_array->null_bitmap(),
-                                         struct_array->null_count(), struct_array->offset()));
+                                         struct_array->null_bitmap(), struct_array->null_count(),
+                                         struct_array->offset()));
             return std::static_pointer_cast<arrow::Array>(result_struct);
         }
 
@@ -150,9 +145,9 @@ Result<std::shared_ptr<arrow::Array>> NestedProjectionUtils::PruneArray(
                                    PruneArray(list_array->values(), target_elem_type));
             PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(
                 std::shared_ptr<arrow::ListArray> result_list,
-                arrow::ListArray::FromArrays(
-                    *list_array->offsets(), *pruned_values, arrow::default_memory_pool(),
-                    list_array->null_bitmap(), list_array->null_count()));
+                arrow::ListArray::FromArrays(*list_array->offsets(), *pruned_values,
+                                             arrow::default_memory_pool(),
+                                             list_array->null_bitmap(), list_array->null_count()));
             return std::static_pointer_cast<arrow::Array>(result_list);
         }
 
@@ -195,8 +190,7 @@ std::set<std::string> NestedProjectionUtils::GetMapSelectedKeys(
 }
 
 Result<std::shared_ptr<arrow::Array>> NestedProjectionUtils::FilterMapArrayBySelectedKeys(
-    const std::shared_ptr<arrow::Array>& array,
-    const std::set<std::string>& selected_keys) {
+    const std::shared_ptr<arrow::Array>& array, const std::set<std::string>& selected_keys) {
     if (selected_keys.empty() || !array || array->length() == 0) {
         return array;
     }
@@ -205,9 +199,9 @@ Result<std::shared_ptr<arrow::Array>> NestedProjectionUtils::FilterMapArrayBySel
     auto map_type = std::static_pointer_cast<arrow::MapType>(array->type());
 
     if (map_type->key_type()->id() != arrow::Type::STRING) {
-        return Status::Invalid(fmt::format(
-            "FilterMapArrayBySelectedKeys only supports string keys, got {}",
-            map_type->key_type()->ToString()));
+        return Status::Invalid(
+            fmt::format("FilterMapArrayBySelectedKeys only supports string keys, got {}",
+                        map_type->key_type()->ToString()));
     }
 
     auto keys_array = std::static_pointer_cast<arrow::StringArray>(map_array->keys());
@@ -279,16 +273,14 @@ Result<std::shared_ptr<arrow::Array>> NestedProjectionUtils::FilterMapArrayBySel
         filtered_keys = key_slices[0];
         filtered_values = value_slices[0];
     } else {
-        PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(filtered_keys,
-                                          arrow::Concatenate(key_slices));
-        PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(filtered_values,
-                                          arrow::Concatenate(value_slices));
+        PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(filtered_keys, arrow::Concatenate(key_slices));
+        PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(filtered_values, arrow::Concatenate(value_slices));
     }
 
     // Build new offsets array
     arrow::Int32Builder offset_builder;
-    PAIMON_RETURN_NOT_OK_FROM_ARROW(offset_builder.Reserve(
-        static_cast<int64_t>(new_offsets.size())));
+    PAIMON_RETURN_NOT_OK_FROM_ARROW(
+        offset_builder.Reserve(static_cast<int64_t>(new_offsets.size())));
     for (int32_t offset : new_offsets) {
         offset_builder.UnsafeAppend(offset);
     }
@@ -298,8 +290,7 @@ Result<std::shared_ptr<arrow::Array>> NestedProjectionUtils::FilterMapArrayBySel
     PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(
         std::shared_ptr<arrow::Array> result_map,
         arrow::MapArray::FromArrays(new_offsets_array, filtered_keys, filtered_values,
-                                    arrow::default_memory_pool(),
-                                    map_array->null_bitmap()));
+                                    arrow::default_memory_pool(), map_array->null_bitmap()));
     return result_map;
 }
 
diff --git a/src/paimon/core/utils/nested_projection_utils.h b/src/paimon/core/utils/nested_projection_utils.h
index fe35bdfcf..ad0f27996 100644
--- a/src/paimon/core/utils/nested_projection_utils.h
+++ b/src/paimon/core/utils/nested_projection_utils.h
@@ -45,8 +45,7 @@ class PAIMON_EXPORT NestedProjectionUtils {
         if (!result.ok()) {
             return -1;
         }
-        std::optional<int32_t> field_id =
-            StringUtils::StringToValue<int32_t>(result.ValueUnsafe());
+        std::optional<int32_t> field_id = StringUtils::StringToValue<int32_t>(result.ValueUnsafe());
         return field_id.value_or(-1);
     }
 
@@ -88,15 +87,13 @@ class PAIMON_EXPORT NestedProjectionUtils {
     /// Parse the "paimon.map.selected-keys" metadata from an Arrow field.
     /// Returns an empty set if the metadata key is absent or the field is not a MAP.
     /// The metadata value must be a JSON array of strings, e.g. '["key1","key2"]'.
-    static std::set<std::string> GetMapSelectedKeys(
-        const std::shared_ptr<arrow::Field>& field);
+    static std::set<std::string> GetMapSelectedKeys(const std::shared_ptr<arrow::Field>& field);
 
     /// Filter a MapArray so that only entries whose key is in `selected_keys` are kept.
     /// Only supports string-keyed maps. Returns the original array unchanged if
     /// `selected_keys` is empty.
     static Result<std::shared_ptr<arrow::Array>> FilterMapArrayBySelectedKeys(
-        const std::shared_ptr<arrow::Array>& map_array,
-        const std::set<std::string>& selected_keys);
+        const std::shared_ptr<arrow::Array>& map_array, const std::set<std::string>& selected_keys);
 };
 
 }  // namespace paimon
diff --git a/src/paimon/core/utils/nested_projection_utils_test.cpp b/src/paimon/core/utils/nested_projection_utils_test.cpp
index 9f68f28b2..911b22438 100644
--- a/src/paimon/core/utils/nested_projection_utils_test.cpp
+++ b/src/paimon/core/utils/nested_projection_utils_test.cpp
@@ -28,8 +28,8 @@ namespace paimon::test {
 
 // Helper: create an arrow::Field with paimon.id metadata
 static std::shared_ptr<arrow::Field> MakeField(const std::string& name,
-                                                const std::shared_ptr<arrow::DataType>& type,
-                                                int32_t paimon_id) {
+                                               const std::shared_ptr<arrow::DataType>& type,
+                                               int32_t paimon_id) {
     DataField data_field(paimon_id, arrow::field(name, type));
     return DataField::ConvertDataFieldToArrowField(data_field);
 }
@@ -53,8 +53,8 @@ TEST(NestedProjectionUtilsTest, GetPaimonFieldId_Nullptr) {
 // ============== FindFieldByPaimonId ==============
 
 TEST(NestedProjectionUtilsTest, FindFieldByPaimonId_Found) {
-    auto struct_type = arrow::struct_({MakeField("x", arrow::int32(), 1),
-                                       MakeField("y", arrow::utf8(), 2)});
+    auto struct_type =
+        arrow::struct_({MakeField("x", arrow::int32(), 1), MakeField("y", arrow::utf8(), 2)});
     auto found = NestedProjectionUtils::FindFieldByPaimonId(struct_type, 2);
     ASSERT_NE(found, nullptr);
     ASSERT_EQ(found->name(), "y");
@@ -91,9 +91,9 @@ TEST(NestedProjectionUtilsTest, PruneDataType_StructPruneSubset) {
     // data: STRUCT<x:INT(id=1), y:STRING(id=2), z:DOUBLE(id=3)>
     // read: STRUCT<x:INT(id=1)>
     // expected: STRUCT<x:INT(id=1)>
-    auto data_type = arrow::struct_({MakeField("x", arrow::int32(), 1),
-                                     MakeField("y", arrow::utf8(), 2),
-                                     MakeField("z", arrow::float64(), 3)});
+    auto data_type =
+        arrow::struct_({MakeField("x", arrow::int32(), 1), MakeField("y", arrow::utf8(), 2),
+                        MakeField("z", arrow::float64(), 3)});
     auto read_type = arrow::struct_({MakeField("x", arrow::int32(), 1)});
 
     ASSERT_OK_AND_ASSIGN(auto result, NestedProjectionUtils::PruneDataType(read_type, data_type));
@@ -117,8 +117,8 @@ TEST(NestedProjectionUtilsTest, PruneDataType_NestedStruct) {
     // data: STRUCT<inner:STRUCT<a:INT(id=10), b:STRING(id=11)>(id=1)>
     // read: STRUCT<inner:STRUCT<a:INT(id=10)>(id=1)>
     // expected: STRUCT<inner:STRUCT<a:INT(id=10)>(id=1)>
-    auto inner_data = arrow::struct_({MakeField("a", arrow::int32(), 10),
-                                      MakeField("b", arrow::utf8(), 11)});
+    auto inner_data =
+        arrow::struct_({MakeField("a", arrow::int32(), 10), MakeField("b", arrow::utf8(), 11)});
     auto data_type = arrow::struct_({MakeField("inner", inner_data, 1)});
 
     auto inner_read = arrow::struct_({MakeField("a", arrow::int32(), 10)});
@@ -135,8 +135,8 @@ TEST(NestedProjectionUtilsTest, PruneDataType_NestedStruct) {
 TEST(NestedProjectionUtilsTest, PruneDataType_ListWithStructElement) {
     // data: LIST<STRUCT<a:INT(id=10), b:STRING(id=11)>>
     // read: LIST<STRUCT<a:INT(id=10)>>
-    auto inner_data = arrow::struct_({MakeField("a", arrow::int32(), 10),
-                                      MakeField("b", arrow::utf8(), 11)});
+    auto inner_data =
+        arrow::struct_({MakeField("a", arrow::int32(), 10), MakeField("b", arrow::utf8(), 11)});
     auto data_type = arrow::list(arrow::field("item", inner_data));
 
     auto inner_read = arrow::struct_({MakeField("a", arrow::int32(), 10)});
@@ -153,8 +153,8 @@ TEST(NestedProjectionUtilsTest, PruneDataType_ListWithStructElement) {
 TEST(NestedProjectionUtilsTest, PruneDataType_MapWithStructValue) {
     // data: MAP<STRING, STRUCT<a:INT(id=10), b:STRING(id=11)>>
     // read: MAP<STRING, STRUCT<a:INT(id=10)>>
-    auto inner_data = arrow::struct_({MakeField("a", arrow::int32(), 10),
-                                      MakeField("b", arrow::utf8(), 11)});
+    auto inner_data =
+        arrow::struct_({MakeField("a", arrow::int32(), 10), MakeField("b", arrow::utf8(), 11)});
     auto data_type = arrow::map(arrow::utf8(), inner_data);
 
     auto inner_read = arrow::struct_({MakeField("a", arrow::int32(), 10)});
@@ -183,10 +183,9 @@ TEST(NestedProjectionUtilsTest, PruneArray_StructPrune) {
     std::shared_ptr<arrow::Array> y_array;
     ASSERT_TRUE(y_builder.Finish(&y_array).ok());
 
-    auto struct_type = arrow::struct_({arrow::field("x", arrow::int32()),
-                                       arrow::field("y", arrow::utf8())});
-    auto struct_result = arrow::StructArray::Make({x_array, y_array},
-                                                   struct_type->fields());
+    auto struct_type =
+        arrow::struct_({arrow::field("x", arrow::int32()), arrow::field("y", arrow::utf8())});
+    auto struct_result = arrow::StructArray::Make({x_array, y_array}, struct_type->fields());
     ASSERT_TRUE(struct_result.ok());
     auto struct_array = struct_result.ValueUnsafe();
 
@@ -212,10 +211,10 @@ TEST(NestedProjectionUtilsTest, PruneArray_IdenticalType) {
 // ============== GetMapSelectedKeys ==============
 
 TEST(NestedProjectionUtilsTest, GetMapSelectedKeys_Present) {
-    auto metadata = arrow::KeyValueMetadata::Make(
-        {DataField::MAP_SELECTED_KEYS}, {"key1,key2,key3"});
-    auto field = arrow::field("m", arrow::map(arrow::utf8(), arrow::int32()), /*nullable=*/true,
-                              metadata);
+    auto metadata =
+        arrow::KeyValueMetadata::Make({DataField::MAP_SELECTED_KEYS}, {"key1,key2,key3"});
+    auto field =
+        arrow::field("m", arrow::map(arrow::utf8(), arrow::int32()), /*nullable=*/true, metadata);
     auto keys = NestedProjectionUtils::GetMapSelectedKeys(field);
     ASSERT_EQ(keys.size(), 3);
     ASSERT_TRUE(keys.count("key1"));
@@ -230,10 +229,9 @@ TEST(NestedProjectionUtilsTest, GetMapSelectedKeys_Absent) {
 }
 
 TEST(NestedProjectionUtilsTest, GetMapSelectedKeys_EmptyString) {
-    auto metadata = arrow::KeyValueMetadata::Make(
-        {DataField::MAP_SELECTED_KEYS}, {""});
-    auto field = arrow::field("m", arrow::map(arrow::utf8(), arrow::int32()), /*nullable=*/true,
-                              metadata);
+    auto metadata = arrow::KeyValueMetadata::Make({DataField::MAP_SELECTED_KEYS}, {""});
+    auto field =
+        arrow::field("m", arrow::map(arrow::utf8(), arrow::int32()), /*nullable=*/true, metadata);
     auto keys = NestedProjectionUtils::GetMapSelectedKeys(field);
     ASSERT_TRUE(keys.empty());
 }
@@ -276,7 +274,8 @@ TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_Basic) {
     });
 
     std::set<std::string> selected = {"a", "c"};
-    ASSERT_OK_AND_ASSIGN(auto filtered, NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, selected));
+    ASSERT_OK_AND_ASSIGN(auto filtered,
+                         NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, selected));
 
     auto result = std::static_pointer_cast<arrow::MapArray>(filtered);
     ASSERT_EQ(result->length(), 2);
@@ -295,7 +294,8 @@ TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_Basic) {
 TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_EmptySelectedKeys) {
     auto map_array = BuildStringInt32MapArray({{{"a", 1}}});
     std::set<std::string> empty_keys;
-    ASSERT_OK_AND_ASSIGN(auto filtered, NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, empty_keys));
+    ASSERT_OK_AND_ASSIGN(
+        auto filtered, NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, empty_keys));
     // Should return original array unchanged
     ASSERT_EQ(filtered.get(), map_array.get());
 }
@@ -303,7 +303,8 @@ TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_EmptySelectedKeys)
 TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_AllKept) {
     auto map_array = BuildStringInt32MapArray({{{"a", 1}, {"b", 2}}});
     std::set<std::string> selected = {"a", "b"};
-    ASSERT_OK_AND_ASSIGN(auto filtered, NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, selected));
+    ASSERT_OK_AND_ASSIGN(auto filtered,
+                         NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, selected));
     // All entries match, should return original
     ASSERT_EQ(filtered.get(), map_array.get());
 }
@@ -311,7 +312,8 @@ TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_AllKept) {
 TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_NoneKept) {
     auto map_array = BuildStringInt32MapArray({{{"a", 1}, {"b", 2}}});
     std::set<std::string> selected = {"x", "y"};
-    ASSERT_OK_AND_ASSIGN(auto filtered, NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, selected));
+    ASSERT_OK_AND_ASSIGN(auto filtered,
+                         NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, selected));
     auto result = std::static_pointer_cast<arrow::MapArray>(filtered);
     ASSERT_EQ(result->length(), 1);
     ASSERT_EQ(result->value_length(0), 0);
@@ -319,12 +321,12 @@ TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_NoneKept) {
 
 TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_WithNull) {
     // maps[0] = {"a":1}, maps[1] = null, maps[2] = {"b":2,"c":3}
-    auto map_array = BuildStringInt32MapArray(
-        {{{"a", 1}}, {}, {{"b", 2}, {"c", 3}}},
-        {true, false, true});
+    auto map_array =
+        BuildStringInt32MapArray({{{"a", 1}}, {}, {{"b", 2}, {"c", 3}}}, {true, false, true});
 
     std::set<std::string> selected = {"a", "c"};
-    ASSERT_OK_AND_ASSIGN(auto filtered, NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, selected));
+    ASSERT_OK_AND_ASSIGN(auto filtered,
+                         NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, selected));
     auto result = std::static_pointer_cast<arrow::MapArray>(filtered);
     ASSERT_EQ(result->length(), 3);
     // maps[0] = {"a":1}
@@ -340,7 +342,8 @@ TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_WithNull) {
 TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_EmptyArray) {
     auto map_array = BuildStringInt32MapArray({});
     std::set<std::string> selected = {"a"};
-    ASSERT_OK_AND_ASSIGN(auto filtered, NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, selected));
+    ASSERT_OK_AND_ASSIGN(auto filtered,
+                         NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, selected));
     ASSERT_EQ(filtered->length(), 0);
 }
 
diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.cpp b/src/paimon/format/parquet/parquet_file_batch_reader.cpp
index 2c68a5256..406ff018d 100644
--- a/src/paimon/format/parquet/parquet_file_batch_reader.cpp
+++ b/src/paimon/format/parquet/parquet_file_batch_reader.cpp
@@ -95,11 +95,12 @@ Result<std::shared_ptr<arrow::Array>> PruneArrayToReadType(
             pruned_children.reserve(target_struct_type->num_fields());
             pruned_fields.reserve(target_struct_type->num_fields());
             for (const auto& target_field : target_struct_type->fields()) {
-                auto src_field = FindMatchingReadField(struct_array->type()->fields(), target_field);
+                auto src_field =
+                    FindMatchingReadField(struct_array->type()->fields(), target_field);
                 if (!src_field) {
-                    return Status::Invalid(fmt::format(
-                        "PruneArrayToReadType: field '{}' not found in struct array",
-                        target_field->name()));
+                    return Status::Invalid(
+                        fmt::format("PruneArrayToReadType: field '{}' not found in struct array",
+                                    target_field->name()));
                 }
                 auto child = struct_array->GetFieldByName(src_field->name());
                 PAIMON_ASSIGN_OR_RAISE(auto pruned_child,
@@ -132,11 +133,12 @@ Result<std::shared_ptr<arrow::Array>> PruneArrayToReadType(
         case arrow::Type::MAP: {
             auto map_array = std::static_pointer_cast<arrow::MapArray>(array);
             const auto& target_map_type = static_cast<const arrow::MapType&>(*target_type);
-            PAIMON_ASSIGN_OR_RAISE(auto pruned_keys,
-                                   PruneArrayToReadType(map_array->keys(), target_map_type.key_type()));
-            PAIMON_ASSIGN_OR_RAISE(auto pruned_items,
-                                   PruneArrayToReadType(map_array->items(),
-                                                        target_map_type.item_type()));
+            PAIMON_ASSIGN_OR_RAISE(
+                auto pruned_keys,
+                PruneArrayToReadType(map_array->keys(), target_map_type.key_type()));
+            PAIMON_ASSIGN_OR_RAISE(
+                auto pruned_items,
+                PruneArrayToReadType(map_array->items(), target_map_type.item_type()));
             PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(
                 std::shared_ptr<arrow::Array> result_map,
                 arrow::MapArray::FromArrays(map_array->offsets(), pruned_keys, pruned_items,
diff --git a/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp b/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp
index 21c14d1aa..f0653fb22 100644
--- a/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp
+++ b/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp
@@ -302,15 +302,13 @@ TEST_F(ParquetFileBatchReaderTest, TestSetReadSchemaWithLegacyParquetMissingFiel
 
     std::shared_ptr<arrow::ChunkedArray> expected_array;
     ASSERT_TRUE(arrow::ipc::internal::json::ChunkedArrayFromJSON(
-                    arrow::struct_(read_schema->fields()),
-                    {R"([
+                    arrow::struct_(read_schema->fields()), {R"([
         ["Lucy", 1, 14.1]
     ])"},
                     &expected_array)
                     .ok());
     ASSERT_TRUE(result_array->Equals(expected_array))
-        << "expected: " << expected_array->ToString() << "\nactual: "
-        << result_array->ToString();
+        << "expected: " << expected_array->ToString() << "\nactual: " << result_array->ToString();
 }
 
 TEST_F(ParquetFileBatchReaderTest, TestNextBatchSimple) {
diff --git a/test/inte/global_index_test.cpp b/test/inte/global_index_test.cpp
index 51afdc2de..d50388424 100644
--- a/test/inte/global_index_test.cpp
+++ b/test/inte/global_index_test.cpp
@@ -195,7 +195,9 @@ class GlobalIndexTest : public ::testing::Test, public ::testing::WithParamInter
                     const std::shared_ptr<Plan>& result_plan) const {
         auto splits = result_plan->Splits();
         ReadContextBuilder read_context_builder(table_path);
-        read_context_builder.SetReadFieldNames(read_schema).SetPredicate(predicate).WithFileSystem(fs_);
+        read_context_builder.SetReadFieldNames(read_schema)
+            .SetPredicate(predicate)
+            .WithFileSystem(fs_);
         PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<ReadContext> read_context,
                                read_context_builder.Finish());
         PAIMON_ASSIGN_OR_RAISE(auto table_read, TableRead::Create(std::move(read_context)));
diff --git a/test/inte/nested_column_pruning_inte_test.cpp b/test/inte/nested_column_pruning_inte_test.cpp
index b6e1cf564..9e1f8ccc4 100644
--- a/test/inte/nested_column_pruning_inte_test.cpp
+++ b/test/inte/nested_column_pruning_inte_test.cpp
@@ -49,22 +49,23 @@ class RecordBatch;
 
 namespace paimon::test {
 
-class NestedColumnPruningInteTest
-    : public ::testing::Test,
-      public ::testing::WithParamInterface<std::string> {
+class NestedColumnPruningInteTest : public ::testing::Test,
+                                    public ::testing::WithParamInterface<std::string> {
     void SetUp() override {
         file_format_ = GetParam();
         dir_ = UniqueTestDirectory::Create("local");
         test_dir_ = dir_->Str();
         table_path_ = PathUtil::JoinPath(test_dir_, "foo.db/bar");
     }
-    void TearDown() override { dir_.reset(); }
+    void TearDown() override {
+        dir_.reset();
+    }
 
  protected:
-    static std::shared_ptr<arrow::Field> AnnotateField(
-        const std::shared_ptr<arrow::Field>& field, int32_t paimon_id) {
-        auto metadata = arrow::KeyValueMetadata::Make(
-            {DataField::FIELD_ID}, {std::to_string(paimon_id)});
+    static std::shared_ptr<arrow::Field> AnnotateField(const std::shared_ptr<arrow::Field>& field,
+                                                       int32_t paimon_id) {
+        auto metadata =
+            arrow::KeyValueMetadata::Make({DataField::FIELD_ID}, {std::to_string(paimon_id)});
         if (field->metadata()) {
             auto merged = field->metadata()->Merge(*metadata);
             return field->WithMetadata(merged);
@@ -100,9 +101,8 @@ TEST_P(NestedColumnPruningInteTest, PruneStructSubFields) {
     };
 
     ASSERT_OK_AND_ASSIGN(
-        auto helper,
-        TestHelper::Create(test_dir_, table_schema, /*partition_keys=*/{},
-                           /*primary_keys=*/{}, options, /*is_streaming_mode=*/false));
+        auto helper, TestHelper::Create(test_dir_, table_schema, /*partition_keys=*/{},
+                                        /*primary_keys=*/{}, options, /*is_streaming_mode=*/false));
 
     // Write data
     std::string data = R"([
@@ -111,20 +111,17 @@ TEST_P(NestedColumnPruningInteTest, PruneStructSubFields) {
         [3, [30, "foo", 3.3]],
         [4, [40, "bar", 4.4]]
     ])";
-    ASSERT_OK_AND_ASSIGN(
-        auto batch,
-        TestHelper::MakeRecordBatch(arrow::struct_(table_fields), data,
-                                    /*partition_map=*/{}, /*bucket=*/0, {}));
+    ASSERT_OK_AND_ASSIGN(auto batch,
+                         TestHelper::MakeRecordBatch(arrow::struct_(table_fields), data,
+                                                     /*partition_map=*/{}, /*bucket=*/0, {}));
     int64_t commit_identifier = 0;
-    ASSERT_OK_AND_ASSIGN(
-        auto commit_msgs,
-        helper->WriteAndCommit(std::move(batch), commit_identifier++,
-                               /*expected_commit_messages=*/std::nullopt));
+    ASSERT_OK_AND_ASSIGN(auto commit_msgs,
+                         helper->WriteAndCommit(std::move(batch), commit_identifier++,
+                                                /*expected_commit_messages=*/std::nullopt));
 
     // Scan to get splits
-    ASSERT_OK_AND_ASSIGN(
-        auto data_splits,
-        helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt));
+    ASSERT_OK_AND_ASSIGN(auto data_splits,
+                         helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt));
     ASSERT_FALSE(data_splits.empty());
 
     // Build projected schema: only read f0 (full) and f1.a (sub-field of struct)
@@ -148,8 +145,7 @@ TEST_P(NestedColumnPruningInteTest, PruneStructSubFields) {
     ASSERT_OK_AND_ASSIGN(auto read_context, read_context_builder.Finish());
     ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context)));
     ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(data_splits));
-    ASSERT_OK_AND_ASSIGN(auto read_result,
-                         ReadResultCollector::CollectResult(batch_reader.get()));
+    ASSERT_OK_AND_ASSIGN(auto read_result, ReadResultCollector::CollectResult(batch_reader.get()));
 
     // Expected: struct with _VALUE_KIND, f0, f1{a}
     arrow::FieldVector expected_fields = {
@@ -200,28 +196,24 @@ TEST_P(NestedColumnPruningInteTest, PruneEntireStructField) {
     };
 
     ASSERT_OK_AND_ASSIGN(
-        auto helper,
-        TestHelper::Create(test_dir_, table_schema, /*partition_keys=*/{},
-                           /*primary_keys=*/{}, options, /*is_streaming_mode=*/false));
+        auto helper, TestHelper::Create(test_dir_, table_schema, /*partition_keys=*/{},
+                                        /*primary_keys=*/{}, options, /*is_streaming_mode=*/false));
 
     std::string data = R"([
         [100, [1, "aa"], 0.1],
         [200, [2, "bb"], 0.2],
         [300, [3, "cc"], 0.3]
     ])";
-    ASSERT_OK_AND_ASSIGN(
-        auto batch,
-        TestHelper::MakeRecordBatch(arrow::struct_(table_fields), data,
-                                    /*partition_map=*/{}, /*bucket=*/0, {}));
+    ASSERT_OK_AND_ASSIGN(auto batch,
+                         TestHelper::MakeRecordBatch(arrow::struct_(table_fields), data,
+                                                     /*partition_map=*/{}, /*bucket=*/0, {}));
     int64_t commit_identifier = 0;
-    ASSERT_OK_AND_ASSIGN(
-        auto commit_msgs,
-        helper->WriteAndCommit(std::move(batch), commit_identifier++,
-                               /*expected_commit_messages=*/std::nullopt));
+    ASSERT_OK_AND_ASSIGN(auto commit_msgs,
+                         helper->WriteAndCommit(std::move(batch), commit_identifier++,
+                                                /*expected_commit_messages=*/std::nullopt));
 
-    ASSERT_OK_AND_ASSIGN(
-        auto data_splits,
-        helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt));
+    ASSERT_OK_AND_ASSIGN(auto data_splits,
+                         helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt));
 
     // Only read f0 and f2, skip f1 entirely.
     // IDs: f0->0, f1->1, f1.x->2, f1.y->3, f2->4
@@ -239,8 +231,7 @@ TEST_P(NestedColumnPruningInteTest, PruneEntireStructField) {
     ASSERT_OK_AND_ASSIGN(auto read_context, read_context_builder.Finish());
     ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context)));
     ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(data_splits));
-    ASSERT_OK_AND_ASSIGN(auto read_result,
-                         ReadResultCollector::CollectResult(batch_reader.get()));
+    ASSERT_OK_AND_ASSIGN(auto read_result, ReadResultCollector::CollectResult(batch_reader.get()));
 
     arrow::FieldVector expected_fields = {
         arrow::field("_VALUE_KIND", arrow::int8()),
@@ -293,28 +284,24 @@ TEST_P(NestedColumnPruningInteTest, PruneDeepNestedStruct) {
     };
 
     ASSERT_OK_AND_ASSIGN(
-        auto helper,
-        TestHelper::Create(test_dir_, table_schema, /*partition_keys=*/{},
-                           /*primary_keys=*/{}, options, /*is_streaming_mode=*/false));
+        auto helper, TestHelper::Create(test_dir_, table_schema, /*partition_keys=*/{},
+                                        /*primary_keys=*/{}, options, /*is_streaming_mode=*/false));
 
     std::string data = R"([
         [1, [10, [100, "aaa"]]],
         [2, [20, [200, "bbb"]]],
         [3, [30, [300, "ccc"]]]
     ])";
-    ASSERT_OK_AND_ASSIGN(
-        auto batch,
-        TestHelper::MakeRecordBatch(arrow::struct_(table_fields), data,
-                                    /*partition_map=*/{}, /*bucket=*/0, {}));
+    ASSERT_OK_AND_ASSIGN(auto batch,
+                         TestHelper::MakeRecordBatch(arrow::struct_(table_fields), data,
+                                                     /*partition_map=*/{}, /*bucket=*/0, {}));
     int64_t commit_identifier = 0;
-    ASSERT_OK_AND_ASSIGN(
-        auto commit_msgs,
-        helper->WriteAndCommit(std::move(batch), commit_identifier++,
-                               /*expected_commit_messages=*/std::nullopt));
+    ASSERT_OK_AND_ASSIGN(auto commit_msgs,
+                         helper->WriteAndCommit(std::move(batch), commit_identifier++,
+                                                /*expected_commit_messages=*/std::nullopt));
 
-    ASSERT_OK_AND_ASSIGN(
-        auto data_splits,
-        helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt));
+    ASSERT_OK_AND_ASSIGN(auto data_splits,
+                         helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt));
 
     // Field IDs (assigned sequentially by catalog):
     // f0->0, f1->1, f1.a->2, f1.inner->3, f1.inner.x->4, f1.inner.y->5
@@ -340,17 +327,16 @@ TEST_P(NestedColumnPruningInteTest, PruneDeepNestedStruct) {
     ASSERT_OK_AND_ASSIGN(auto read_context, read_context_builder.Finish());
     ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context)));
     ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(data_splits));
-    ASSERT_OK_AND_ASSIGN(auto read_result,
-                         ReadResultCollector::CollectResult(batch_reader.get()));
+    ASSERT_OK_AND_ASSIGN(auto read_result, ReadResultCollector::CollectResult(batch_reader.get()));
 
     arrow::FieldVector expected_fields = {
         arrow::field("_VALUE_KIND", arrow::int8()),
         arrow::field("f0", arrow::int32()),
         arrow::field("f1", arrow::struct_({
-            arrow::field("inner", arrow::struct_({
-                arrow::field("x", arrow::int64()),
-            })),
-        })),
+                               arrow::field("inner", arrow::struct_({
+                                                         arrow::field("x", arrow::int64()),
+                                                     })),
+                           })),
     };
     auto expected_type = arrow::struct_(expected_fields);
     std::string expected_data = R"([
@@ -391,9 +377,8 @@ TEST_P(NestedColumnPruningInteTest, MapSelectedKeys) {
     };
 
     ASSERT_OK_AND_ASSIGN(
-        auto helper,
-        TestHelper::Create(test_dir_, table_schema, /*partition_keys=*/{},
-                           /*primary_keys=*/{}, options, /*is_streaming_mode=*/false));
+        auto helper, TestHelper::Create(test_dir_, table_schema, /*partition_keys=*/{},
+                                        /*primary_keys=*/{}, options, /*is_streaming_mode=*/false));
 
     // Write data: each row has a map with keys "a", "b", "c"
     std::string data = R"([
@@ -401,30 +386,28 @@ TEST_P(NestedColumnPruningInteTest, MapSelectedKeys) {
         [2, [["a", 100], ["c", 300]]],
         [3, [["b", 200], ["c", 400], ["d", 500]]]
     ])";
-    ASSERT_OK_AND_ASSIGN(
-        auto batch,
-        TestHelper::MakeRecordBatch(arrow::struct_(table_fields), data,
-                                    /*partition_map=*/{}, /*bucket=*/0, {}));
+    ASSERT_OK_AND_ASSIGN(auto batch,
+                         TestHelper::MakeRecordBatch(arrow::struct_(table_fields), data,
+                                                     /*partition_map=*/{}, /*bucket=*/0, {}));
     int64_t commit_identifier = 0;
-    ASSERT_OK_AND_ASSIGN(
-        auto commit_msgs,
-        helper->WriteAndCommit(std::move(batch), commit_identifier++,
-                               /*expected_commit_messages=*/std::nullopt));
+    ASSERT_OK_AND_ASSIGN(auto commit_msgs,
+                         helper->WriteAndCommit(std::move(batch), commit_identifier++,
+                                                /*expected_commit_messages=*/std::nullopt));
 
     // Scan to get splits
-    ASSERT_OK_AND_ASSIGN(
-        auto data_splits,
-        helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt));
+    ASSERT_OK_AND_ASSIGN(auto data_splits,
+                         helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt));
     ASSERT_FALSE(data_splits.empty());
 
     // Build projected schema: read f0 and f1 with selected keys "a,c"
-    auto selected_keys_metadata = arrow::KeyValueMetadata::Make(
-        {DataField::MAP_SELECTED_KEYS}, {"a,c"});
+    auto selected_keys_metadata =
+        arrow::KeyValueMetadata::Make({DataField::MAP_SELECTED_KEYS}, {"a,c"});
     arrow::FieldVector projected_fields = {
         AnnotateField(arrow::field("f0", arrow::int32()), 0),
-        AnnotateField(arrow::field("f1", map_type), 1)->WithMetadata(
-            AnnotateField(arrow::field("f1", map_type), 1)
-                ->metadata()->Merge(*selected_keys_metadata)),
+        AnnotateField(arrow::field("f1", map_type), 1)
+            ->WithMetadata(AnnotateField(arrow::field("f1", map_type), 1)
+                               ->metadata()
+                               ->Merge(*selected_keys_metadata)),
     };
     auto projected_schema = arrow::schema(projected_fields);
 
@@ -438,8 +421,7 @@ TEST_P(NestedColumnPruningInteTest, MapSelectedKeys) {
     ASSERT_OK_AND_ASSIGN(auto read_context, read_context_builder.Finish());
     ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context)));
     ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(data_splits));
-    ASSERT_OK_AND_ASSIGN(auto read_result,
-                         ReadResultCollector::CollectResult(batch_reader.get()));
+    ASSERT_OK_AND_ASSIGN(auto read_result, ReadResultCollector::CollectResult(batch_reader.get()));
 
     // Expected: only keys "a" and "c" remain in each map
     arrow::FieldVector expected_fields = {
@@ -470,7 +452,8 @@ TEST_P(NestedColumnPruningInteTest, MapSelectedKeys) {
 
 // Test: Deeper nested struct — prune sub-fields of a struct inside a struct inside another struct.
 TEST_P(NestedColumnPruningInteTest, PruneDeeperNestedStruct) {
-    // Table schema: f0 (int32), f1 (struct{a: int32, inner1: struct{x: int64, inner2: struct{p: utf8, q: float64}}})
+    // Table schema: f0 (int32),
+    // f1 (struct{a: int32, inner1: struct{x: int64, inner2: struct{p: utf8, q: float64}}})
     auto inner2_struct = arrow::struct_({
         arrow::field("p", arrow::utf8()),
         arrow::field("q", arrow::float64()),
@@ -497,31 +480,28 @@ TEST_P(NestedColumnPruningInteTest, PruneDeeperNestedStruct) {
     };
 
     ASSERT_OK_AND_ASSIGN(
-        auto helper,
-        TestHelper::Create(test_dir_, table_schema, /*partition_keys=*/{},
-                           /*primary_keys=*/{}, options, /*is_streaming_mode=*/false));
+        auto helper, TestHelper::Create(test_dir_, table_schema, /*partition_keys=*/{},
+                                        /*primary_keys=*/{}, options, /*is_streaming_mode=*/false));
 
     std::string data = R"([
         [1, [10, [100, ["ppp", 1.1]]]],
         [2, [20, [200, ["qqq", 2.2]]]],
         [3, [30, [300, ["rrr", 3.3]]]]
     ])";
-    ASSERT_OK_AND_ASSIGN(
-        auto batch,
-        TestHelper::MakeRecordBatch(arrow::struct_(table_fields), data,
-                                    /*partition_map=*/{}, /*bucket=*/0, {}));
+    ASSERT_OK_AND_ASSIGN(auto batch,
+                         TestHelper::MakeRecordBatch(arrow::struct_(table_fields), data,
+                                                     /*partition_map=*/{}, /*bucket=*/0, {}));
     int64_t commit_identifier = 0;
-    ASSERT_OK_AND_ASSIGN(
-        auto commit_msgs,
-        helper->WriteAndCommit(std::move(batch), commit_identifier++,
-                               /*expected_commit_messages=*/std::nullopt));
+    ASSERT_OK_AND_ASSIGN(auto commit_msgs,
+                         helper->WriteAndCommit(std::move(batch), commit_identifier++,
+                                                /*expected_commit_messages=*/std::nullopt));
 
-    ASSERT_OK_AND_ASSIGN(
-        auto data_splits,
-        helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt));
+    ASSERT_OK_AND_ASSIGN(auto data_splits,
+                         helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt));
 
     // Field IDs (assigned sequentially by catalog):
-    // f0->0, f1->1, f1.a->2, f1.inner1->3, f1.inner1.x->4, f1.inner1.inner2->5, f1.inner1.inner2.p->6, f1.inner1.inner2.q->7
+    // f0->0, f1->1, f1.a->2, f1.inner1->3, f1.inner1.x->4, f1.inner1.inner2->5,
+    // f1.inner1.inner2.p->6, f1.inner1.inner2.q->7
     //
     // Projected: f0, f1{inner1{inner2{p}}}
     auto pruned_inner2 = arrow::struct_({
@@ -547,19 +527,20 @@ TEST_P(NestedColumnPruningInteTest, PruneDeeperNestedStruct) {
     ASSERT_OK_AND_ASSIGN(auto read_context, read_context_builder.Finish());
     ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context)));
     ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(data_splits));
-    ASSERT_OK_AND_ASSIGN(auto read_result,
-                         ReadResultCollector::CollectResult(batch_reader.get()));
+    ASSERT_OK_AND_ASSIGN(auto read_result, ReadResultCollector::CollectResult(batch_reader.get()));
 
     arrow::FieldVector expected_fields = {
         arrow::field("_VALUE_KIND", arrow::int8()),
         arrow::field("f0", arrow::int32()),
-        arrow::field("f1", arrow::struct_({
-            arrow::field("inner1", arrow::struct_({
-                arrow::field("inner2", arrow::struct_({
-                    arrow::field("p", arrow::utf8()),
-                })),
-            })),
-        })),
+        arrow::field(
+            "f1", arrow::struct_({
+                      arrow::field("inner1",
+                                   arrow::struct_({
+                                       arrow::field("inner2", arrow::struct_({
+                                                                  arrow::field("p", arrow::utf8()),
+                                                              })),
+                                   })),
+                  })),
     };
     auto expected_type = arrow::struct_(expected_fields);
     std::string expected_data = R"([

From 9c308b9c9544db9c97e8d0e6921c7f1d7e1897f0 Mon Sep 17 00:00:00 2001
From: "yonghao.fyh" <yonghao.fyh@alibaba-inc.com>
Date: Tue, 16 Jun 2026 09:09:35 +0800
Subject: [PATCH 07/24] fix: resolve special fields by id or name in InternalReadContext

---
 include/paimon/read_context.h                 |   2 +-
 .../core/operation/internal_read_context.cpp  | 113 +++++++++++-------
 .../core/operation/internal_read_context.h    |   5 +
 .../operation/internal_read_context_test.cpp  |  45 +++++++
 test/inte/nested_column_pruning_inte_test.cpp |  98 +++++++++++++++
 5 files changed, 222 insertions(+), 41 deletions(-)

diff --git a/include/paimon/read_context.h b/include/paimon/read_context.h
index fcaec06ff..377c30969 100644
--- a/include/paimon/read_context.h
+++ b/include/paimon/read_context.h
@@ -167,7 +167,7 @@ class PAIMON_EXPORT ReadContext {
     PrefetchCacheMode prefetch_cache_mode_;
     CacheConfig cache_config_;
     std::shared_ptr<Cache> cache_;
-    ArrowSchema* read_schema_ = nullptr;  // C ABI schema for nested column pruning, not owned
+    ArrowSchema* read_schema_ = nullptr;
 };
 
 /// `ReadContextBuilder` used to build a `ReadContext`, has input validation.
diff --git a/src/paimon/core/operation/internal_read_context.cpp b/src/paimon/core/operation/internal_read_context.cpp
index 2e73bf51a..1252be215 100644
--- a/src/paimon/core/operation/internal_read_context.cpp
+++ b/src/paimon/core/operation/internal_read_context.cpp
@@ -16,6 +16,7 @@
 
 #include "paimon/core/operation/internal_read_context.h"
 
+#include <optional>
 #include <utility>
 
 #include "arrow/c/abi.h"
@@ -28,6 +29,59 @@
 #include "paimon/status.h"
 
 namespace paimon {
+
+std::optional<DataField> InternalReadContext::TryResolveSpecialFieldById(
+    int32_t field_id, const CoreOptions& core_options) {
+    if (field_id == SpecialFields::ValueKind().Id()) {
+        return SpecialFields::ValueKind();
+    }
+    if (field_id == SpecialFields::RowId().Id()) {
+        if (core_options.RowTrackingEnabled()) {
+            return SpecialFields::RowId();
+        }
+        return std::nullopt;
+    }
+    if (field_id == SpecialFields::SequenceNumber().Id()) {
+        if (core_options.RowTrackingEnabled() || core_options.KeyValueSequenceNumberEnabled()) {
+            return SpecialFields::SequenceNumber();
+        }
+        return std::nullopt;
+    }
+    if (field_id == SpecialFields::IndexScore().Id()) {
+        if (core_options.DataEvolutionEnabled()) {
+            return SpecialFields::IndexScore();
+        }
+        return std::nullopt;
+    }
+    return std::nullopt;
+}
+
+std::optional<DataField> InternalReadContext::TryResolveSpecialFieldByName(
+    const std::string& name, const CoreOptions& core_options) {
+    if (name == SpecialFields::ValueKind().Name()) {
+        return SpecialFields::ValueKind();
+    }
+    if (name == SpecialFields::RowId().Name()) {
+        if (core_options.RowTrackingEnabled()) {
+            return SpecialFields::RowId();
+        }
+        return std::nullopt;
+    }
+    if (name == SpecialFields::SequenceNumber().Name()) {
+        if (core_options.RowTrackingEnabled() || core_options.KeyValueSequenceNumberEnabled()) {
+            return SpecialFields::SequenceNumber();
+        }
+        return std::nullopt;
+    }
+    if (name == SpecialFields::IndexScore().Name()) {
+        if (core_options.DataEvolutionEnabled()) {
+            return SpecialFields::IndexScore();
+        }
+        return std::nullopt;
+    }
+    return std::nullopt;
+}
+
 Result<std::unique_ptr<InternalReadContext>> InternalReadContext::Create(
     const std::shared_ptr<ReadContext>& context, const std::shared_ptr<TableSchema>& table_schema,
     const std::map<std::string, std::string>& options) {
@@ -46,34 +100,28 @@ Result<std::unique_ptr<InternalReadContext>> InternalReadContext::Create(
                                           arrow::ImportSchema(context->GetReadSchema()));
         PAIMON_ASSIGN_OR_RAISE(read_data_fields,
                                DataField::ConvertArrowSchemaToDataFields(projected_schema));
-        // Validate that every top-level field exists in the table schema by field ID.
-        for (const auto& field : read_data_fields) {
-            if (!SpecialFields::IsSpecialFieldName(field.Name())) {
-                PAIMON_ASSIGN_OR_RAISE([[maybe_unused]] DataField unused,
-                                       table_schema->GetField(field.Id()));
+        // Align special-field validation with read_field_ids/read_field_names branches.
+        for (auto& field : read_data_fields) {
+            if (auto resolved_special_field =
+                    TryResolveSpecialFieldById(field.Id(), core_options)) {
+                field = *resolved_special_field;
+                continue;
+            }
+            if (SpecialFields::IsSpecialFieldName(field.Name())) {
+                if (auto resolved_special_field =
+                        TryResolveSpecialFieldByName(field.Name(), core_options)) {
+                    field = *resolved_special_field;
+                    continue;
+                }
             }
+            PAIMON_ASSIGN_OR_RAISE([[maybe_unused]] DataField unused,
+                                   table_schema->GetField(field.Id()));
         }
     } else if (!context->GetReadFieldIds().empty()) {
         read_data_fields.reserve(context->GetReadFieldIds().size());
         for (const auto& field_id : context->GetReadFieldIds()) {
-            // if enable row tracking or data evolution, check special fields
-            if (core_options.RowTrackingEnabled() && field_id == SpecialFields::RowId().Id()) {
-                read_data_fields.push_back(SpecialFields::RowId());
-                continue;
-            }
-            if ((core_options.RowTrackingEnabled() ||
-                 core_options.KeyValueSequenceNumberEnabled()) &&
-                field_id == SpecialFields::SequenceNumber().Id()) {
-                read_data_fields.push_back(SpecialFields::SequenceNumber());
-                continue;
-            }
-            if (field_id == SpecialFields::ValueKind().Id()) {
-                read_data_fields.push_back(SpecialFields::ValueKind());
-                continue;
-            }
-            if (core_options.DataEvolutionEnabled() &&
-                field_id == SpecialFields::IndexScore().Id()) {
-                read_data_fields.push_back(SpecialFields::IndexScore());
+            if (auto resolved_special_field = TryResolveSpecialFieldById(field_id, core_options)) {
+                read_data_fields.push_back(*resolved_special_field);
                 continue;
             }
             PAIMON_ASSIGN_OR_RAISE(DataField field, table_schema->GetField(field_id));
@@ -82,23 +130,8 @@ Result<std::unique_ptr<InternalReadContext>> InternalReadContext::Create(
     } else if (!context->GetReadFieldNames().empty()) {
         read_data_fields.reserve(context->GetReadFieldNames().size());
         for (const auto& name : context->GetReadFieldNames()) {
-            // if enable row tracking or data evolution, check special fields
-            if (core_options.RowTrackingEnabled() && name == SpecialFields::RowId().Name()) {
-                read_data_fields.push_back(SpecialFields::RowId());
-                continue;
-            }
-            if ((core_options.RowTrackingEnabled() ||
-                 core_options.KeyValueSequenceNumberEnabled()) &&
-                name == SpecialFields::SequenceNumber().Name()) {
-                read_data_fields.push_back(SpecialFields::SequenceNumber());
-                continue;
-            }
-            if (name == SpecialFields::ValueKind().Name()) {
-                read_data_fields.push_back(SpecialFields::ValueKind());
-                continue;
-            }
-            if (core_options.DataEvolutionEnabled() && name == SpecialFields::IndexScore().Name()) {
-                read_data_fields.push_back(SpecialFields::IndexScore());
+            if (auto resolved_special_field = TryResolveSpecialFieldByName(name, core_options)) {
+                read_data_fields.push_back(*resolved_special_field);
                 continue;
             }
             PAIMON_ASSIGN_OR_RAISE(DataField field, table_schema->GetField(name));
diff --git a/src/paimon/core/operation/internal_read_context.h b/src/paimon/core/operation/internal_read_context.h
index 12b734a62..f685137b6 100644
--- a/src/paimon/core/operation/internal_read_context.h
+++ b/src/paimon/core/operation/internal_read_context.h
@@ -112,6 +112,11 @@ class InternalReadContext {
                         const std::shared_ptr<arrow::Schema>& read_schema,
                         const CoreOptions& options);
 
+    static std::optional<DataField> TryResolveSpecialFieldById(int32_t field_id,
+                                                               const CoreOptions& core_options);
+    static std::optional<DataField> TryResolveSpecialFieldByName(const std::string& name,
+                                                                 const CoreOptions& core_options);
+
     std::shared_ptr<ReadContext> read_context_;
     std::shared_ptr<TableSchema> table_schema_;
     std::shared_ptr<arrow::Schema> read_schema_;
diff --git a/src/paimon/core/operation/internal_read_context_test.cpp b/src/paimon/core/operation/internal_read_context_test.cpp
index 35dbade05..ee4c555ac 100644
--- a/src/paimon/core/operation/internal_read_context_test.cpp
+++ b/src/paimon/core/operation/internal_read_context_test.cpp
@@ -192,4 +192,49 @@ TEST(InternalReadContext, TestReadWithFieldIdsAndSpecialFields) {
     }
 }
 
+TEST(InternalReadContext, TestReadWithProjectedSchemaAndSpecialFields) {
+    std::string path = paimon::test::GetDataDir() + "/orc/append_09.db/append_09";
+
+    std::vector<DataField> projected_fields = {DataField(0, arrow::field("f0", arrow::utf8())),
+                                               SpecialFields::RowId(),
+                                               SpecialFields::SequenceNumber(),
+                                               SpecialFields::IndexScore()};
+    auto schema_manager = SchemaManager(std::make_shared<LocalFileSystem>(), path);
+    ASSERT_OK_AND_ASSIGN(auto table_schema, schema_manager.ReadSchema(0));
+
+    // Without options, special fields should be rejected in projected-schema path too.
+    {
+        auto projected_schema = DataField::ConvertDataFieldsToArrowSchema(projected_fields);
+        ArrowSchema c_schema;
+        ASSERT_TRUE(arrow::ExportSchema(*projected_schema, &c_schema).ok());
+        ReadContextBuilder context_builder(path);
+        context_builder.SetReadSchema(&c_schema);
+        ASSERT_OK_AND_ASSIGN(auto unique_read_context, context_builder.Finish());
+        std::shared_ptr<ReadContext> read_context = std::move(unique_read_context);
+        ASSERT_NOK_WITH_MSG(InternalReadContext::Create(read_context, table_schema,
+                                                        table_schema->Options()),
+                            "not exist in table schema");
+    }
+
+    // With options enabled, projected-schema path should accept these special fields.
+    auto enabled_options = table_schema->Options();
+    enabled_options[Options::ROW_TRACKING_ENABLED] = "true";
+    enabled_options[Options::DATA_EVOLUTION_ENABLED] = "true";
+
+    {
+        auto projected_schema = DataField::ConvertDataFieldsToArrowSchema(projected_fields);
+        ArrowSchema c_schema;
+        ASSERT_TRUE(arrow::ExportSchema(*projected_schema, &c_schema).ok());
+        ReadContextBuilder context_builder(path);
+        context_builder.SetReadSchema(&c_schema);
+        ASSERT_OK_AND_ASSIGN(auto unique_read_context, context_builder.Finish());
+        std::shared_ptr<ReadContext> read_context = std::move(unique_read_context);
+        ASSERT_OK_AND_ASSIGN(auto internal_context,
+                             InternalReadContext::Create(read_context, table_schema,
+                                                         enabled_options));
+        auto expected_schema = DataField::ConvertDataFieldsToArrowSchema(projected_fields);
+        ASSERT_TRUE(internal_context->GetReadSchema()->Equals(expected_schema));
+    }
+}
+
 }  // namespace paimon::test
diff --git a/test/inte/nested_column_pruning_inte_test.cpp b/test/inte/nested_column_pruning_inte_test.cpp
index 9e1f8ccc4..ac789c664 100644
--- a/test/inte/nested_column_pruning_inte_test.cpp
+++ b/test/inte/nested_column_pruning_inte_test.cpp
@@ -26,6 +26,7 @@
 #include "arrow/c/bridge.h"
 #include "arrow/ipc/json_simple.h"
 #include "gtest/gtest.h"
+#include "paimon/common/table/special_fields.h"
 #include "paimon/common/types/data_field.h"
 #include "paimon/common/utils/path_util.h"
 #include "paimon/common/utils/string_utils.h"
@@ -359,6 +360,103 @@ TEST_P(NestedColumnPruningInteTest, PruneDeepNestedStruct) {
     ASSERT_TRUE(is_equal);
 }
 
+// Test: Nested projected schema with special fields under row tracking.
+TEST_P(NestedColumnPruningInteTest, PruneNestedStructWithSpecialFields) {
+    // Table schema: f0 (int32), f1 (struct{a: int32, inner: struct{x: int64, y: utf8}})
+    auto inner_struct = arrow::struct_({
+        arrow::field("x", arrow::int64()),
+        arrow::field("y", arrow::utf8()),
+    });
+    auto outer_struct = arrow::struct_({
+        arrow::field("a", arrow::int32()),
+        arrow::field("inner", inner_struct),
+    });
+    arrow::FieldVector table_fields = {
+        arrow::field("f0", arrow::int32()),
+        arrow::field("f1", outer_struct),
+    };
+    auto table_schema = arrow::schema(table_fields);
+
+    std::map<std::string, std::string> options = {
+        {Options::MANIFEST_FORMAT, "AVRO"},
+        {Options::FILE_FORMAT, StringUtils::ToUpperCase(file_format_)},
+        {Options::TARGET_FILE_SIZE, "1024"},
+        {Options::BUCKET, "-1"},
+        {Options::ROW_TRACKING_ENABLED, "true"},
+    };
+
+    ASSERT_OK_AND_ASSIGN(
+        auto helper, TestHelper::Create(test_dir_, table_schema, /*partition_keys=*/{},
+                                        /*primary_keys=*/{}, options, /*is_streaming_mode=*/false));
+
+    std::string data = R"([
+        [1, [10, [100, "aaa"]]],
+        [2, [20, [200, "bbb"]]],
+        [3, [30, [300, "ccc"]]]
+    ])";
+    ASSERT_OK_AND_ASSIGN(auto batch,
+                         TestHelper::MakeRecordBatch(arrow::struct_(table_fields), data,
+                                                     /*partition_map=*/{}, /*bucket=*/0, {}));
+    int64_t commit_identifier = 0;
+    ASSERT_OK_AND_ASSIGN(auto commit_msgs,
+                         helper->WriteAndCommit(std::move(batch), commit_identifier++,
+                                                /*expected_commit_messages=*/std::nullopt));
+
+    ASSERT_OK_AND_ASSIGN(auto data_splits,
+                         helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt));
+
+    // Field IDs (assigned sequentially by catalog):
+    // f0->0, f1->1, f1.a->2, f1.inner->3, f1.inner.x->4, f1.inner.y->5
+    // Projected: f0, f1{inner{x}}, _SEQUENCE_NUMBER, _ROW_ID
+    auto pruned_inner = arrow::struct_({
+        AnnotateField(arrow::field("x", arrow::int64()), 4),
+    });
+    auto pruned_outer = arrow::struct_({
+        AnnotateField(arrow::field("inner", pruned_inner), 3),
+    });
+    arrow::FieldVector projected_fields = {
+        AnnotateField(arrow::field("f0", arrow::int32()), 0),
+        AnnotateField(arrow::field("f1", pruned_outer), 1),
+        AnnotateField(arrow::field("_SEQUENCE_NUMBER", arrow::int64()),
+                      SpecialFields::SequenceNumber().Id()),
+        AnnotateField(arrow::field("_ROW_ID", arrow::int64()), SpecialFields::RowId().Id()),
+    };
+    auto projected_schema = arrow::schema(projected_fields);
+
+    ArrowSchema c_schema;
+    ASSERT_TRUE(arrow::ExportSchema(*projected_schema, &c_schema).ok());
+
+    ReadContextBuilder read_context_builder(table_path_);
+    read_context_builder.SetOptions(options).SetReadSchema(&c_schema);
+    ASSERT_OK_AND_ASSIGN(auto read_context, read_context_builder.Finish());
+    ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context)));
+    ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(data_splits));
+    ASSERT_OK_AND_ASSIGN(auto read_result, ReadResultCollector::CollectResult(batch_reader.get()));
+
+    ASSERT_EQ(read_result->num_chunks(), 1);
+    auto result_array = std::dynamic_pointer_cast<arrow::StructArray>(read_result->chunk(0));
+    ASSERT_TRUE(result_array);
+
+    ASSERT_TRUE(result_array->GetFieldByName("_SEQUENCE_NUMBER"));
+    ASSERT_TRUE(result_array->GetFieldByName("_ROW_ID"));
+    auto nested_col = result_array->GetFieldByName("f1");
+    ASSERT_TRUE(nested_col);
+
+    auto expected_nested_type = arrow::struct_({
+        arrow::field("inner", arrow::struct_({arrow::field("x", arrow::int64())})),
+    });
+    ASSERT_TRUE(nested_col->type()->Equals(expected_nested_type));
+
+    auto expected_nested_array =
+        arrow::ipc::internal::json::ArrayFromJSON(expected_nested_type, R"([
+            [[100]],
+            [[200]],
+            [[300]]
+        ])")
+            .ValueOrDie();
+    ASSERT_TRUE(nested_col->Equals(expected_nested_array));
+}
+
 // Test: Table has MAP<STRING, INT32> field, read with selected keys filter.
 TEST_P(NestedColumnPruningInteTest, MapSelectedKeys) {
     // Table schema: f0 (int32), f1 (map<string, int32>)

From b5301c4a86aced27566a705e00f9bc75a67dc709 Mon Sep 17 00:00:00 2001
From: "yonghao.fyh" <yonghao.fyh@alibaba-inc.com>
Date: Tue, 16 Jun 2026 09:11:54 +0800
Subject: [PATCH 08/24] style: reformat TestReadWithProjectedSchemaAndSpecialFields

---
 .../operation/internal_read_context_test.cpp  | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/src/paimon/core/operation/internal_read_context_test.cpp b/src/paimon/core/operation/internal_read_context_test.cpp
index ee4c555ac..18d1bdc5f 100644
--- a/src/paimon/core/operation/internal_read_context_test.cpp
+++ b/src/paimon/core/operation/internal_read_context_test.cpp
@@ -195,10 +195,9 @@ TEST(InternalReadContext, TestReadWithFieldIdsAndSpecialFields) {
 TEST(InternalReadContext, TestReadWithProjectedSchemaAndSpecialFields) {
     std::string path = paimon::test::GetDataDir() + "/orc/append_09.db/append_09";
 
-    std::vector<DataField> projected_fields = {DataField(0, arrow::field("f0", arrow::utf8())),
-                                               SpecialFields::RowId(),
-                                               SpecialFields::SequenceNumber(),
-                                               SpecialFields::IndexScore()};
+    std::vector<DataField> projected_fields = {
+        DataField(0, arrow::field("f0", arrow::utf8())), SpecialFields::RowId(),
+        SpecialFields::SequenceNumber(), SpecialFields::IndexScore()};
     auto schema_manager = SchemaManager(std::make_shared<LocalFileSystem>(), path);
     ASSERT_OK_AND_ASSIGN(auto table_schema, schema_manager.ReadSchema(0));
 
@@ -211,9 +210,9 @@ TEST(InternalReadContext, TestReadWithProjectedSchemaAndSpecialFields) {
         context_builder.SetReadSchema(&c_schema);
         ASSERT_OK_AND_ASSIGN(auto unique_read_context, context_builder.Finish());
         std::shared_ptr<ReadContext> read_context = std::move(unique_read_context);
-        ASSERT_NOK_WITH_MSG(InternalReadContext::Create(read_context, table_schema,
-                                                        table_schema->Options()),
-                            "not exist in table schema");
+        ASSERT_NOK_WITH_MSG(
+            InternalReadContext::Create(read_context, table_schema, table_schema->Options()),
+            "not exist in table schema");
     }
 
     // With options enabled, projected-schema path should accept these special fields.
@@ -229,9 +228,9 @@ TEST(InternalReadContext, TestReadWithProjectedSchemaAndSpecialFields) {
         context_builder.SetReadSchema(&c_schema);
         ASSERT_OK_AND_ASSIGN(auto unique_read_context, context_builder.Finish());
         std::shared_ptr<ReadContext> read_context = std::move(unique_read_context);
-        ASSERT_OK_AND_ASSIGN(auto internal_context,
-                             InternalReadContext::Create(read_context, table_schema,
-                                                         enabled_options));
+        ASSERT_OK_AND_ASSIGN(
+            auto internal_context,
+            InternalReadContext::Create(read_context, table_schema, enabled_options));
         auto expected_schema = DataField::ConvertDataFieldsToArrowSchema(projected_fields);
         ASSERT_TRUE(internal_context->GetReadSchema()->Equals(expected_schema));
     }

From 3d1905b9b7bbf3095d2d9cfe0ced4433e054c888 Mon Sep 17 00:00:00 2001
From: "yonghao.fyh" <yonghao.fyh@alibaba-inc.com>
Date: Tue, 16 Jun 2026 11:39:24 +0800
Subject: [PATCH 09/24] fix: remove PruneArray fallback and fail fast on nested projection mismatch

---
 src/paimon/core/io/field_mapping_reader.cpp   | 17 +++--
 .../core/io/field_mapping_reader_test.cpp     | 43 +++++++++++
 src/paimon/core/utils/field_mapping.cpp       |  4 +-
 .../core/utils/nested_projection_utils.cpp    | 68 -----------------
 .../core/utils/nested_projection_utils.h      |  7 --
 .../utils/nested_projection_utils_test.cpp    | 39 ----------
 .../format/avro/avro_file_batch_reader.cpp    | 25 +++++++
 .../avro/avro_file_batch_reader_test.cpp      | 33 +++++++++
 .../parquet/parquet_file_batch_reader.cpp     | 73 -------------------
 9 files changed, 115 insertions(+), 194 deletions(-)

diff --git a/src/paimon/core/io/field_mapping_reader.cpp b/src/paimon/core/io/field_mapping_reader.cpp
index 933a53e28..cc1a161fa 100644
--- a/src/paimon/core/io/field_mapping_reader.cpp
+++ b/src/paimon/core/io/field_mapping_reader.cpp
@@ -67,13 +67,20 @@ FieldMappingReader::FieldMappingReader(int32_t field_count,
         if (non_partition_info_.cast_executors[i] != nullptr) {
             need_casting_ = true;
         }
+        // Always keep mapping enabled for nested fields so we can validate
+        // that format readers really honor pushed nested projections.
+        auto type_id = non_partition_info_.non_partition_read_schema[i].Type()->id();
+        if (type_id == arrow::Type::STRUCT || type_id == arrow::Type::LIST ||
+            type_id == arrow::Type::MAP) {
+            need_mapping_ = true;
+        }
         // Field name change (RENAME COLUMN) also requires mapping: data schema
         // carries the file's physical name while read schema carries the
         // post-rename logical name. If we skipped mapping, the inner reader's
         // batch would be passed through with the old physical name and the
         // consumer's name-based lookup against the read schema would fail.
         // Nested type difference (nested column pruning) also requires mapping
-        // so that PruneArray can trim excess sub-fields from the format reader.
+        // so we can validate that format readers honor the pushed read schema.
         if (non_partition_info_.non_partition_data_schema[i].Name() !=
                 non_partition_info_.non_partition_read_schema[i].Name() ||
             !non_partition_info_.non_partition_data_schema[i].Type()->Equals(
@@ -311,12 +318,12 @@ Status FieldMappingReader::MappingFields(const std::shared_ptr<arrow::Array>& da
     for (size_t i = 0; i < idx_in_target_schema.size(); i++) {
         std::shared_ptr<arrow::Array> field_array = struct_array->field(i);
 
-        // Fallback nested pruning: if the format reader returned more nested
-        // sub-fields than requested, prune the excess here.
         const std::shared_ptr<arrow::DataType>& target_type = read_fields_of_data_array[i].Type();
         if (!field_array->type()->Equals(target_type)) {
-            PAIMON_ASSIGN_OR_RAISE(field_array,
-                                   NestedProjectionUtils::PruneArray(field_array, target_type));
+            return Status::Invalid(fmt::format(
+                "FieldMappingReader mapping failed: format reader returned type {} for field '{}' but expected {}. Nested sub-field projection must be handled by format SetReadSchema.",
+                field_array->type()->ToString(), read_fields_of_data_array[i].Name(),
+                target_type->ToString()));
         }
 
         // Filter map entries by selected keys if metadata is present.
diff --git a/src/paimon/core/io/field_mapping_reader_test.cpp b/src/paimon/core/io/field_mapping_reader_test.cpp
index 7b916b914..69359521a 100644
--- a/src/paimon/core/io/field_mapping_reader_test.cpp
+++ b/src/paimon/core/io/field_mapping_reader_test.cpp
@@ -45,6 +45,7 @@
 #include "paimon/memory/memory_pool.h"
 #include "paimon/predicate/literal.h"
 #include "paimon/predicate/predicate_builder.h"
+#include "paimon/testing/mock/mock_file_batch_reader.h"
 #include "paimon/testing/utils/binary_row_generator.h"
 #include "paimon/testing/utils/read_result_collector.h"
 #include "paimon/testing/utils/testharness.h"
@@ -623,6 +624,48 @@ TEST_F(FieldMappingReaderTest, TestReadWithSchemaEvolutionPureRename) {
                 /*partition_keys=*/{}, BinaryRow::EmptyRow(), expected);
 }
 
+TEST_F(FieldMappingReaderTest, TestNestedProjectionMismatchShouldFailFast) {
+    // File data has full nested struct f1{a,b}.
+    std::vector<DataField> data_fields = {
+        DataField(0, arrow::field("f0", arrow::int32())),
+        DataField(1, arrow::field("f1", arrow::struct_({arrow::field("a", arrow::int32()),
+                                                          arrow::field("b", arrow::utf8())})))
+    };
+    auto data_schema = DataField::ConvertDataFieldsToArrowSchema(data_fields);
+    auto data_array = std::dynamic_pointer_cast<arrow::StructArray>(
+        arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(data_schema->fields()), R"([
+        [1, [10, "x"]],
+        [2, [20, "y"]]
+    ])")
+            .ValueOrDie());
+
+    // Read schema requests pruned nested struct f1{a}.
+    std::vector<DataField> read_fields = {
+        DataField(0, arrow::field("f0", arrow::int32())),
+        DataField(1, arrow::field("f1", arrow::struct_({arrow::field("a", arrow::int32())})))
+    };
+    auto read_schema = DataField::ConvertDataFieldsToArrowSchema(read_fields);
+
+    ASSERT_OK_AND_ASSIGN(auto mapping_builder,
+                         FieldMappingBuilder::Create(read_schema, /*partition_keys=*/{},
+                                                     /*predicate=*/nullptr));
+    ASSERT_OK_AND_ASSIGN(auto mapping, mapping_builder->CreateFieldMapping(data_schema));
+
+    // Mock reader ignores SetReadSchema and still returns full nested payload.
+    auto mock_reader = std::make_unique<MockFileBatchReader>(
+        data_array, data_array->type(), /*read_batch_size=*/10);
+
+    auto reader = std::make_shared<FieldMappingReader>(
+        /*field_count=*/read_schema->num_fields(), std::move(mock_reader), BinaryRow::EmptyRow(),
+        std::move(mapping), pool_);
+
+    auto result = ReadResultCollector::CollectResult(reader.get());
+    ASSERT_FALSE(result.ok());
+    ASSERT_NE(result.status().ToString().find("Nested sub-field projection must be handled"),
+              std::string::npos)
+        << result.status().ToString();
+}
+
 TEST_F(FieldMappingReaderTest, TestReadWithSchemaEvolutionWithRenameAndModifyTypeAndPredicate) {
     // field_0 and field_3 are rename and modify type
     // result is not filtered by predicate, as DOUBLE->STRING alter table does not support predicate
diff --git a/src/paimon/core/utils/field_mapping.cpp b/src/paimon/core/utils/field_mapping.cpp
index d06ea793a..7f1be719b 100644
--- a/src/paimon/core/utils/field_mapping.cpp
+++ b/src/paimon/core/utils/field_mapping.cpp
@@ -162,8 +162,8 @@ Result<std::vector<std::shared_ptr<CastExecutor>>> FieldMappingBuilder::CreateDa
             if (read_type == FieldType::MAP || read_type == FieldType::ARRAY ||
                 read_type == FieldType::STRUCT) {
                 // Nested types may differ due to nested column pruning (different
-                // number of sub-fields). No cast is needed — pruning is handled
-                // separately by PruneDataType / PruneArray.
+                // number of sub-fields). No cast is needed — type pruning is
+                // handled by PruneDataType during field mapping construction.
                 cast_executors.push_back(nullptr);
                 continue;
             }
diff --git a/src/paimon/core/utils/nested_projection_utils.cpp b/src/paimon/core/utils/nested_projection_utils.cpp
index bb25a0477..b82eb3742 100644
--- a/src/paimon/core/utils/nested_projection_utils.cpp
+++ b/src/paimon/core/utils/nested_projection_utils.cpp
@@ -103,74 +103,6 @@ Result<std::optional<std::shared_ptr<arrow::DataType>>> NestedProjectionUtils::P
     }
 }
 
-// PruneArray — fallback for format readers that return extra nested columns
-
-Result<std::shared_ptr<arrow::Array>> NestedProjectionUtils::PruneArray(
-    const std::shared_ptr<arrow::Array>& array,
-    const std::shared_ptr<arrow::DataType>& target_type) {
-    if (!array || array->type()->Equals(target_type)) {
-        return array;
-    }
-
-    switch (target_type->id()) {
-        case arrow::Type::STRUCT: {
-            auto struct_array = std::static_pointer_cast<arrow::StructArray>(array);
-            arrow::ArrayVector pruned_children;
-            arrow::FieldVector pruned_fields;
-            for (const auto& target_field : target_type->fields()) {
-                std::shared_ptr<arrow::Array> child =
-                    struct_array->GetFieldByName(target_field->name());
-                if (!child) {
-                    return Status::Invalid(fmt::format(
-                        "PruneArray: field '{}' not found in struct array", target_field->name()));
-                }
-                PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> pruned_child,
-                                       PruneArray(child, target_field->type()));
-                pruned_children.push_back(std::move(pruned_child));
-                pruned_fields.push_back(target_field);
-            }
-            PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(
-                std::shared_ptr<arrow::StructArray> result_struct,
-                arrow::StructArray::Make(pruned_children, pruned_fields,
-                                         struct_array->null_bitmap(), struct_array->null_count(),
-                                         struct_array->offset()));
-            return std::static_pointer_cast<arrow::Array>(result_struct);
-        }
-
-        case arrow::Type::LIST: {
-            auto list_array = std::static_pointer_cast<arrow::ListArray>(array);
-            const auto& target_elem_type =
-                static_cast<const arrow::ListType&>(*target_type).value_type();
-            PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> pruned_values,
-                                   PruneArray(list_array->values(), target_elem_type));
-            PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(
-                std::shared_ptr<arrow::ListArray> result_list,
-                arrow::ListArray::FromArrays(*list_array->offsets(), *pruned_values,
-                                             arrow::default_memory_pool(),
-                                             list_array->null_bitmap(), list_array->null_count()));
-            return std::static_pointer_cast<arrow::Array>(result_list);
-        }
-
-        case arrow::Type::MAP: {
-            auto map_array = std::static_pointer_cast<arrow::MapArray>(array);
-            const auto& target_map_type = static_cast<const arrow::MapType&>(*target_type);
-            PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> pruned_keys,
-                                   PruneArray(map_array->keys(), target_map_type.key_type()));
-            PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> pruned_items,
-                                   PruneArray(map_array->items(), target_map_type.item_type()));
-            PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(
-                std::shared_ptr<arrow::Array> result_map,
-                arrow::MapArray::FromArrays(map_array->offsets(), pruned_keys, pruned_items,
-                                            arrow::default_memory_pool()));
-            return result_map;
-        }
-
-        default:
-            // Atomic type — no pruning needed.
-            return array;
-    }
-}
-
 // Map selected-keys support
 
 std::set<std::string> NestedProjectionUtils::GetMapSelectedKeys(
diff --git a/src/paimon/core/utils/nested_projection_utils.h b/src/paimon/core/utils/nested_projection_utils.h
index ad0f27996..ee0e53976 100644
--- a/src/paimon/core/utils/nested_projection_utils.h
+++ b/src/paimon/core/utils/nested_projection_utils.h
@@ -77,13 +77,6 @@ class PAIMON_EXPORT NestedProjectionUtils {
         const std::shared_ptr<arrow::DataType>& read_type,
         const std::shared_ptr<arrow::DataType>& data_type);
 
-    /// Prune a StructArray so that only the sub-fields present in `target_type`
-    /// are kept. Used as a fallback when the format reader returns more columns
-    /// than requested.
-    static Result<std::shared_ptr<arrow::Array>> PruneArray(
-        const std::shared_ptr<arrow::Array>& array,
-        const std::shared_ptr<arrow::DataType>& target_type);
-
     /// Parse the "paimon.map.selected-keys" metadata from an Arrow field.
     /// Returns an empty set if the metadata key is absent or the field is not a MAP.
     /// The metadata value must be a JSON array of strings, e.g. '["key1","key2"]'.
diff --git a/src/paimon/core/utils/nested_projection_utils_test.cpp b/src/paimon/core/utils/nested_projection_utils_test.cpp
index 911b22438..00da2dd3e 100644
--- a/src/paimon/core/utils/nested_projection_utils_test.cpp
+++ b/src/paimon/core/utils/nested_projection_utils_test.cpp
@@ -169,45 +169,6 @@ TEST(NestedProjectionUtilsTest, PruneDataType_MapWithStructValue) {
     ASSERT_EQ(map_type->item_type()->field(0)->name(), "a");
 }
 
-// ============== PruneArray ==============
-
-TEST(NestedProjectionUtilsTest, PruneArray_StructPrune) {
-    // Build a StructArray with fields x:INT, y:STRING
-    arrow::Int32Builder x_builder;
-    ASSERT_TRUE(x_builder.AppendValues({1, 2, 3}).ok());
-    std::shared_ptr<arrow::Array> x_array;
-    ASSERT_TRUE(x_builder.Finish(&x_array).ok());
-
-    arrow::StringBuilder y_builder;
-    ASSERT_TRUE(y_builder.AppendValues({"a", "b", "c"}).ok());
-    std::shared_ptr<arrow::Array> y_array;
-    ASSERT_TRUE(y_builder.Finish(&y_array).ok());
-
-    auto struct_type =
-        arrow::struct_({arrow::field("x", arrow::int32()), arrow::field("y", arrow::utf8())});
-    auto struct_result = arrow::StructArray::Make({x_array, y_array}, struct_type->fields());
-    ASSERT_TRUE(struct_result.ok());
-    auto struct_array = struct_result.ValueUnsafe();
-
-    // Prune to only keep "x"
-    auto target_type = arrow::struct_({arrow::field("x", arrow::int32())});
-    ASSERT_OK_AND_ASSIGN(auto pruned, NestedProjectionUtils::PruneArray(struct_array, target_type));
-
-    ASSERT_EQ(pruned->type()->num_fields(), 1);
-    ASSERT_EQ(pruned->type()->field(0)->name(), "x");
-    ASSERT_EQ(pruned->length(), 3);
-}
-
-TEST(NestedProjectionUtilsTest, PruneArray_IdenticalType) {
-    arrow::Int32Builder builder;
-    ASSERT_TRUE(builder.AppendValues({10, 20}).ok());
-    std::shared_ptr<arrow::Array> array;
-    ASSERT_TRUE(builder.Finish(&array).ok());
-
-    ASSERT_OK_AND_ASSIGN(auto pruned, NestedProjectionUtils::PruneArray(array, arrow::int32()));
-    ASSERT_EQ(pruned.get(), array.get());  // Same pointer — no copy.
-}
-
 // ============== GetMapSelectedKeys ==============
 
 TEST(NestedProjectionUtilsTest, GetMapSelectedKeys_Present) {
diff --git a/src/paimon/format/avro/avro_file_batch_reader.cpp b/src/paimon/format/avro/avro_file_batch_reader.cpp
index 13833f97c..99bde55a1 100644
--- a/src/paimon/format/avro/avro_file_batch_reader.cpp
+++ b/src/paimon/format/avro/avro_file_batch_reader.cpp
@@ -33,6 +33,30 @@
 
 namespace paimon::avro {
 
+namespace {
+
+// Returns true if `type` is a STRUCT, LIST or MAP.
+bool IsNestedType(const std::shared_ptr<arrow::DataType>& type) {
+    const auto id = type->id();
+    return id == arrow::Type::STRUCT || id == arrow::Type::LIST || id == arrow::Type::MAP;
+}
+
+// Fails if a nested read field's type differs from the same-named file field's type.
+Status ValidateUnsupportedNestedProjection(const std::shared_ptr<arrow::Schema>& file_schema,
+                                           const std::shared_ptr<arrow::Schema>& read_schema) {
+    for (const auto& requested : read_schema->fields()) {
+        const auto stored = file_schema->GetFieldByName(requested->name());
+        const bool comparable = stored != nullptr && IsNestedType(requested->type());
+        if (comparable && !requested->type()->Equals(stored->type())) {
+            return Status::Invalid(
+                "SetReadSchema failed: avro reader does not support nested sub-field projection");
+        }
+    }
+    return Status::OK();
+}
+
+}  // namespace
+
 AvroFileBatchReader::AvroFileBatchReader(const std::shared_ptr<InputStream>& input_stream,
                                          const std::shared_ptr<::arrow::DataType>& file_data_type,
                                          std::unique_ptr<::avro::DataFileReaderBase>&& reader,
@@ -148,6 +172,7 @@ Status AvroFileBatchReader::SetReadSchema(::ArrowSchema* read_schema,
                                       arrow::ImportSchema(read_schema));
     PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Schema> file_schema,
                            ArrowUtils::DataTypeToSchema(file_data_type_));
+    PAIMON_RETURN_NOT_OK(ValidateUnsupportedNestedProjection(file_schema, arrow_read_schema));
     PAIMON_ASSIGN_OR_RAISE(read_fields_projection_,
                            CalculateReadFieldsProjection(file_schema, arrow_read_schema->fields()));
     array_builder_->Reset();
diff --git a/src/paimon/format/avro/avro_file_batch_reader_test.cpp b/src/paimon/format/avro/avro_file_batch_reader_test.cpp
index a8ed3bb6c..91c52f46b 100644
--- a/src/paimon/format/avro/avro_file_batch_reader_test.cpp
+++ b/src/paimon/format/avro/avro_file_batch_reader_test.cpp
@@ -196,6 +196,39 @@ TEST_F(AvroFileBatchReaderTest, TestReadAllTypes) {
     ASSERT_TRUE(expected_array->Equals(result_array)) << result_array->ToString();
 }
 
+TEST_F(AvroFileBatchReaderTest, TestSetReadSchemaRejectNestedSubFieldProjection) {
+    // Write a file whose struct column f1 holds both sub-fields a and b.
+    const auto full_f1 =
+        arrow::struct_({arrow::field("a", arrow::int32()), arrow::field("b", arrow::utf8())});
+    const auto file_type =
+        arrow::struct_({arrow::field("f0", arrow::int32()), arrow::field("f1", full_f1)});
+    auto rows = arrow::ipc::internal::json::ArrayFromJSON(file_type, R"([
+            [1, [10, "x"]],
+            [2, [20, "y"]]
+        ])")
+                    .ValueOrDie();
+    const std::string avro_path =
+        PathUtil::JoinPath(dir_->Str(), "nested_projection_unsupported.avro");
+    WriteData(rows, avro_path, /*compression=*/"null");
+
+    ASSERT_OK_AND_ASSIGN(auto reader_builder,
+                         file_format_->CreateReaderBuilder(/*batch_size=*/1024));
+    ASSERT_OK_AND_ASSIGN(std::shared_ptr<InputStream> in, fs_->Open(avro_path));
+    ASSERT_OK_AND_ASSIGN(auto batch_reader, reader_builder->Build(in));
+
+    // Ask for f1 with sub-field a only.
+    const auto pruned_f1 = arrow::struct_({arrow::field("a", arrow::int32())});
+    const auto requested = arrow::schema(
+        {arrow::field("f0", arrow::int32()), arrow::field("f1", pruned_f1)});
+    auto c_schema = std::make_unique<ArrowSchema>();
+    ASSERT_TRUE(arrow::ExportSchema(*requested, c_schema.get()).ok());
+
+    // SetReadSchema must reject the pruned struct type.
+    ASSERT_NOK_WITH_MSG(batch_reader->SetReadSchema(c_schema.get(), /*predicate=*/nullptr,
+                                                    /*selection_bitmap=*/std::nullopt),
+                        "does not support nested sub-field projection");
+}
+
 TEST_P(AvroFileBatchReaderTest, TestReadTimestampTypes) {
     auto enable_tz = GetParam();
     std::string timezone_str = enable_tz ? "Asia/Tokyo" : "Asia/Shanghai";
diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.cpp b/src/paimon/format/parquet/parquet_file_batch_reader.cpp
index 406ff018d..a726da4d3 100644
--- a/src/paimon/format/parquet/parquet_file_batch_reader.cpp
+++ b/src/paimon/format/parquet/parquet_file_batch_reader.cpp
@@ -79,78 +79,6 @@ std::shared_ptr<arrow::Field> FindMatchingReadField(
     return nullptr;
 }
 
-Result<std::shared_ptr<arrow::Array>> PruneArrayToReadType(
-    const std::shared_ptr<arrow::Array>& array,
-    const std::shared_ptr<arrow::DataType>& target_type) {
-    if (!array || array->type()->Equals(target_type)) {
-        return array;
-    }
-
-    switch (target_type->id()) {
-        case arrow::Type::STRUCT: {
-            auto struct_array = std::static_pointer_cast<arrow::StructArray>(array);
-            auto target_struct_type = std::static_pointer_cast<arrow::StructType>(target_type);
-            arrow::ArrayVector pruned_children;
-            arrow::FieldVector pruned_fields;
-            pruned_children.reserve(target_struct_type->num_fields());
-            pruned_fields.reserve(target_struct_type->num_fields());
-            for (const auto& target_field : target_struct_type->fields()) {
-                auto src_field =
-                    FindMatchingReadField(struct_array->type()->fields(), target_field);
-                if (!src_field) {
-                    return Status::Invalid(
-                        fmt::format("PruneArrayToReadType: field '{}' not found in struct array",
-                                    target_field->name()));
-                }
-                auto child = struct_array->GetFieldByName(src_field->name());
-                PAIMON_ASSIGN_OR_RAISE(auto pruned_child,
-                                       PruneArrayToReadType(child, target_field->type()));
-                pruned_children.push_back(std::move(pruned_child));
-                pruned_fields.push_back(src_field->WithType(pruned_children.back()->type()));
-            }
-            PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(
-                std::shared_ptr<arrow::StructArray> result_struct,
-                arrow::StructArray::Make(pruned_children, pruned_fields,
-                                         struct_array->null_bitmap(), struct_array->null_count(),
-                                         struct_array->offset()));
-            return std::static_pointer_cast<arrow::Array>(result_struct);
-        }
-
-        case arrow::Type::LIST: {
-            auto list_array = std::static_pointer_cast<arrow::ListArray>(array);
-            const auto& target_elem_type =
-                static_cast<const arrow::ListType&>(*target_type).value_type();
-            PAIMON_ASSIGN_OR_RAISE(auto pruned_values,
-                                   PruneArrayToReadType(list_array->values(), target_elem_type));
-            PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(
-                std::shared_ptr<arrow::ListArray> result_list,
-                arrow::ListArray::FromArrays(*list_array->offsets(), *pruned_values,
-                                             arrow::default_memory_pool(),
-                                             list_array->null_bitmap(), list_array->null_count()));
-            return std::static_pointer_cast<arrow::Array>(result_list);
-        }
-
-        case arrow::Type::MAP: {
-            auto map_array = std::static_pointer_cast<arrow::MapArray>(array);
-            const auto& target_map_type = static_cast<const arrow::MapType&>(*target_type);
-            PAIMON_ASSIGN_OR_RAISE(
-                auto pruned_keys,
-                PruneArrayToReadType(map_array->keys(), target_map_type.key_type()));
-            PAIMON_ASSIGN_OR_RAISE(
-                auto pruned_items,
-                PruneArrayToReadType(map_array->items(), target_map_type.item_type()));
-            PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(
-                std::shared_ptr<arrow::Array> result_map,
-                arrow::MapArray::FromArrays(map_array->offsets(), pruned_keys, pruned_items,
-                                            arrow::default_memory_pool()));
-            return result_map;
-        }
-
-        default:
-            return array;
-    }
-}
-
 }  // namespace
 
 ParquetFileBatchReader::ParquetFileBatchReader(
@@ -447,7 +375,6 @@ Result<BatchReader::ReadBatch> ParquetFileBatchReader::NextBatch() {
         }
         PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr<arrow::Array> array,
                                           batch->ToStructArray());
-        PAIMON_ASSIGN_OR_RAISE(array, PruneArrayToReadType(array, read_data_type_));
         PAIMON_ASSIGN_OR_RAISE(bool need_cast, ParquetTimestampConverter::NeedCastArrayForTimestamp(
                                                    array->type(), read_data_type_));
         if (need_cast) {

From e126e79e93869f24608671dc61eed27ae97d4f9b Mon Sep 17 00:00:00 2001
From: "yonghao.fyh" <yonghao.fyh@alibaba-inc.com>
Date: Tue, 16 Jun 2026 16:39:56 +0800
Subject: [PATCH 10/24] fix: resolve read schema fields by name and align nested field ids

---
 src/paimon/core/io/field_mapping_reader.cpp   |  21 +--
 .../core/io/field_mapping_reader_test.cpp     |  42 ------
 .../core/operation/internal_read_context.cpp  | 124 +++++++++++++++---
 .../core/operation/internal_read_context.h    |   8 ++
 .../operation/internal_read_context_test.cpp  |  28 ++++
 test/inte/nested_column_pruning_inte_test.cpp |  71 ++++------
 6 files changed, 167 insertions(+), 127 deletions(-)

diff --git a/src/paimon/core/io/field_mapping_reader.cpp b/src/paimon/core/io/field_mapping_reader.cpp
index cc1a161fa..86e6f47a5 100644
--- a/src/paimon/core/io/field_mapping_reader.cpp
+++ b/src/paimon/core/io/field_mapping_reader.cpp
@@ -67,24 +67,13 @@ FieldMappingReader::FieldMappingReader(int32_t field_count,
         if (non_partition_info_.cast_executors[i] != nullptr) {
             need_casting_ = true;
         }
-        // Always keep mapping enabled for nested fields so we can validate
-        // that format readers really honor pushed nested projections.
-        auto type_id = non_partition_info_.non_partition_read_schema[i].Type()->id();
-        if (type_id == arrow::Type::STRUCT || type_id == arrow::Type::LIST ||
-            type_id == arrow::Type::MAP) {
-            need_mapping_ = true;
-        }
         // Field name change (RENAME COLUMN) also requires mapping: data schema
         // carries the file's physical name while read schema carries the
         // post-rename logical name. If we skipped mapping, the inner reader's
         // batch would be passed through with the old physical name and the
         // consumer's name-based lookup against the read schema would fail.
-        // Nested type difference (nested column pruning) also requires mapping
-        // so we can validate that format readers honor the pushed read schema.
         if (non_partition_info_.non_partition_data_schema[i].Name() !=
-                non_partition_info_.non_partition_read_schema[i].Name() ||
-            !non_partition_info_.non_partition_data_schema[i].Type()->Equals(
-                non_partition_info_.non_partition_read_schema[i].Type())) {
+            non_partition_info_.non_partition_read_schema[i].Name()) {
             need_mapping_ = true;
         }
         // Map selected-keys metadata also requires mapping so that
@@ -318,14 +307,6 @@ Status FieldMappingReader::MappingFields(const std::shared_ptr<arrow::Array>& da
     for (size_t i = 0; i < idx_in_target_schema.size(); i++) {
         std::shared_ptr<arrow::Array> field_array = struct_array->field(i);
 
-        const std::shared_ptr<arrow::DataType>& target_type = read_fields_of_data_array[i].Type();
-        if (!field_array->type()->Equals(target_type)) {
-            return Status::Invalid(fmt::format(
-                "FieldMappingReader mapping failed: format reader returned type {} for field '{}' but expected {}. Nested sub-field projection must be handled by format SetReadSchema.",
-                field_array->type()->ToString(), read_fields_of_data_array[i].Name(),
-                target_type->ToString()));
-        }
-
         // Filter map entries by selected keys if metadata is present.
         if (field_array->type()->id() == arrow::Type::MAP) {
             std::set<std::string> selected_keys = NestedProjectionUtils::GetMapSelectedKeys(
diff --git a/src/paimon/core/io/field_mapping_reader_test.cpp b/src/paimon/core/io/field_mapping_reader_test.cpp
index 69359521a..fbde8812e 100644
--- a/src/paimon/core/io/field_mapping_reader_test.cpp
+++ b/src/paimon/core/io/field_mapping_reader_test.cpp
@@ -624,48 +624,6 @@ TEST_F(FieldMappingReaderTest, TestReadWithSchemaEvolutionPureRename) {
                 /*partition_keys=*/{}, BinaryRow::EmptyRow(), expected);
 }
 
-TEST_F(FieldMappingReaderTest, TestNestedProjectionMismatchShouldFailFast) {
-    // File data has full nested struct f1{a,b}.
-    std::vector<DataField> data_fields = {
-        DataField(0, arrow::field("f0", arrow::int32())),
-        DataField(1, arrow::field("f1", arrow::struct_({arrow::field("a", arrow::int32()),
-                                                          arrow::field("b", arrow::utf8())})))
-    };
-    auto data_schema = DataField::ConvertDataFieldsToArrowSchema(data_fields);
-    auto data_array = std::dynamic_pointer_cast<arrow::StructArray>(
-        arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(data_schema->fields()), R"([
-        [1, [10, "x"]],
-        [2, [20, "y"]]
-    ])")
-            .ValueOrDie());
-
-    // Read schema requests pruned nested struct f1{a}.
-    std::vector<DataField> read_fields = {
-        DataField(0, arrow::field("f0", arrow::int32())),
-        DataField(1, arrow::field("f1", arrow::struct_({arrow::field("a", arrow::int32())})))
-    };
-    auto read_schema = DataField::ConvertDataFieldsToArrowSchema(read_fields);
-
-    ASSERT_OK_AND_ASSIGN(auto mapping_builder,
-                         FieldMappingBuilder::Create(read_schema, /*partition_keys=*/{},
-                                                     /*predicate=*/nullptr));
-    ASSERT_OK_AND_ASSIGN(auto mapping, mapping_builder->CreateFieldMapping(data_schema));
-
-    // Mock reader ignores SetReadSchema and still returns full nested payload.
-    auto mock_reader = std::make_unique<MockFileBatchReader>(
-        data_array, data_array->type(), /*read_batch_size=*/10);
-
-    auto reader = std::make_shared<FieldMappingReader>(
-        /*field_count=*/read_schema->num_fields(), std::move(mock_reader), BinaryRow::EmptyRow(),
-        std::move(mapping), pool_);
-
-    auto result = ReadResultCollector::CollectResult(reader.get());
-    ASSERT_FALSE(result.ok());
-    ASSERT_NE(result.status().ToString().find("Nested sub-field projection must be handled"),
-              std::string::npos)
-        << result.status().ToString();
-}
-
 TEST_F(FieldMappingReaderTest, TestReadWithSchemaEvolutionWithRenameAndModifyTypeAndPredicate) {
     // field_0 and field_3 are rename and modify type
     // result is not filtered by predicate, as DOUBLE->STRING alter table does not support predicate
diff --git a/src/paimon/core/operation/internal_read_context.cpp b/src/paimon/core/operation/internal_read_context.cpp
index 1252be215..a94e935b5 100644
--- a/src/paimon/core/operation/internal_read_context.cpp
+++ b/src/paimon/core/operation/internal_read_context.cpp
@@ -17,10 +17,13 @@
 #include "paimon/core/operation/internal_read_context.h"
 
 #include <optional>
+#include <unordered_map>
 #include <utility>
 
+#include "arrow/api.h"
 #include "arrow/c/abi.h"
 #include "arrow/c/bridge.h"
+#include "fmt/format.h"
 #include "paimon/common/predicate/predicate_validator.h"
 #include "paimon/common/table/special_fields.h"
 #include "paimon/common/types/data_field.h"
@@ -30,6 +33,100 @@
 
 namespace paimon {
 
+// Returns the first entry of `fields` whose name equals `name`, or nullptr if none does.
+std::shared_ptr<arrow::Field> InternalReadContext::FindFieldByName(
+    const arrow::FieldVector& fields, const std::string& name) {
+    auto it = fields.begin();
+    while (it != fields.end() && (*it)->name() != name) {
+        ++it;
+    }
+    return it != fields.end() ? *it : nullptr;
+}
+
+// Returns aligned_field with read_field's metadata (minus DataField::FIELD_ID) merged in.
+std::shared_ptr<arrow::Field> InternalReadContext::MergeReadFieldMetadata(
+    const std::shared_ptr<arrow::Field>& aligned_field,
+    const std::shared_ptr<arrow::Field>& read_field) {
+    const auto& source = read_field->metadata();
+    if (!read_field->HasMetadata() || source == nullptr) {
+        return aligned_field;
+    }
+    std::unordered_map<std::string, std::string> extra;
+    source->ToUnorderedMap(&extra);
+    extra.erase(DataField::FIELD_ID);
+    return extra.empty() ? aligned_field
+                         : aligned_field->WithMergedMetadata(
+                               std::make_shared<arrow::KeyValueMetadata>(extra));
+}
+
+// Rebuilds read_field on top of table_field, recursing into STRUCT (children matched by name),
+// LIST and MAP. Each level keeps the table field, swaps in the aligned type and the read-side
+// name, then merges the read field's metadata. Fails on a type mismatch or unknown struct child.
+Result<std::shared_ptr<arrow::Field>> InternalReadContext::AlignReadFieldWithTableFieldIds(
+    const std::shared_ptr<arrow::Field>& read_field,
+    const std::shared_ptr<arrow::Field>& table_field) {
+    const auto& read_type = read_field->type();
+    const auto& table_type = table_field->type();
+    const auto type_mismatch = [&]() {
+        return Status::Invalid(fmt::format(
+            "Read schema field '{}' type {} does not match table field type {}",
+            read_field->name(), read_type->ToString(), table_type->ToString()));
+    };
+    // Common tail of every branch.
+    const auto finish = [&](const std::shared_ptr<arrow::DataType>& aligned_type) {
+        return MergeReadFieldMetadata(
+            table_field->WithType(aligned_type)->WithName(read_field->name()), read_field);
+    };
+    if (read_type->id() != table_type->id()) {
+        return type_mismatch();
+    }
+    switch (read_type->id()) {
+        case arrow::Type::STRUCT: {
+            const auto& table_children = table_type->fields();
+            arrow::FieldVector aligned_children;
+            aligned_children.reserve(read_type->num_fields());
+            for (const auto& read_child : read_type->fields()) {
+                auto table_child = FindFieldByName(table_children, read_child->name());
+                if (table_child == nullptr) {
+                    return Status::Invalid(fmt::format(
+                        "Read schema nested field '{}' does not exist in table field '{}'",
+                        read_child->name(), table_field->name()));
+                }
+                PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Field> aligned_child,
+                                       AlignReadFieldWithTableFieldIds(read_child, table_child));
+                aligned_children.push_back(aligned_child);
+            }
+            return finish(arrow::struct_(aligned_children));
+        }
+        case arrow::Type::LIST: {
+            const auto& read_list = static_cast<const arrow::ListType&>(*read_type);
+            const auto& table_list = static_cast<const arrow::ListType&>(*table_type);
+            PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Field> aligned_value,
+                                   AlignReadFieldWithTableFieldIds(read_list.value_field(),
+                                                                   table_list.value_field()));
+            return finish(arrow::list(aligned_value));
+        }
+        case arrow::Type::MAP: {
+            const auto& read_map = static_cast<const arrow::MapType&>(*read_type);
+            const auto& table_map = static_cast<const arrow::MapType&>(*table_type);
+            PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Field> aligned_key,
+                                   AlignReadFieldWithTableFieldIds(read_map.key_field(),
+                                                                   table_map.key_field()));
+            PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Field> aligned_item,
+                                   AlignReadFieldWithTableFieldIds(read_map.item_field(),
+                                                                   table_map.item_field()));
+            // Only the aligned key's type is forwarded; the item is forwarded as a whole field.
+            return finish(arrow::map(aligned_key->type(), aligned_item));
+        }
+        default:
+            // Any other type must equal the table type.
+            if (!read_type->Equals(table_type)) {
+                return type_mismatch();
+            }
+            return finish(read_type);
+    }
+}
+
 std::optional<DataField> InternalReadContext::TryResolveSpecialFieldById(
     int32_t field_id, const CoreOptions& core_options) {
     if (field_id == SpecialFields::ValueKind().Id()) {
@@ -93,29 +190,24 @@ Result<std::unique_ptr<InternalReadContext>> InternalReadContext::Create(
     // Priority: projected_arrow_schema > read_field_ids > read_field_names
     std::vector<DataField> read_data_fields;
     if (context->HasReadSchema()) {
-        // Nested column pruning path: user provided a projected C ArrowSchema
+        // Nested column pruning path: user provided a read C ArrowSchema
         // where STRUCT types may contain only a subset of sub-fields.
         // ImportSchema consumes the C schema — that's fine, it's one-shot usage.
-        PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr<arrow::Schema> projected_schema,
+        PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr<arrow::Schema> read_schema,
                                           arrow::ImportSchema(context->GetReadSchema()));
-        PAIMON_ASSIGN_OR_RAISE(read_data_fields,
-                               DataField::ConvertArrowSchemaToDataFields(projected_schema));
+        read_data_fields.reserve(read_schema->num_fields());
         // Align special-field validation with read_field_ids/read_field_names branches.
-        for (auto& field : read_data_fields) {
+        for (const auto& read_field : read_schema->fields()) {
             if (auto resolved_special_field =
-                    TryResolveSpecialFieldById(field.Id(), core_options)) {
-                field = *resolved_special_field;
+                    TryResolveSpecialFieldByName(read_field->name(), core_options)) {
+                read_data_fields.push_back(*resolved_special_field);
                 continue;
             }
-            if (SpecialFields::IsSpecialFieldName(field.Name())) {
-                if (auto resolved_special_field =
-                        TryResolveSpecialFieldByName(field.Name(), core_options)) {
-                    field = *resolved_special_field;
-                    continue;
-                }
-            }
-            PAIMON_ASSIGN_OR_RAISE([[maybe_unused]] DataField unused,
-                                   table_schema->GetField(field.Id()));
+            PAIMON_ASSIGN_OR_RAISE(DataField table_field, table_schema->GetField(read_field->name()));
+            PAIMON_ASSIGN_OR_RAISE(
+                std::shared_ptr<arrow::Field> aligned_field,
+                AlignReadFieldWithTableFieldIds(read_field, table_field.ArrowField()));
+            read_data_fields.emplace_back(table_field.Id(), aligned_field, table_field.Description());
         }
     } else if (!context->GetReadFieldIds().empty()) {
         read_data_fields.reserve(context->GetReadFieldIds().size());
diff --git a/src/paimon/core/operation/internal_read_context.h b/src/paimon/core/operation/internal_read_context.h
index f685137b6..54af862bc 100644
--- a/src/paimon/core/operation/internal_read_context.h
+++ b/src/paimon/core/operation/internal_read_context.h
@@ -116,6 +116,14 @@ class InternalReadContext {
                                                                const CoreOptions& core_options);
     static std::optional<DataField> TryResolveSpecialFieldByName(const std::string& name,
                                                                  const CoreOptions& core_options);
+    static std::shared_ptr<arrow::Field> FindFieldByName(const arrow::FieldVector& fields,
+                                                         const std::string& name);
+    static std::shared_ptr<arrow::Field> MergeReadFieldMetadata(
+        const std::shared_ptr<arrow::Field>& aligned_field,
+        const std::shared_ptr<arrow::Field>& read_field);
+    static Result<std::shared_ptr<arrow::Field>> AlignReadFieldWithTableFieldIds(
+        const std::shared_ptr<arrow::Field>& read_field,
+        const std::shared_ptr<arrow::Field>& table_field);
 
     std::shared_ptr<ReadContext> read_context_;
     std::shared_ptr<TableSchema> table_schema_;
diff --git a/src/paimon/core/operation/internal_read_context_test.cpp b/src/paimon/core/operation/internal_read_context_test.cpp
index 18d1bdc5f..e8699402d 100644
--- a/src/paimon/core/operation/internal_read_context_test.cpp
+++ b/src/paimon/core/operation/internal_read_context_test.cpp
@@ -236,4 +236,32 @@ TEST(InternalReadContext, TestReadWithProjectedSchemaAndSpecialFields) {
     }
 }
 
+TEST(InternalReadContext, TestReadWithProjectedSchemaWithoutFieldIds) {
+    // The read schema names f3 and f0 but carries no field-id metadata.
+    const auto f3 = arrow::field("f3", arrow::float64());
+    const auto f0 = arrow::field("f0", arrow::utf8());
+    ArrowSchema exported;
+    ASSERT_TRUE(arrow::ExportSchema(*arrow::schema({f3, f0}), &exported).ok());
+
+    const std::string table_path = paimon::test::GetDataDir() + "/orc/append_09.db/append_09";
+    ReadContextBuilder builder(table_path);
+    builder.SetReadSchema(&exported);
+    ASSERT_OK_AND_ASSIGN(auto built_context, builder.Finish());
+    std::shared_ptr<ReadContext> read_context = std::move(built_context);
+
+    SchemaManager schema_manager(std::make_shared<LocalFileSystem>(), read_context->GetPath());
+    ASSERT_OK_AND_ASSIGN(auto table_schema, schema_manager.ReadSchema(0));
+    ASSERT_OK_AND_ASSIGN(auto internal_context,
+                         InternalReadContext::Create(read_context, table_schema,
+                                                     table_schema->Options()));
+
+    // Expect the requested order with the ids 3 and 0 attached.
+    std::vector<DataField> expected_fields = {
+        DataField(3, f3),
+        DataField(0, f0),
+    };
+    auto expected_schema = DataField::ConvertDataFieldsToArrowSchema(expected_fields);
+    ASSERT_TRUE(internal_context->GetReadSchema()->Equals(expected_schema));
+}
+
 }  // namespace paimon::test
diff --git a/test/inte/nested_column_pruning_inte_test.cpp b/test/inte/nested_column_pruning_inte_test.cpp
index ac789c664..066af1cbd 100644
--- a/test/inte/nested_column_pruning_inte_test.cpp
+++ b/test/inte/nested_column_pruning_inte_test.cpp
@@ -26,7 +26,6 @@
 #include "arrow/c/bridge.h"
 #include "arrow/ipc/json_simple.h"
 #include "gtest/gtest.h"
-#include "paimon/common/table/special_fields.h"
 #include "paimon/common/types/data_field.h"
 #include "paimon/common/utils/path_util.h"
 #include "paimon/common/utils/string_utils.h"
@@ -63,17 +62,6 @@ class NestedColumnPruningInteTest : public ::testing::Test,
     }
 
  protected:
-    static std::shared_ptr<arrow::Field> AnnotateField(const std::shared_ptr<arrow::Field>& field,
-                                                       int32_t paimon_id) {
-        auto metadata =
-            arrow::KeyValueMetadata::Make({DataField::FIELD_ID}, {std::to_string(paimon_id)});
-        if (field->metadata()) {
-            auto merged = field->metadata()->Merge(*metadata);
-            return field->WithMetadata(merged);
-        }
-        return field->WithMetadata(metadata);
-    }
-
     std::string file_format_;
     std::string test_dir_;
     std::string table_path_;
@@ -126,13 +114,12 @@ TEST_P(NestedColumnPruningInteTest, PruneStructSubFields) {
     ASSERT_FALSE(data_splits.empty());
 
     // Build projected schema: only read f0 (full) and f1.a (sub-field of struct)
-    // Catalog assigns IDs: f0->0, f1->1, f1.a->2, f1.b->3, f1.c->4
     auto pruned_struct_type = arrow::struct_({
-        AnnotateField(arrow::field("a", arrow::int32()), 2),
+        arrow::field("a", arrow::int32()),
     });
     arrow::FieldVector projected_fields = {
-        AnnotateField(arrow::field("f0", arrow::int32()), 0),
-        AnnotateField(arrow::field("f1", pruned_struct_type), 1),
+        arrow::field("f0", arrow::int32()),
+        arrow::field("f1", pruned_struct_type),
     };
     auto projected_schema = arrow::schema(projected_fields);
 
@@ -217,10 +204,9 @@ TEST_P(NestedColumnPruningInteTest, PruneEntireStructField) {
                          helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt));
 
     // Only read f0 and f2, skip f1 entirely.
-    // IDs: f0->0, f1->1, f1.x->2, f1.y->3, f2->4
     arrow::FieldVector projected_fields = {
-        AnnotateField(arrow::field("f0", arrow::int32()), 0),
-        AnnotateField(arrow::field("f2", arrow::float64()), 4),
+        arrow::field("f0", arrow::int32()),
+        arrow::field("f2", arrow::float64()),
     };
     auto projected_schema = arrow::schema(projected_fields);
 
@@ -304,19 +290,16 @@ TEST_P(NestedColumnPruningInteTest, PruneDeepNestedStruct) {
     ASSERT_OK_AND_ASSIGN(auto data_splits,
                          helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt));
 
-    // Field IDs (assigned sequentially by catalog):
-    // f0->0, f1->1, f1.a->2, f1.inner->3, f1.inner.x->4, f1.inner.y->5
-    //
     // Projected: f0, f1{inner{x}} — skip f1.a and f1.inner.y
     auto pruned_inner = arrow::struct_({
-        AnnotateField(arrow::field("x", arrow::int64()), 4),
+        arrow::field("x", arrow::int64()),
     });
     auto pruned_outer = arrow::struct_({
-        AnnotateField(arrow::field("inner", pruned_inner), 3),
+        arrow::field("inner", pruned_inner),
     });
     arrow::FieldVector projected_fields = {
-        AnnotateField(arrow::field("f0", arrow::int32()), 0),
-        AnnotateField(arrow::field("f1", pruned_outer), 1),
+        arrow::field("f0", arrow::int32()),
+        arrow::field("f1", pruned_outer),
     };
     auto projected_schema = arrow::schema(projected_fields);
 
@@ -405,21 +388,18 @@ TEST_P(NestedColumnPruningInteTest, PruneNestedStructWithSpecialFields) {
     ASSERT_OK_AND_ASSIGN(auto data_splits,
                          helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt));
 
-    // Field IDs (assigned sequentially by catalog):
-    // f0->0, f1->1, f1.a->2, f1.inner->3, f1.inner.x->4, f1.inner.y->5
     // Projected: f0, f1{inner{x}}, _SEQUENCE_NUMBER, _ROW_ID
     auto pruned_inner = arrow::struct_({
-        AnnotateField(arrow::field("x", arrow::int64()), 4),
+        arrow::field("x", arrow::int64()),
     });
     auto pruned_outer = arrow::struct_({
-        AnnotateField(arrow::field("inner", pruned_inner), 3),
+        arrow::field("inner", pruned_inner),
     });
     arrow::FieldVector projected_fields = {
-        AnnotateField(arrow::field("f0", arrow::int32()), 0),
-        AnnotateField(arrow::field("f1", pruned_outer), 1),
-        AnnotateField(arrow::field("_SEQUENCE_NUMBER", arrow::int64()),
-                      SpecialFields::SequenceNumber().Id()),
-        AnnotateField(arrow::field("_ROW_ID", arrow::int64()), SpecialFields::RowId().Id()),
+        arrow::field("f0", arrow::int32()),
+        arrow::field("f1", pruned_outer),
+        arrow::field("_SEQUENCE_NUMBER", arrow::int64()),
+        arrow::field("_ROW_ID", arrow::int64()),
     };
     auto projected_schema = arrow::schema(projected_fields);
 
@@ -501,11 +481,8 @@ TEST_P(NestedColumnPruningInteTest, MapSelectedKeys) {
     auto selected_keys_metadata =
         arrow::KeyValueMetadata::Make({DataField::MAP_SELECTED_KEYS}, {"a,c"});
     arrow::FieldVector projected_fields = {
-        AnnotateField(arrow::field("f0", arrow::int32()), 0),
-        AnnotateField(arrow::field("f1", map_type), 1)
-            ->WithMetadata(AnnotateField(arrow::field("f1", map_type), 1)
-                               ->metadata()
-                               ->Merge(*selected_keys_metadata)),
+        arrow::field("f0", arrow::int32()),
+        arrow::field("f1", map_type)->WithMetadata(selected_keys_metadata),
     };
     auto projected_schema = arrow::schema(projected_fields);
 
@@ -597,23 +574,19 @@ TEST_P(NestedColumnPruningInteTest, PruneDeeperNestedStruct) {
     ASSERT_OK_AND_ASSIGN(auto data_splits,
                          helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt));
 
-    // Field IDs (assigned sequentially by catalog):
-    // f0->0, f1->1, f1.a->2, f1.inner1->3, f1.inner1.x->4, f1.inner1.inner2->5,
-    // f1.inner1.inner2.p->6, f1.inner1.inner2.q->7
-    //
     // Projected: f0, f1{inner1{inner2{p}}}
     auto pruned_inner2 = arrow::struct_({
-        AnnotateField(arrow::field("p", arrow::utf8()), 6),
+        arrow::field("p", arrow::utf8()),
     });
     auto pruned_inner1 = arrow::struct_({
-        AnnotateField(arrow::field("inner2", pruned_inner2), 5),
+        arrow::field("inner2", pruned_inner2),
     });
     auto pruned_outer = arrow::struct_({
-        AnnotateField(arrow::field("inner1", pruned_inner1), 3),
+        arrow::field("inner1", pruned_inner1),
     });
     arrow::FieldVector projected_fields = {
-        AnnotateField(arrow::field("f0", arrow::int32()), 0),
-        AnnotateField(arrow::field("f1", pruned_outer), 1),
+        arrow::field("f0", arrow::int32()),
+        arrow::field("f1", pruned_outer),
     };
     auto projected_schema = arrow::schema(projected_fields);
 

From 127269c2a14da87d6eb0c4948ee2d1fef34f7261 Mon Sep 17 00:00:00 2001
From: "yonghao.fyh" <yonghao.fyh@alibaba-inc.com>
Date: Tue, 16 Jun 2026 16:41:10 +0800
Subject: [PATCH 11/24] style: clang-format internal_read_context and related tests

---
 .../core/operation/internal_read_context.cpp  | 44 +++++++++----------
 .../operation/internal_read_context_test.cpp  |  6 +--
 .../avro/avro_file_batch_reader_test.cpp      |  7 ++-
 3 files changed, 28 insertions(+), 29 deletions(-)

diff --git a/src/paimon/core/operation/internal_read_context.cpp b/src/paimon/core/operation/internal_read_context.cpp
index a94e935b5..dbd24bdf9 100644
--- a/src/paimon/core/operation/internal_read_context.cpp
+++ b/src/paimon/core/operation/internal_read_context.cpp
@@ -33,8 +33,8 @@
 
 namespace paimon {
 
-std::shared_ptr<arrow::Field> InternalReadContext::FindFieldByName(
-    const arrow::FieldVector& fields, const std::string& name) {
+std::shared_ptr<arrow::Field> InternalReadContext::FindFieldByName(const arrow::FieldVector& fields,
+                                                                   const std::string& name) {
     for (const auto& field : fields) {
         if (field->name() == name) {
             return field;
@@ -64,9 +64,8 @@ Result<std::shared_ptr<arrow::Field>> InternalReadContext::AlignReadFieldWithTab
     const std::shared_ptr<arrow::Field>& table_field) {
     if (read_field->type()->id() != table_field->type()->id()) {
         return Status::Invalid(fmt::format(
-            "Read schema field '{}' type {} does not match table field type {}", 
-            read_field->name(), read_field->type()->ToString(),
-            table_field->type()->ToString()));
+            "Read schema field '{}' type {} does not match table field type {}", read_field->name(),
+            read_field->type()->ToString(), table_field->type()->ToString()));
     }
 
     auto type_id = read_field->type()->id();
@@ -78,9 +77,9 @@ Result<std::shared_ptr<arrow::Field>> InternalReadContext::AlignReadFieldWithTab
         for (const auto& read_child : read_struct->fields()) {
             auto table_child = FindFieldByName(table_struct->fields(), read_child->name());
             if (!table_child) {
-                return Status::Invalid(fmt::format(
-                    "Read schema nested field '{}' does not exist in table field '{}'", 
-                    read_child->name(), table_field->name()));
+                return Status::Invalid(
+                    fmt::format("Read schema nested field '{}' does not exist in table field '{}'",
+                                read_child->name(), table_field->name()));
             }
             PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Field> rebased_child,
                                    AlignReadFieldWithTableFieldIds(read_child, table_child));
@@ -94,9 +93,9 @@ Result<std::shared_ptr<arrow::Field>> InternalReadContext::AlignReadFieldWithTab
     if (type_id == arrow::Type::LIST) {
         auto read_list = std::static_pointer_cast<arrow::ListType>(read_field->type());
         auto table_list = std::static_pointer_cast<arrow::ListType>(table_field->type());
-        PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Field> rebased_value_field,
-                               AlignReadFieldWithTableFieldIds(read_list->value_field(),
-                                                               table_list->value_field()));
+        PAIMON_ASSIGN_OR_RAISE(
+            std::shared_ptr<arrow::Field> rebased_value_field,
+            AlignReadFieldWithTableFieldIds(read_list->value_field(), table_list->value_field()));
         auto rebased_type = arrow::list(rebased_value_field);
         auto aligned_field = table_field->WithType(rebased_type)->WithName(read_field->name());
         return MergeReadFieldMetadata(aligned_field, read_field);
@@ -105,12 +104,12 @@ Result<std::shared_ptr<arrow::Field>> InternalReadContext::AlignReadFieldWithTab
     if (type_id == arrow::Type::MAP) {
         auto read_map = std::static_pointer_cast<arrow::MapType>(read_field->type());
         auto table_map = std::static_pointer_cast<arrow::MapType>(table_field->type());
-        PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Field> rebased_key_field,
-                               AlignReadFieldWithTableFieldIds(read_map->key_field(),
-                                                               table_map->key_field()));
-        PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Field> rebased_item_field,
-                               AlignReadFieldWithTableFieldIds(read_map->item_field(),
-                                                               table_map->item_field()));
+        PAIMON_ASSIGN_OR_RAISE(
+            std::shared_ptr<arrow::Field> rebased_key_field,
+            AlignReadFieldWithTableFieldIds(read_map->key_field(), table_map->key_field()));
+        PAIMON_ASSIGN_OR_RAISE(
+            std::shared_ptr<arrow::Field> rebased_item_field,
+            AlignReadFieldWithTableFieldIds(read_map->item_field(), table_map->item_field()));
         auto rebased_type = arrow::map(rebased_key_field->type(), rebased_item_field);
         auto aligned_field = table_field->WithType(rebased_type)->WithName(read_field->name());
         return MergeReadFieldMetadata(aligned_field, read_field);
@@ -118,9 +117,8 @@ Result<std::shared_ptr<arrow::Field>> InternalReadContext::AlignReadFieldWithTab
 
     if (!read_field->type()->Equals(table_field->type())) {
         return Status::Invalid(fmt::format(
-            "Read schema field '{}' type {} does not match table field type {}", 
-            read_field->name(), read_field->type()->ToString(),
-            table_field->type()->ToString()));
+            "Read schema field '{}' type {} does not match table field type {}", read_field->name(),
+            read_field->type()->ToString(), table_field->type()->ToString()));
     }
 
     auto aligned_field = table_field->WithType(read_field->type())->WithName(read_field->name());
@@ -203,11 +201,13 @@ Result<std::unique_ptr<InternalReadContext>> InternalReadContext::Create(
                 read_data_fields.push_back(*resolved_special_field);
                 continue;
             }
-            PAIMON_ASSIGN_OR_RAISE(DataField table_field, table_schema->GetField(read_field->name()));
+            PAIMON_ASSIGN_OR_RAISE(DataField table_field,
+                                   table_schema->GetField(read_field->name()));
             PAIMON_ASSIGN_OR_RAISE(
                 std::shared_ptr<arrow::Field> aligned_field,
                 AlignReadFieldWithTableFieldIds(read_field, table_field.ArrowField()));
-            read_data_fields.emplace_back(table_field.Id(), aligned_field, table_field.Description());
+            read_data_fields.emplace_back(table_field.Id(), aligned_field,
+                                          table_field.Description());
         }
     } else if (!context->GetReadFieldIds().empty()) {
         read_data_fields.reserve(context->GetReadFieldIds().size());
diff --git a/src/paimon/core/operation/internal_read_context_test.cpp b/src/paimon/core/operation/internal_read_context_test.cpp
index e8699402d..2ccd9107e 100644
--- a/src/paimon/core/operation/internal_read_context_test.cpp
+++ b/src/paimon/core/operation/internal_read_context_test.cpp
@@ -252,9 +252,9 @@ TEST(InternalReadContext, TestReadWithProjectedSchemaWithoutFieldIds) {
     SchemaManager schema_manager(std::make_shared<LocalFileSystem>(), read_context->GetPath());
     ASSERT_OK_AND_ASSIGN(auto table_schema, schema_manager.ReadSchema(0));
 
-    ASSERT_OK_AND_ASSIGN(auto internal_context,
-                         InternalReadContext::Create(read_context, table_schema,
-                                                     table_schema->Options()));
+    ASSERT_OK_AND_ASSIGN(
+        auto internal_context,
+        InternalReadContext::Create(read_context, table_schema, table_schema->Options()));
 
     std::vector<DataField> expected_fields = {
         DataField(3, arrow::field("f3", arrow::float64())),
diff --git a/src/paimon/format/avro/avro_file_batch_reader_test.cpp b/src/paimon/format/avro/avro_file_batch_reader_test.cpp
index 91c52f46b..257f0c127 100644
--- a/src/paimon/format/avro/avro_file_batch_reader_test.cpp
+++ b/src/paimon/format/avro/avro_file_batch_reader_test.cpp
@@ -202,14 +202,13 @@ TEST_F(AvroFileBatchReaderTest, TestSetReadSchemaRejectNestedSubFieldProjection)
     arrow::FieldVector write_fields = {
         arrow::field("f0", arrow::int32()),
         arrow::field("f1", arrow::struct_({arrow::field("a", arrow::int32()),
-                                             arrow::field("b", arrow::utf8())}))};
+                                           arrow::field("b", arrow::utf8())}))};
     auto write_type = arrow::struct_(write_fields);
-    auto write_array =
-        arrow::ipc::internal::json::ArrayFromJSON(write_type, R"([
+    auto write_array = arrow::ipc::internal::json::ArrayFromJSON(write_type, R"([
             [1, [10, "x"]],
             [2, [20, "y"]]
         ])")
-            .ValueOrDie();
+                           .ValueOrDie();
     WriteData(write_array, file_path, /*compression=*/"null");
 
     ASSERT_OK_AND_ASSIGN(auto reader_builder,

From af9a74e931fe0bb1c49cd05382e4ff630214f8c1 Mon Sep 17 00:00:00 2001
From: "yonghao.fyh" <yonghao.fyh@alibaba-inc.com>
Date: Tue, 16 Jun 2026 16:50:52 +0800
Subject: [PATCH 12/24] test: use SetReadFieldNames in renamed-PK schema evolution test

---
 test/inte/scan_and_read_inte_test.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/inte/scan_and_read_inte_test.cpp b/test/inte/scan_and_read_inte_test.cpp
index aafb9c59c..ad3d8e7fb 100644
--- a/test/inte/scan_and_read_inte_test.cpp
+++ b/test/inte/scan_and_read_inte_test.cpp
@@ -2233,7 +2233,7 @@ TEST_P(ScanAndReadInteTest, TestPkSchemaEvolutionScanWithRenamedPkPredicate) {
 
     ReadContextBuilder read_context_builder(table_path);
     AddReadOptionsForPrefetch(&read_context_builder);
-    read_context_builder.SetReadSchema({"key1", "k", "key_2", "c", "d", "a", "key0", "e"});
+    read_context_builder.SetReadFieldNames({"key1", "k", "key_2", "c", "d", "a", "key0", "e"});
     ASSERT_OK_AND_ASSIGN(auto read_context, read_context_builder.Finish());
     ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context)));
     ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(result_plan->Splits()));

From d8cc5769904db1c38dc759db40aa94831b2436ad Mon Sep 17 00:00:00 2001
From: "yonghao.fyh" <yonghao.fyh@alibaba-inc.com>
Date: Tue, 16 Jun 2026 18:06:21 +0800
Subject: [PATCH 13/24] fix: align parquet batches to read schema order; drop avro nested projection check

---
 .../format/avro/avro_file_batch_reader.cpp    | 25 -----
 .../avro/avro_file_batch_reader_test.cpp      | 32 -------
 .../parquet/parquet_file_batch_reader.cpp     | 93 +++++++++++++++++--
 3 files changed, 86 insertions(+), 64 deletions(-)

diff --git a/src/paimon/format/avro/avro_file_batch_reader.cpp b/src/paimon/format/avro/avro_file_batch_reader.cpp
index 99bde55a1..13833f97c 100644
--- a/src/paimon/format/avro/avro_file_batch_reader.cpp
+++ b/src/paimon/format/avro/avro_file_batch_reader.cpp
@@ -33,30 +33,6 @@
 
 namespace paimon::avro {
 
-namespace {
-
-bool IsNestedType(const std::shared_ptr<arrow::DataType>& type) {
-    return type->id() == arrow::Type::STRUCT || type->id() == arrow::Type::LIST ||
-           type->id() == arrow::Type::MAP;
-}
-
-Status ValidateUnsupportedNestedProjection(const std::shared_ptr<arrow::Schema>& file_schema,
-                                           const std::shared_ptr<arrow::Schema>& read_schema) {
-    for (const auto& read_field : read_schema->fields()) {
-        auto file_field = file_schema->GetFieldByName(read_field->name());
-        if (!file_field) {
-            continue;
-        }
-        if (IsNestedType(read_field->type()) && !read_field->type()->Equals(file_field->type())) {
-            return Status::Invalid(
-                "SetReadSchema failed: avro reader does not support nested sub-field projection");
-        }
-    }
-    return Status::OK();
-}
-
-}  // namespace
-
 AvroFileBatchReader::AvroFileBatchReader(const std::shared_ptr<InputStream>& input_stream,
                                          const std::shared_ptr<::arrow::DataType>& file_data_type,
                                          std::unique_ptr<::avro::DataFileReaderBase>&& reader,
@@ -172,7 +148,6 @@ Status AvroFileBatchReader::SetReadSchema(::ArrowSchema* read_schema,
                                       arrow::ImportSchema(read_schema));
     PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Schema> file_schema,
                            ArrowUtils::DataTypeToSchema(file_data_type_));
-    PAIMON_RETURN_NOT_OK(ValidateUnsupportedNestedProjection(file_schema, arrow_read_schema));
     PAIMON_ASSIGN_OR_RAISE(read_fields_projection_,
                            CalculateReadFieldsProjection(file_schema, arrow_read_schema->fields()));
     array_builder_->Reset();
diff --git a/src/paimon/format/avro/avro_file_batch_reader_test.cpp b/src/paimon/format/avro/avro_file_batch_reader_test.cpp
index 257f0c127..a8ed3bb6c 100644
--- a/src/paimon/format/avro/avro_file_batch_reader_test.cpp
+++ b/src/paimon/format/avro/avro_file_batch_reader_test.cpp
@@ -196,38 +196,6 @@ TEST_F(AvroFileBatchReaderTest, TestReadAllTypes) {
     ASSERT_TRUE(expected_array->Equals(result_array)) << result_array->ToString();
 }
 
-TEST_F(AvroFileBatchReaderTest, TestSetReadSchemaRejectNestedSubFieldProjection) {
-    std::string file_path = PathUtil::JoinPath(dir_->Str(), "nested_projection_unsupported.avro");
-
-    arrow::FieldVector write_fields = {
-        arrow::field("f0", arrow::int32()),
-        arrow::field("f1", arrow::struct_({arrow::field("a", arrow::int32()),
-                                           arrow::field("b", arrow::utf8())}))};
-    auto write_type = arrow::struct_(write_fields);
-    auto write_array = arrow::ipc::internal::json::ArrayFromJSON(write_type, R"([
-            [1, [10, "x"]],
-            [2, [20, "y"]]
-        ])")
-                           .ValueOrDie();
-    WriteData(write_array, file_path, /*compression=*/"null");
-
-    ASSERT_OK_AND_ASSIGN(auto reader_builder,
-                         file_format_->CreateReaderBuilder(/*batch_size=*/1024));
-    ASSERT_OK_AND_ASSIGN(std::shared_ptr<InputStream> in, fs_->Open(file_path));
-    ASSERT_OK_AND_ASSIGN(auto batch_reader, reader_builder->Build(in));
-
-    arrow::FieldVector read_fields = {
-        arrow::field("f0", arrow::int32()),
-        arrow::field("f1", arrow::struct_({arrow::field("a", arrow::int32())}))};
-    auto read_schema = arrow::schema(read_fields);
-    std::unique_ptr<ArrowSchema> c_schema = std::make_unique<ArrowSchema>();
-    ASSERT_TRUE(arrow::ExportSchema(*read_schema, c_schema.get()).ok());
-
-    ASSERT_NOK_WITH_MSG(batch_reader->SetReadSchema(c_schema.get(), /*predicate=*/nullptr,
-                                                    /*selection_bitmap=*/std::nullopt),
-                        "does not support nested sub-field projection");
-}
-
 TEST_P(AvroFileBatchReaderTest, TestReadTimestampTypes) {
     auto enable_tz = GetParam();
     std::string timezone_str = enable_tz ? "Asia/Tokyo" : "Asia/Shanghai";
diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.cpp b/src/paimon/format/parquet/parquet_file_batch_reader.cpp
index a726da4d3..fcad8ae93 100644
--- a/src/paimon/format/parquet/parquet_file_batch_reader.cpp
+++ b/src/paimon/format/parquet/parquet_file_batch_reader.cpp
@@ -79,6 +79,77 @@ std::shared_ptr<arrow::Field> FindMatchingReadField(
     return nullptr;
 }
 
+/// Returns the position in `file_fields` that corresponds to `read_field`, or -1 if none does.
+/// An equal Paimon field id is tried first; an equal field name is the fallback.
+int32_t FindMatchingFileFieldIndex(const arrow::FieldVector& file_fields,
+                                   const std::shared_ptr<arrow::Field>& read_field) {
+    const auto field_count = static_cast<int32_t>(file_fields.size());
+    const int32_t wanted_id = NestedProjectionUtils::GetPaimonFieldId(read_field);
+    for (int32_t pos = 0; wanted_id != -1 && pos < field_count; ++pos) {
+        if (NestedProjectionUtils::GetPaimonFieldId(file_fields[pos]) == wanted_id) {
+            return pos;
+        }
+    }
+    for (int32_t pos = 0; pos < field_count; ++pos) {
+        if (file_fields[pos]->name() == read_field->name()) {
+            return pos;
+        }
+    }
+    return -1;
+}
+
+/// Reorders the columns of `batch` to follow the field order of `read_data_type`.
+/// Columns are matched by top-level field name; a batch already in order is returned as-is.
+/// Returns Invalid if `read_data_type` is null or not a struct, if the column count differs
+/// from the read field count, or if a read field has no same-named column in `batch`.
+Result<std::shared_ptr<arrow::RecordBatch>> AlignBatchToReadSchemaOrder(
+    const std::shared_ptr<arrow::RecordBatch>& batch,
+    const std::shared_ptr<arrow::DataType>& read_data_type) {
+    auto read_struct = std::dynamic_pointer_cast<arrow::StructType>(read_data_type);
+    if (!read_struct) {
+        return Status::Invalid(fmt::format("Read data type must be struct, got {}",
+                                           read_data_type ? read_data_type->ToString() : "null"));
+    }
+    if (batch->num_columns() != read_struct->num_fields()) {
+        return Status::Invalid(fmt::format(
+            "Batch column count {} does not match read schema field count {}", batch->num_columns(),
+            read_struct->num_fields()));
+    }
+
+    const std::shared_ptr<arrow::Schema>& batch_schema = batch->schema();
+    bool already_aligned = true;
+    for (int32_t i = 0; already_aligned && i < batch->num_columns(); ++i) {
+        already_aligned = batch_schema->field(i)->name() == read_struct->field(i)->name();
+    }
+    if (already_aligned) {
+        return batch;
+    }
+
+    std::unordered_map<std::string, int32_t> batch_field_index;
+    batch_field_index.reserve(static_cast<size_t>(batch->num_columns()));
+    for (int32_t i = 0; i < batch->num_columns(); ++i) {
+        batch_field_index.emplace(batch_schema->field(i)->name(), i);
+    }
+
+    std::vector<std::shared_ptr<arrow::Array>> aligned_columns;
+    aligned_columns.reserve(static_cast<size_t>(batch->num_columns()));
+    arrow::FieldVector aligned_fields;
+    aligned_fields.reserve(static_cast<size_t>(batch->num_columns()));
+    for (const auto& read_field : read_struct->fields()) {
+        auto it = batch_field_index.find(read_field->name());
+        if (it == batch_field_index.end()) {
+            return Status::Invalid(fmt::format(
+                "Parquet batch column '{}' not found while aligning to read schema",
+                read_field->name()));
+        }
+        aligned_columns.push_back(batch->column(it->second));
+        // Keep the column's own field so the schema always describes the array's real type.
+        aligned_fields.push_back(batch_schema->field(it->second));
+    }
+    auto aligned_schema = arrow::schema(aligned_fields);
+    return arrow::RecordBatch::Make(aligned_schema, batch->num_rows(), aligned_columns);
+}
+
 }  // namespace
 
 ParquetFileBatchReader::ParquetFileBatchReader(
@@ -373,6 +444,7 @@ Result<BatchReader::ReadBatch> ParquetFileBatchReader::NextBatch() {
         if (batch == nullptr) {
             return BatchReader::MakeEofBatch();
         }
+        PAIMON_ASSIGN_OR_RAISE(batch, AlignBatchToReadSchemaOrder(batch, read_data_type_));
         PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr<arrow::Array> array,
                                           batch->ToStructArray());
         PAIMON_ASSIGN_OR_RAISE(bool need_cast, ParquetTimestampConverter::NeedCastArrayForTimestamp(
@@ -516,17 +588,24 @@ Result<std::vector<int32_t>> ParquetFileBatchReader::ComputeNestedColumnIndices(
     const std::shared_ptr<arrow::Schema>& read_schema,
     const std::shared_ptr<arrow::Schema>& file_schema) {
     std::vector<int32_t> indices;
-    int32_t leaf_index = 0;
+    std::vector<int32_t> file_field_leaf_starts;
+    file_field_leaf_starts.reserve(file_schema->num_fields());
 
+    int32_t file_leaf_index = 0;
     for (const auto& file_field : file_schema->fields()) {
-        std::shared_ptr<arrow::Field> read_field =
-            FindMatchingReadField(read_schema->fields(), file_field);
+        file_field_leaf_starts.push_back(file_leaf_index);
+        SkipLeafIndices(file_field->type(), &file_leaf_index);
+    }
 
-        if (read_field) {
-            CollectLeafIndices(read_field->type(), file_field->type(), &leaf_index, &indices);
-        } else {
-            SkipLeafIndices(file_field->type(), &leaf_index);
+    const auto& file_fields = file_schema->fields();
+    for (const auto& read_field : read_schema->fields()) {
+        int32_t file_field_idx = FindMatchingFileFieldIndex(file_fields, read_field);
+        if (file_field_idx < 0) {
+            continue;
         }
+        int32_t leaf_index = file_field_leaf_starts[file_field_idx];
+        CollectLeafIndices(read_field->type(), file_fields[file_field_idx]->type(), &leaf_index,
+                           &indices);
     }
     return indices;
 }

From 32d9de96819cd9925e5a48d64e31dd0172399549 Mon Sep 17 00:00:00 2001
From: "yonghao.fyh" <yonghao.fyh@alibaba-inc.com>
Date: Wed, 17 Jun 2026 09:26:51 +0800
Subject: [PATCH 14/24] fix: treat empty MAP_SELECTED_KEYS as filter-all and trim key tokens

---
 include/paimon/read_context.h                 |  3 +-
 .../common/memory/memory_segment_test.cpp     |  1 -
 .../core/utils/nested_projection_utils.cpp    | 22 ++++-
 .../utils/nested_projection_utils_test.cpp    | 23 ++++-
 test/inte/nested_column_pruning_inte_test.cpp | 87 +++++++++++++++++++
 5 files changed, 129 insertions(+), 7 deletions(-)

diff --git a/include/paimon/read_context.h b/include/paimon/read_context.h
index 377c30969..a25d00408 100644
--- a/include/paimon/read_context.h
+++ b/include/paimon/read_context.h
@@ -201,8 +201,7 @@ class PAIMON_EXPORT ReadContextBuilder {
     ///
     /// @param read_field_ids Vector of field ids to read from the table.
     /// @return Reference to this builder for method chaining.
-    /// @note Currently supports top-level field selection. Future versions may support
-    ///       nested field selection using ArrowSchema for more granular projection.
+    /// @note Currently supports top-level field selection.
     /// @note SetReadFieldIds() and SetReadFieldNames() are mutually exclusive.
     ///       Calling both will ignore the read schema set by SetReadFieldNames().
     ReadContextBuilder& SetReadFieldIds(const std::vector<int32_t>& read_field_ids);
diff --git a/src/paimon/common/memory/memory_segment_test.cpp b/src/paimon/common/memory/memory_segment_test.cpp
index c79a26d5c..c127efb52 100644
--- a/src/paimon/common/memory/memory_segment_test.cpp
+++ b/src/paimon/common/memory/memory_segment_test.cpp
@@ -556,7 +556,6 @@ TEST(MemorySegmentTest, TestDoubleAccess) {
 }
 
 // Bulk Byte Movements
-
 TEST(MemorySegmentTest, TestBulkByteAccess) {
     auto pool = paimon::GetDefaultPool();
     // test expected correct behavior with default offset / length
diff --git a/src/paimon/core/utils/nested_projection_utils.cpp b/src/paimon/core/utils/nested_projection_utils.cpp
index b82eb3742..435e8cd1f 100644
--- a/src/paimon/core/utils/nested_projection_utils.cpp
+++ b/src/paimon/core/utils/nested_projection_utils.cpp
@@ -115,9 +115,21 @@ std::set<std::string> NestedProjectionUtils::GetMapSelectedKeys(
     if (!get_result.ok()) {
         return result;
     }
-    const std::string& value = get_result.ValueUnsafe();
-    auto tokens = StringUtils::Split(value, ",");
-    result.insert(tokens.begin(), tokens.end());
+    std::string value = get_result.ValueUnsafe();
+    StringUtils::Trim(&value);
+    if (value.empty()) {
+        // Metadata is explicitly present but empty: treat as "filter all keys".
+        result.insert("");
+        return result;
+    }
+
+    auto tokens = StringUtils::Split(value, ",", /*ignore_empty=*/true);
+    for (auto& token : tokens) {
+        StringUtils::Trim(&token);
+        if (!token.empty()) {
+            result.insert(token);
+        }
+    }
     return result;
 }
 
@@ -126,6 +138,7 @@ Result<std::shared_ptr<arrow::Array>> NestedProjectionUtils::FilterMapArrayBySel
     if (selected_keys.empty() || !array || array->length() == 0) {
         return array;
     }
+    bool filter_all_keys = selected_keys.count("") > 0;
 
     auto map_array = std::static_pointer_cast<arrow::MapArray>(array);
     auto map_type = std::static_pointer_cast<arrow::MapType>(array->type());
@@ -145,6 +158,9 @@ Result<std::shared_ptr<arrow::Array>> NestedProjectionUtils::FilterMapArrayBySel
     std::vector<bool> keep(total_entries, false);
     int64_t kept_count = 0;
     for (int64_t i = 0; i < total_entries; ++i) {
+        if (filter_all_keys) {
+            continue;
+        }
         if (!keys_array->IsNull(i)) {
             std::string_view key_view = keys_array->GetView(i);
             std::string key_str(key_view.data(), key_view.size());
diff --git a/src/paimon/core/utils/nested_projection_utils_test.cpp b/src/paimon/core/utils/nested_projection_utils_test.cpp
index 00da2dd3e..bceed1a33 100644
--- a/src/paimon/core/utils/nested_projection_utils_test.cpp
+++ b/src/paimon/core/utils/nested_projection_utils_test.cpp
@@ -194,7 +194,18 @@ TEST(NestedProjectionUtilsTest, GetMapSelectedKeys_EmptyString) {
     auto field =
         arrow::field("m", arrow::map(arrow::utf8(), arrow::int32()), /*nullable=*/true, metadata);
     auto keys = NestedProjectionUtils::GetMapSelectedKeys(field);
-    ASSERT_TRUE(keys.empty());
+    ASSERT_EQ(keys.size(), 1);
+    ASSERT_TRUE(keys.count(""));
+}
+
+TEST(NestedProjectionUtilsTest, GetMapSelectedKeys_ContainsEmptyToken) {
+    auto metadata = arrow::KeyValueMetadata::Make({DataField::MAP_SELECTED_KEYS}, {"a, ,b"});
+    auto field =
+        arrow::field("m", arrow::map(arrow::utf8(), arrow::int32()), /*nullable=*/true, metadata);
+    auto keys = NestedProjectionUtils::GetMapSelectedKeys(field);
+    ASSERT_EQ(keys.size(), 2);  // the whitespace-only middle token must not become a key
+    ASSERT_TRUE(keys.count("a"));
+    ASSERT_TRUE(keys.count("b"));
 }
 
 TEST(NestedProjectionUtilsTest, GetMapSelectedKeys_Nullptr) {
@@ -280,6 +291,16 @@ TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_NoneKept) {
     ASSERT_EQ(result->value_length(0), 0);
 }
 
+TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_EmptyKeyMeansFilterAll) {
+    auto map_array = BuildStringInt32MapArray({{{"a", 1}, {"b", 2}}});
+    std::set<std::string> selected = {"a", ""};  // "" requests filter-all and overrides "a"
+    ASSERT_OK_AND_ASSIGN(auto filtered,
+                         NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, selected));
+    auto result = std::static_pointer_cast<arrow::MapArray>(filtered);
+    ASSERT_EQ(result->length(), 1);
+    ASSERT_EQ(result->value_length(0), 0);  // even the "a" entry is dropped
+}
+
 TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_WithNull) {
     // maps[0] = {"a":1}, maps[1] = null, maps[2] = {"b":2,"c":3}
     auto map_array =
diff --git a/test/inte/nested_column_pruning_inte_test.cpp b/test/inte/nested_column_pruning_inte_test.cpp
index 066af1cbd..9c5ae2fd6 100644
--- a/test/inte/nested_column_pruning_inte_test.cpp
+++ b/test/inte/nested_column_pruning_inte_test.cpp
@@ -525,6 +525,93 @@ TEST_P(NestedColumnPruningInteTest, MapSelectedKeys) {
     ASSERT_TRUE(is_equal);
 }
 
+// Test: MAP_SELECTED_KEYS metadata value is empty string, filter all map entries.
+TEST_P(NestedColumnPruningInteTest, MapSelectedKeysEmptyStringMeansFilterAll) {
+    // Table schema: f0 (int32), f1 (map<string, int32>)
+    auto map_type = arrow::map(arrow::utf8(), arrow::int32());
+    arrow::FieldVector table_fields = {
+        arrow::field("f0", arrow::int32()),
+        arrow::field("f1", map_type),
+    };
+    auto table_schema = arrow::schema(table_fields);
+
+    std::map<std::string, std::string> options = {
+        {Options::MANIFEST_FORMAT, "AVRO"},
+        {Options::FILE_FORMAT, StringUtils::ToUpperCase(file_format_)},
+        {Options::TARGET_FILE_SIZE, "1024"},
+        {Options::BUCKET, "-1"},
+    };
+
+    ASSERT_OK_AND_ASSIGN(
+        auto helper, TestHelper::Create(test_dir_, table_schema, /*partition_keys=*/{},
+                                        /*primary_keys=*/{}, options, /*is_streaming_mode=*/false));
+
+    // Write data: each row has a map with some entries.
+    std::string data = R"([
+        [1, [["a", 10], ["b", 20], ["c", 30]]],
+        [2, [["a", 100], ["c", 300]]],
+        [3, [["b", 200], ["c", 400], ["d", 500]]]
+    ])";
+    ASSERT_OK_AND_ASSIGN(auto batch,
+                         TestHelper::MakeRecordBatch(arrow::struct_(table_fields), data,
+                                                     /*partition_map=*/{}, /*bucket=*/0, {}));
+    int64_t commit_identifier = 0;
+    ASSERT_OK_AND_ASSIGN(auto commit_msgs,
+                         helper->WriteAndCommit(std::move(batch), commit_identifier++,
+                                                /*expected_commit_messages=*/std::nullopt));
+
+    // Scan to get splits
+    ASSERT_OK_AND_ASSIGN(auto data_splits,
+                         helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt));
+    ASSERT_FALSE(data_splits.empty());
+
+    // Build projected schema: read f0 and f1 with selected keys metadata set to empty string.
+    auto selected_keys_metadata =
+        arrow::KeyValueMetadata::Make({DataField::MAP_SELECTED_KEYS}, {""});
+    arrow::FieldVector projected_fields = {
+        arrow::field("f0", arrow::int32()),
+        arrow::field("f1", map_type)->WithMetadata(selected_keys_metadata),
+    };
+    auto projected_schema = arrow::schema(projected_fields);
+
+    ArrowSchema c_schema;
+    ASSERT_TRUE(arrow::ExportSchema(*projected_schema, &c_schema).ok());
+
+    // Read with projected schema
+    ReadContextBuilder read_context_builder(table_path_);
+    read_context_builder.SetOptions(options).SetReadSchema(&c_schema);
+    ASSERT_OK_AND_ASSIGN(auto read_context, read_context_builder.Finish());
+    ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context)));
+    ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(data_splits));
+    ASSERT_OK_AND_ASSIGN(auto read_result, ReadResultCollector::CollectResult(batch_reader.get()));
+
+    // Expected: all map entries are filtered out.
+    arrow::FieldVector expected_fields = {
+        arrow::field("_VALUE_KIND", arrow::int8()),
+        arrow::field("f0", arrow::int32()),
+        arrow::field("f1", arrow::map(arrow::utf8(), arrow::int32())),
+    };
+    auto expected_type = arrow::struct_(expected_fields);
+    std::string expected_data = R"([
+        [0, 1, []],
+        [0, 2, []],
+        [0, 3, []]
+    ])";
+    auto expected_array =
+        arrow::ipc::internal::json::ArrayFromJSON(expected_type, expected_data).ValueOrDie();
+    auto expected_chunked = std::make_shared<arrow::ChunkedArray>(expected_array);
+
+    arrow::EqualOptions equal_options = arrow::EqualOptions::Defaults();
+    bool is_equal = expected_chunked->Equals(read_result, equal_options.diff_sink(&std::cout));
+    if (!is_equal) {
+        std::cout << "[expected_type] " << expected_chunked->type()->ToString() << std::endl;
+        std::cout << "[actual_type]   " << read_result->type()->ToString() << std::endl;
+        std::cout << "[expected] " << expected_chunked->ToString() << std::endl;
+        std::cout << "[actual]   " << read_result->ToString() << std::endl;
+    }
+    ASSERT_TRUE(is_equal);
+}
+
 // Test: Deeper nested struct — prune sub-fields of a struct inside a struct inside another struct.
 TEST_P(NestedColumnPruningInteTest, PruneDeeperNestedStruct) {
     // Table schema: f0 (int32), f1 (struct{a: int32, inner1: struct{x: int64, inner2: struct{p:

From 52fc35d7f934c63ed7c37c2f3e9c551acbb67120 Mon Sep 17 00:00:00 2001
From: "yonghao.fyh" <yonghao.fyh@alibaba-inc.com>
Date: Wed, 17 Jun 2026 14:38:53 +0800
Subject: [PATCH 15/24] fix: reject nested sub-field projection in avro reader

---
 .../core/utils/nested_projection_utils.cpp    | 81 +++++++++++++++++++
 .../core/utils/nested_projection_utils.h      | 13 +++
 .../utils/nested_projection_utils_test.cpp    | 32 ++++++++
 .../format/avro/avro_file_batch_reader.cpp    |  8 ++
 .../avro/avro_file_batch_reader_test.cpp      | 33 ++++++++
 5 files changed, 167 insertions(+)

diff --git a/src/paimon/core/utils/nested_projection_utils.cpp b/src/paimon/core/utils/nested_projection_utils.cpp
index 435e8cd1f..5508aff91 100644
--- a/src/paimon/core/utils/nested_projection_utils.cpp
+++ b/src/paimon/core/utils/nested_projection_utils.cpp
@@ -32,6 +32,65 @@
 
 namespace paimon {
 
+std::shared_ptr<arrow::Field> NestedProjectionUtils::FindFieldByName(
+    const arrow::FieldVector& fields, const std::string& name) {
+    for (const auto& field : fields) {
+        if (field->name() == name) {
+            return field;
+        }
+    }
+    return nullptr;
+}
+
+Result<bool> NestedProjectionUtils::HasNestedSubfieldProjectionType(
+    const std::shared_ptr<arrow::DataType>& file_type,
+    const std::shared_ptr<arrow::DataType>& read_type) {
+    if (file_type->id() != read_type->id()) {
+        return false;
+    }
+
+    switch (file_type->id()) {
+        case arrow::Type::STRUCT: {
+            auto file_struct = std::static_pointer_cast<arrow::StructType>(file_type);
+            auto read_struct = std::static_pointer_cast<arrow::StructType>(read_type);
+            if (read_struct->num_fields() != file_struct->num_fields()) {
+                return true;
+            }
+            for (const auto& read_child : read_struct->fields()) {
+                auto file_child = FindFieldByName(file_struct->fields(), read_child->name());
+                if (!file_child) {
+                    return true;
+                }
+                PAIMON_ASSIGN_OR_RAISE(bool child_has_nested_projection,
+                                       HasNestedSubfieldProjectionType(file_child->type(),
+                                                                       read_child->type()));
+                if (child_has_nested_projection) {
+                    return true;
+                }
+            }
+            return false;
+        }
+        case arrow::Type::LIST: {
+            auto file_list = std::static_pointer_cast<arrow::ListType>(file_type);
+            auto read_list = std::static_pointer_cast<arrow::ListType>(read_type);
+            return HasNestedSubfieldProjectionType(file_list->value_type(), read_list->value_type());
+        }
+        case arrow::Type::MAP: {
+            auto file_map = std::static_pointer_cast<arrow::MapType>(file_type);
+            auto read_map = std::static_pointer_cast<arrow::MapType>(read_type);
+            PAIMON_ASSIGN_OR_RAISE(bool key_has_nested_projection,
+                                   HasNestedSubfieldProjectionType(file_map->key_type(),
+                                                                   read_map->key_type()));
+            if (key_has_nested_projection) {
+                return true;
+            }
+            return HasNestedSubfieldProjectionType(file_map->item_type(), read_map->item_type());
+        }
+        default:
+            return false;
+    }
+}
+
 Result<std::optional<std::shared_ptr<arrow::DataType>>> NestedProjectionUtils::PruneDataType(
     const std::shared_ptr<arrow::DataType>& read_type,
     const std::shared_ptr<arrow::DataType>& data_type) {
@@ -103,6 +162,28 @@ Result<std::optional<std::shared_ptr<arrow::DataType>>> NestedProjectionUtils::P
     }
 }
 
+Result<bool> NestedProjectionUtils::HasNestedSubfieldProjection(
+    const std::shared_ptr<arrow::Schema>& file_schema,
+    const std::shared_ptr<arrow::Schema>& read_schema) {
+    for (const auto& read_field : read_schema->fields()) {
+        auto file_field = file_schema->GetFieldByName(read_field->name());
+        if (!file_field) {
+            continue;
+        }
+        if (read_field->type()->id() == arrow::Type::STRUCT ||
+            read_field->type()->id() == arrow::Type::LIST ||
+            read_field->type()->id() == arrow::Type::MAP) {
+            PAIMON_ASSIGN_OR_RAISE(bool has_nested_projection,
+                                   HasNestedSubfieldProjectionType(file_field->type(),
+                                                                   read_field->type()));
+            if (has_nested_projection) {
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
 // Map selected-keys support
 
 std::set<std::string> NestedProjectionUtils::GetMapSelectedKeys(
diff --git a/src/paimon/core/utils/nested_projection_utils.h b/src/paimon/core/utils/nested_projection_utils.h
index ee0e53976..3da853087 100644
--- a/src/paimon/core/utils/nested_projection_utils.h
+++ b/src/paimon/core/utils/nested_projection_utils.h
@@ -77,6 +77,11 @@ class PAIMON_EXPORT NestedProjectionUtils {
         const std::shared_ptr<arrow::DataType>& read_type,
         const std::shared_ptr<arrow::DataType>& data_type);
 
+    /// Returns true if `read_schema` requests a nested sub-field projection against
+    /// `file_schema` (same top-level field, but nested STRUCT/LIST/MAP subtree is pruned).
+    static Result<bool> HasNestedSubfieldProjection(const std::shared_ptr<arrow::Schema>& file_schema,
+                                                    const std::shared_ptr<arrow::Schema>& read_schema);
+
     /// Parse the "paimon.map.selected-keys" metadata from an Arrow field.
     /// Returns an empty set if the metadata key is absent or the field is not a MAP.
     /// The metadata value must be a JSON array of strings, e.g. '["key1","key2"]'.
@@ -87,6 +92,14 @@ class PAIMON_EXPORT NestedProjectionUtils {
     /// `selected_keys` is empty.
     static Result<std::shared_ptr<arrow::Array>> FilterMapArrayBySelectedKeys(
         const std::shared_ptr<arrow::Array>& map_array, const std::set<std::string>& selected_keys);
+
+ private:
+    static std::shared_ptr<arrow::Field> FindFieldByName(const arrow::FieldVector& fields,
+                                                         const std::string& name);
+
+    static Result<bool> HasNestedSubfieldProjectionType(
+        const std::shared_ptr<arrow::DataType>& file_type,
+        const std::shared_ptr<arrow::DataType>& read_type);
 };
 
 }  // namespace paimon
diff --git a/src/paimon/core/utils/nested_projection_utils_test.cpp b/src/paimon/core/utils/nested_projection_utils_test.cpp
index bceed1a33..7bd177e7d 100644
--- a/src/paimon/core/utils/nested_projection_utils_test.cpp
+++ b/src/paimon/core/utils/nested_projection_utils_test.cpp
@@ -169,6 +169,38 @@ TEST(NestedProjectionUtilsTest, PruneDataType_MapWithStructValue) {
     ASSERT_EQ(map_type->item_type()->field(0)->name(), "a");
 }
 
+TEST(NestedProjectionUtilsTest, HasNestedSubfieldProjection_NoProjection) {
+    auto file_schema = arrow::schema({
+        MakeField("f0", arrow::int32(), 1),
+        MakeField("f1", arrow::struct_({MakeField("a", arrow::int32(), 2)}), 3),
+    });
+    auto read_schema = arrow::schema({
+        MakeField("f0", arrow::int32(), 1),
+        MakeField("f1", arrow::struct_({MakeField("a", arrow::int32(), 2)}), 3),
+    });
+    ASSERT_OK_AND_ASSIGN(auto has_nested_projection,
+                         NestedProjectionUtils::HasNestedSubfieldProjection(file_schema,
+                                                                            read_schema));
+    ASSERT_FALSE(has_nested_projection);
+}
+
+TEST(NestedProjectionUtilsTest, HasNestedSubfieldProjection_WithProjection) {
+    auto file_schema = arrow::schema({
+        MakeField("f0", arrow::int32(), 1),
+        MakeField("f1", arrow::struct_({MakeField("a", arrow::int32(), 2),
+                                         MakeField("b", arrow::utf8(), 4)}),
+                  3),
+    });
+    auto read_schema = arrow::schema({
+        MakeField("f0", arrow::int32(), 1),
+        MakeField("f1", arrow::struct_({MakeField("a", arrow::int32(), 2)}), 3),
+    });
+    ASSERT_OK_AND_ASSIGN(auto has_nested_projection,
+                         NestedProjectionUtils::HasNestedSubfieldProjection(file_schema,
+                                                                            read_schema));
+    ASSERT_TRUE(has_nested_projection);
+}
+
 // ============== GetMapSelectedKeys ==============
 
 TEST(NestedProjectionUtilsTest, GetMapSelectedKeys_Present) {
diff --git a/src/paimon/format/avro/avro_file_batch_reader.cpp b/src/paimon/format/avro/avro_file_batch_reader.cpp
index 13833f97c..8cce1667d 100644
--- a/src/paimon/format/avro/avro_file_batch_reader.cpp
+++ b/src/paimon/format/avro/avro_file_batch_reader.cpp
@@ -27,6 +27,7 @@
 #include "paimon/common/utils/arrow/mem_utils.h"
 #include "paimon/common/utils/arrow/status_utils.h"
 #include "paimon/common/utils/scope_guard.h"
+#include "paimon/core/utils/nested_projection_utils.h"
 #include "paimon/format/avro/avro_input_stream_impl.h"
 #include "paimon/format/avro/avro_schema_converter.h"
 #include "paimon/reader/batch_reader.h"
@@ -148,6 +149,13 @@ Status AvroFileBatchReader::SetReadSchema(::ArrowSchema* read_schema,
                                       arrow::ImportSchema(read_schema));
     PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Schema> file_schema,
                            ArrowUtils::DataTypeToSchema(file_data_type_));
+    PAIMON_ASSIGN_OR_RAISE(bool has_nested_projection,
+                           NestedProjectionUtils::HasNestedSubfieldProjection(file_schema,
+                                                                             arrow_read_schema));
+    if (has_nested_projection) {
+        return Status::Invalid(
+            "SetReadSchema failed: avro reader does not support nested sub-field projection");
+    }
     PAIMON_ASSIGN_OR_RAISE(read_fields_projection_,
                            CalculateReadFieldsProjection(file_schema, arrow_read_schema->fields()));
     array_builder_->Reset();
diff --git a/src/paimon/format/avro/avro_file_batch_reader_test.cpp b/src/paimon/format/avro/avro_file_batch_reader_test.cpp
index a8ed3bb6c..beaa96e96 100644
--- a/src/paimon/format/avro/avro_file_batch_reader_test.cpp
+++ b/src/paimon/format/avro/avro_file_batch_reader_test.cpp
@@ -295,6 +295,39 @@ TEST_F(AvroFileBatchReaderTest, TestReadMapTypes) {
     ASSERT_TRUE(expected_array->Equals(result_array));
 }
 
+TEST_F(AvroFileBatchReaderTest, TestSetReadSchemaRejectNestedSubFieldProjection) {
+    std::string path = PathUtil::JoinPath(dir_->Str(), "nested_projection_unsupported.avro");
+
+    arrow::FieldVector write_fields = {
+        arrow::field("f0", arrow::int32()),
+        arrow::field("f1", arrow::struct_({arrow::field("a", arrow::int32()),
+                                           arrow::field("b", arrow::utf8())}))};
+    auto write_type = arrow::struct_(write_fields);
+    auto write_array =
+        arrow::ipc::internal::json::ArrayFromJSON(write_type, R"([
+            [1, [10, "x"]],
+            [2, [20, "y"]]
+        ])")
+            .ValueOrDie();
+    WriteData(write_array, path, /*compression=*/"null");
+
+    ASSERT_OK_AND_ASSIGN(auto reader_builder,
+                         file_format_->CreateReaderBuilder(/*batch_size=*/1024));
+    ASSERT_OK_AND_ASSIGN(std::shared_ptr<InputStream> in, fs_->Open(path));
+    ASSERT_OK_AND_ASSIGN(auto batch_reader, reader_builder->Build(in));
+
+    arrow::FieldVector read_fields = {
+        arrow::field("f0", arrow::int32()),
+        arrow::field("f1", arrow::struct_({arrow::field("a", arrow::int32())}))};
+    auto read_schema = arrow::schema(read_fields);
+    std::unique_ptr<ArrowSchema> c_schema = std::make_unique<ArrowSchema>();
+    ASSERT_TRUE(arrow::ExportSchema(*read_schema, c_schema.get()).ok());
+
+    ASSERT_NOK_WITH_MSG(batch_reader->SetReadSchema(c_schema.get(), /*predicate=*/nullptr,
+                                                    /*selection_bitmap=*/std::nullopt),
+                        "does not support nested sub-field projection");
+}
+
 TEST_F(AvroFileBatchReaderTest, TestGetPreviousBatchFirstRowNumber) {
     std::string path = paimon::test::GetDataDir() +
                        "/avro/append_simple.db/"

From 222dd952f7dc825f8fe339ec94fc08e931411b53 Mon Sep 17 00:00:00 2001
From: "yonghao.fyh" <yonghao.fyh@alibaba-inc.com>
Date: Wed, 17 Jun 2026 15:17:08 +0800
Subject: [PATCH 16/24] fix: reject nested sub-field projection in lance reader

---
 .../format/lance/lance_file_batch_reader.cpp  | 15 ++++++--
 .../lance/lance_format_reader_writer_test.cpp | 36 +++++++++++++++++++
 2 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/src/paimon/format/lance/lance_file_batch_reader.cpp b/src/paimon/format/lance/lance_file_batch_reader.cpp
index 79afcacfc..db0d2d012 100644
--- a/src/paimon/format/lance/lance_file_batch_reader.cpp
+++ b/src/paimon/format/lance/lance_file_batch_reader.cpp
@@ -19,6 +19,7 @@
 #include "arrow/api.h"
 #include "paimon/common/metrics/metrics_impl.h"
 #include "paimon/common/utils/arrow/status_utils.h"
+#include "paimon/core/utils/nested_projection_utils.h"
 #include "paimon/format/lance/lance_utils.h"
 namespace paimon::lance {
 LanceFileBatchReader::LanceFileBatchReader(LanceFileReader* file_reader, int32_t batch_size,
@@ -66,9 +67,19 @@ Result<std::unique_ptr<::ArrowSchema>> LanceFileBatchReader::GetFileSchema() con
 Status LanceFileBatchReader::SetReadSchema(::ArrowSchema* read_schema,
                                            const std::shared_ptr<Predicate>& predicate,
                                            const std::optional<RoaringBitmap32>& selection_bitmap) {
-    PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr<arrow::Schema> arrow_schema,
+    PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr<arrow::Schema> arrow_read_schema,
                                       arrow::ImportSchema(read_schema));
-    read_field_names_ = arrow_schema->field_names();
+    PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<::ArrowSchema> c_file_schema, GetFileSchema());
+    PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr<arrow::Schema> file_schema,
+                                      arrow::ImportSchema(c_file_schema.get()));
+    PAIMON_ASSIGN_OR_RAISE(bool has_nested_projection,
+                           NestedProjectionUtils::HasNestedSubfieldProjection(file_schema,
+                                                                             arrow_read_schema));
+    if (has_nested_projection) {
+        return Status::Invalid(
+            "SetReadSchema failed: lance reader does not support nested sub-field projection");
+    }
+    read_field_names_ = arrow_read_schema->field_names();
     assert(!read_field_names_.empty());
     read_row_ids_.clear();
     if (selection_bitmap) {
diff --git a/src/paimon/format/lance/lance_format_reader_writer_test.cpp b/src/paimon/format/lance/lance_format_reader_writer_test.cpp
index 94a8aab96..da0ddaa13 100644
--- a/src/paimon/format/lance/lance_format_reader_writer_test.cpp
+++ b/src/paimon/format/lance/lance_format_reader_writer_test.cpp
@@ -247,6 +247,42 @@ TEST_F(LanceFileReaderWriterTest, TestNestedType) {
     }
 }
 
+TEST_F(LanceFileReaderWriterTest, TestRejectNestedSubFieldProjection) {
+    arrow::FieldVector fields = {
+        arrow::field("f0", arrow::int32()),
+        arrow::field("f1", arrow::struct_({arrow::field("sub_f0", arrow::boolean()),
+                                            arrow::field("sub_f1", arrow::int64())}))};
+    auto schema = arrow::schema(fields);
+    auto array = std::dynamic_pointer_cast<arrow::StructArray>(
+        arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), R"([
+        [1, [true, 2]],
+        [2, [false, 3]]
+    ])")
+            .ValueOrDie());
+    auto src_chunk_array = std::make_shared<arrow::ChunkedArray>(arrow::ArrayVector({array}));
+
+    auto dir = paimon::test::UniqueTestDirectory::Create();
+    ASSERT_TRUE(dir);
+    std::string file_path = dir->Str() + "/test.lance";
+    WriteFile(file_path, src_chunk_array, schema);
+
+    ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanceFileBatchReader> reader,
+                         LanceFileBatchReader::Create(file_path, /*batch_size=*/2,
+                                                      /*batch_readahead=*/2));
+
+    auto projected_fields = arrow::FieldVector{
+        arrow::field("f0", arrow::int32()),
+        arrow::field("f1", arrow::struct_({arrow::field("sub_f0", arrow::boolean())})),
+    };
+    auto projected_schema = arrow::schema(projected_fields);
+    ArrowSchema c_read_schema;
+    ASSERT_TRUE(arrow::ExportSchema(*projected_schema, &c_read_schema).ok());
+    ASSERT_NOK_WITH_MSG(
+        reader->SetReadSchema(&c_read_schema, /*predicate=*/nullptr,
+                              /*selection_bitmap=*/std::nullopt),
+        "SetReadSchema failed: lance reader does not support nested sub-field projection");
+}
+
 TEST_F(LanceFileReaderWriterTest, TestBulkData) {
     int64_t seed = DateTimeUtils::GetCurrentUTCTimeUs();
     std::srand(seed);

From 1c66125d4c98668be46120653ed8cdd452183baf Mon Sep 17 00:00:00 2001
From: "yonghao.fyh" <yonghao.fyh@alibaba-inc.com>
Date: Wed, 17 Jun 2026 15:21:31 +0800
Subject: [PATCH 17/24] style: reformat nested sub-field projection code

---
 .../core/utils/nested_projection_utils.cpp    | 21 ++++++++++---------
 .../core/utils/nested_projection_utils.h      |  5 +++--
 .../utils/nested_projection_utils_test.cpp    | 19 +++++++++--------
 .../format/avro/avro_file_batch_reader.cpp    |  6 +++---
 .../avro/avro_file_batch_reader_test.cpp      |  5 ++---
 .../format/lance/lance_file_batch_reader.cpp  |  6 +++---
 .../lance/lance_format_reader_writer_test.cpp |  2 +-
 .../parquet/parquet_file_batch_reader.cpp     | 16 +++++++-------
 8 files changed, 41 insertions(+), 39 deletions(-)

diff --git a/src/paimon/core/utils/nested_projection_utils.cpp b/src/paimon/core/utils/nested_projection_utils.cpp
index 5508aff91..f3b6d347b 100644
--- a/src/paimon/core/utils/nested_projection_utils.cpp
+++ b/src/paimon/core/utils/nested_projection_utils.cpp
@@ -61,9 +61,9 @@ Result<bool> NestedProjectionUtils::HasNestedSubfieldProjectionType(
                 if (!file_child) {
                     return true;
                 }
-                PAIMON_ASSIGN_OR_RAISE(bool child_has_nested_projection,
-                                       HasNestedSubfieldProjectionType(file_child->type(),
-                                                                       read_child->type()));
+                PAIMON_ASSIGN_OR_RAISE(
+                    bool child_has_nested_projection,
+                    HasNestedSubfieldProjectionType(file_child->type(), read_child->type()));
                 if (child_has_nested_projection) {
                     return true;
                 }
@@ -73,14 +73,15 @@ Result<bool> NestedProjectionUtils::HasNestedSubfieldProjectionType(
         case arrow::Type::LIST: {
             auto file_list = std::static_pointer_cast<arrow::ListType>(file_type);
             auto read_list = std::static_pointer_cast<arrow::ListType>(read_type);
-            return HasNestedSubfieldProjectionType(file_list->value_type(), read_list->value_type());
+            return HasNestedSubfieldProjectionType(file_list->value_type(),
+                                                   read_list->value_type());
         }
         case arrow::Type::MAP: {
             auto file_map = std::static_pointer_cast<arrow::MapType>(file_type);
             auto read_map = std::static_pointer_cast<arrow::MapType>(read_type);
-            PAIMON_ASSIGN_OR_RAISE(bool key_has_nested_projection,
-                                   HasNestedSubfieldProjectionType(file_map->key_type(),
-                                                                   read_map->key_type()));
+            PAIMON_ASSIGN_OR_RAISE(
+                bool key_has_nested_projection,
+                HasNestedSubfieldProjectionType(file_map->key_type(), read_map->key_type()));
             if (key_has_nested_projection) {
                 return true;
             }
@@ -173,9 +174,9 @@ Result<bool> NestedProjectionUtils::HasNestedSubfieldProjection(
         if (read_field->type()->id() == arrow::Type::STRUCT ||
             read_field->type()->id() == arrow::Type::LIST ||
             read_field->type()->id() == arrow::Type::MAP) {
-            PAIMON_ASSIGN_OR_RAISE(bool has_nested_projection,
-                                   HasNestedSubfieldProjectionType(file_field->type(),
-                                                                   read_field->type()));
+            PAIMON_ASSIGN_OR_RAISE(
+                bool has_nested_projection,
+                HasNestedSubfieldProjectionType(file_field->type(), read_field->type()));
             if (has_nested_projection) {
                 return true;
             }
diff --git a/src/paimon/core/utils/nested_projection_utils.h b/src/paimon/core/utils/nested_projection_utils.h
index 3da853087..18ed3643d 100644
--- a/src/paimon/core/utils/nested_projection_utils.h
+++ b/src/paimon/core/utils/nested_projection_utils.h
@@ -79,8 +79,9 @@ class PAIMON_EXPORT NestedProjectionUtils {
 
     /// Returns true if `read_schema` requests a nested sub-field projection against
     /// `file_schema` (same top-level field, but nested STRUCT/LIST/MAP subtree is pruned).
-    static Result<bool> HasNestedSubfieldProjection(const std::shared_ptr<arrow::Schema>& file_schema,
-                                                    const std::shared_ptr<arrow::Schema>& read_schema);
+    static Result<bool> HasNestedSubfieldProjection(
+        const std::shared_ptr<arrow::Schema>& file_schema,
+        const std::shared_ptr<arrow::Schema>& read_schema);
 
     /// Parse the "paimon.map.selected-keys" metadata from an Arrow field.
     /// Returns an empty set if the metadata key is absent or the field is not a MAP.
diff --git a/src/paimon/core/utils/nested_projection_utils_test.cpp b/src/paimon/core/utils/nested_projection_utils_test.cpp
index 7bd177e7d..61f8c90ad 100644
--- a/src/paimon/core/utils/nested_projection_utils_test.cpp
+++ b/src/paimon/core/utils/nested_projection_utils_test.cpp
@@ -178,26 +178,27 @@ TEST(NestedProjectionUtilsTest, HasNestedSubfieldProjection_NoProjection) {
         MakeField("f0", arrow::int32(), 1),
         MakeField("f1", arrow::struct_({MakeField("a", arrow::int32(), 2)}), 3),
     });
-    ASSERT_OK_AND_ASSIGN(auto has_nested_projection,
-                         NestedProjectionUtils::HasNestedSubfieldProjection(file_schema,
-                                                                            read_schema));
+    ASSERT_OK_AND_ASSIGN(
+        auto has_nested_projection,
+        NestedProjectionUtils::HasNestedSubfieldProjection(file_schema, read_schema));
     ASSERT_FALSE(has_nested_projection);
 }
 
 TEST(NestedProjectionUtilsTest, HasNestedSubfieldProjection_WithProjection) {
     auto file_schema = arrow::schema({
         MakeField("f0", arrow::int32(), 1),
-        MakeField("f1", arrow::struct_({MakeField("a", arrow::int32(), 2),
-                                         MakeField("b", arrow::utf8(), 4)}),
-                  3),
+        MakeField(
+            "f1",
+            arrow::struct_({MakeField("a", arrow::int32(), 2), MakeField("b", arrow::utf8(), 4)}),
+            3),
     });
     auto read_schema = arrow::schema({
         MakeField("f0", arrow::int32(), 1),
         MakeField("f1", arrow::struct_({MakeField("a", arrow::int32(), 2)}), 3),
     });
-    ASSERT_OK_AND_ASSIGN(auto has_nested_projection,
-                         NestedProjectionUtils::HasNestedSubfieldProjection(file_schema,
-                                                                            read_schema));
+    ASSERT_OK_AND_ASSIGN(
+        auto has_nested_projection,
+        NestedProjectionUtils::HasNestedSubfieldProjection(file_schema, read_schema));
     ASSERT_TRUE(has_nested_projection);
 }
 
diff --git a/src/paimon/format/avro/avro_file_batch_reader.cpp b/src/paimon/format/avro/avro_file_batch_reader.cpp
index 8cce1667d..02013aabb 100644
--- a/src/paimon/format/avro/avro_file_batch_reader.cpp
+++ b/src/paimon/format/avro/avro_file_batch_reader.cpp
@@ -149,9 +149,9 @@ Status AvroFileBatchReader::SetReadSchema(::ArrowSchema* read_schema,
                                       arrow::ImportSchema(read_schema));
     PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Schema> file_schema,
                            ArrowUtils::DataTypeToSchema(file_data_type_));
-    PAIMON_ASSIGN_OR_RAISE(bool has_nested_projection,
-                           NestedProjectionUtils::HasNestedSubfieldProjection(file_schema,
-                                                                             arrow_read_schema));
+    PAIMON_ASSIGN_OR_RAISE(
+        bool has_nested_projection,
+        NestedProjectionUtils::HasNestedSubfieldProjection(file_schema, arrow_read_schema));
     if (has_nested_projection) {
         return Status::Invalid(
             "SetReadSchema failed: avro reader does not support nested sub-field projection");
diff --git a/src/paimon/format/avro/avro_file_batch_reader_test.cpp b/src/paimon/format/avro/avro_file_batch_reader_test.cpp
index beaa96e96..41fcca91f 100644
--- a/src/paimon/format/avro/avro_file_batch_reader_test.cpp
+++ b/src/paimon/format/avro/avro_file_batch_reader_test.cpp
@@ -303,12 +303,11 @@ TEST_F(AvroFileBatchReaderTest, TestSetReadSchemaRejectNestedSubFieldProjection)
         arrow::field("f1", arrow::struct_({arrow::field("a", arrow::int32()),
                                            arrow::field("b", arrow::utf8())}))};
     auto write_type = arrow::struct_(write_fields);
-    auto write_array =
-        arrow::ipc::internal::json::ArrayFromJSON(write_type, R"([
+    auto write_array = arrow::ipc::internal::json::ArrayFromJSON(write_type, R"([
             [1, [10, "x"]],
             [2, [20, "y"]]
         ])")
-            .ValueOrDie();
+                           .ValueOrDie();
     WriteData(write_array, path, /*compression=*/"null");
 
     ASSERT_OK_AND_ASSIGN(auto reader_builder,
diff --git a/src/paimon/format/lance/lance_file_batch_reader.cpp b/src/paimon/format/lance/lance_file_batch_reader.cpp
index db0d2d012..1ae1deb84 100644
--- a/src/paimon/format/lance/lance_file_batch_reader.cpp
+++ b/src/paimon/format/lance/lance_file_batch_reader.cpp
@@ -72,9 +72,9 @@ Status LanceFileBatchReader::SetReadSchema(::ArrowSchema* read_schema,
     PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<::ArrowSchema> c_file_schema, GetFileSchema());
     PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr<arrow::Schema> file_schema,
                                       arrow::ImportSchema(c_file_schema.get()));
-    PAIMON_ASSIGN_OR_RAISE(bool has_nested_projection,
-                           NestedProjectionUtils::HasNestedSubfieldProjection(file_schema,
-                                                                             arrow_read_schema));
+    PAIMON_ASSIGN_OR_RAISE(
+        bool has_nested_projection,
+        NestedProjectionUtils::HasNestedSubfieldProjection(file_schema, arrow_read_schema));
     if (has_nested_projection) {
         return Status::Invalid(
             "SetReadSchema failed: lance reader does not support nested sub-field projection");
diff --git a/src/paimon/format/lance/lance_format_reader_writer_test.cpp b/src/paimon/format/lance/lance_format_reader_writer_test.cpp
index da0ddaa13..b1ad6be73 100644
--- a/src/paimon/format/lance/lance_format_reader_writer_test.cpp
+++ b/src/paimon/format/lance/lance_format_reader_writer_test.cpp
@@ -251,7 +251,7 @@ TEST_F(LanceFileReaderWriterTest, TestRejectNestedSubFieldProjection) {
     arrow::FieldVector fields = {
         arrow::field("f0", arrow::int32()),
         arrow::field("f1", arrow::struct_({arrow::field("sub_f0", arrow::boolean()),
-                                            arrow::field("sub_f1", arrow::int64())}))};
+                                           arrow::field("sub_f1", arrow::int64())}))};
     auto schema = arrow::schema(fields);
     auto array = std::dynamic_pointer_cast<arrow::StructArray>(
         arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), R"([
diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.cpp b/src/paimon/format/parquet/parquet_file_batch_reader.cpp
index fcad8ae93..023ec4c92 100644
--- a/src/paimon/format/parquet/parquet_file_batch_reader.cpp
+++ b/src/paimon/format/parquet/parquet_file_batch_reader.cpp
@@ -103,13 +103,13 @@ Result<std::shared_ptr<arrow::RecordBatch>> AlignBatchToReadSchemaOrder(
     const std::shared_ptr<arrow::DataType>& read_data_type) {
     auto read_struct = std::dynamic_pointer_cast<arrow::StructType>(read_data_type);
     if (!read_struct) {
-        return Status::Invalid(fmt::format("Read data type must be struct, got {}",
-                                           read_data_type->ToString()));
+        return Status::Invalid(
+            fmt::format("Read data type must be struct, got {}", read_data_type->ToString()));
     }
     if (batch->num_columns() != read_struct->num_fields()) {
-        return Status::Invalid(fmt::format(
-            "Batch column count {} does not match read schema field count {}", batch->num_columns(),
-            read_struct->num_fields()));
+        return Status::Invalid(
+            fmt::format("Batch column count {} does not match read schema field count {}",
+                        batch->num_columns(), read_struct->num_fields()));
     }
 
     bool already_aligned = true;
@@ -138,9 +138,9 @@ Result<std::shared_ptr<arrow::RecordBatch>> AlignBatchToReadSchemaOrder(
         const auto& read_field = read_struct->field(i);
         auto it = batch_field_index.find(read_field->name());
         if (it == batch_field_index.end()) {
-            return Status::Invalid(fmt::format(
-                "Parquet batch column '{}' not found while aligning to read schema", 
-                read_field->name()));
+            return Status::Invalid(
+                fmt::format("Parquet batch column '{}' not found while aligning to read schema",
+                            read_field->name()));
         }
         aligned_columns.push_back(batch->column(it->second));
         aligned_fields.push_back(read_field);

From 39b0f1d2a9a71a7dab63ceb45759e868ecd0611d Mon Sep 17 00:00:00 2001
From: "yonghao.fyh" <yonghao.fyh@alibaba-inc.com>
Date: Wed, 17 Jun 2026 15:25:38 +0800
Subject: [PATCH 18/24] fix: document comma-separated format for GetMapSelectedKeys

---
 src/paimon/core/utils/nested_projection_utils.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/paimon/core/utils/nested_projection_utils.h b/src/paimon/core/utils/nested_projection_utils.h
index 18ed3643d..2fe887383 100644
--- a/src/paimon/core/utils/nested_projection_utils.h
+++ b/src/paimon/core/utils/nested_projection_utils.h
@@ -84,8 +84,10 @@ class PAIMON_EXPORT NestedProjectionUtils {
         const std::shared_ptr<arrow::Schema>& read_schema);
 
     /// Parse the "paimon.map.selected-keys" metadata from an Arrow field.
-    /// Returns an empty set if the metadata key is absent or the field is not a MAP.
-    /// The metadata value must be a JSON array of strings, e.g. '["key1","key2"]'.
+    /// Returns an empty set if the field is null, has no metadata, or the metadata key is absent.
+    /// The metadata value is a comma-separated string, e.g. "key1,key2".
+    /// If the metadata key is present with an empty value, returns a set containing
+    /// an empty string sentinel ("") to mean "filter all keys".
     static std::set<std::string> GetMapSelectedKeys(const std::shared_ptr<arrow::Field>& field);
 
     /// Filter a MapArray so that only entries whose key is in `selected_keys` are kept.

From 6f495c6321ad5338234b32611e33e0dcdc9aad35 Mon Sep 17 00:00:00 2001
From: "yonghao.fyh" <yonghao.fyh@alibaba-inc.com>
Date: Wed, 17 Jun 2026 17:37:12 +0800
Subject: [PATCH 19/24] fix: keep selected map key order, reject duplicates, match by field id

---
 src/paimon/core/io/field_mapping_reader.cpp   |  14 +-
 .../core/operation/internal_read_context.cpp  |  14 +-
 .../core/operation/internal_read_context.h    |   2 -
 .../core/utils/nested_projection_utils.cpp    | 127 +++-----
 .../core/utils/nested_projection_utils.h      |  26 +-
 .../utils/nested_projection_utils_test.cpp    |  88 ++++--
 .../parquet/parquet_file_batch_reader.cpp     | 114 ++++---
 .../parquet/parquet_file_batch_reader.h       |  13 +-
 .../parquet_file_batch_reader_test.cpp        |  46 +++
 test/inte/nested_column_pruning_inte_test.cpp | 288 +++++++++++++++++-
 10 files changed, 552 insertions(+), 180 deletions(-)

diff --git a/src/paimon/core/io/field_mapping_reader.cpp b/src/paimon/core/io/field_mapping_reader.cpp
index 86e6f47a5..db65d8bad 100644
--- a/src/paimon/core/io/field_mapping_reader.cpp
+++ b/src/paimon/core/io/field_mapping_reader.cpp
@@ -80,8 +80,15 @@ FieldMappingReader::FieldMappingReader(int32_t field_count,
         // FilterMapArrayBySelectedKeys can filter out unwanted entries.
         if (!need_mapping_ &&
             non_partition_info_.non_partition_read_schema[i].Type()->id() == arrow::Type::MAP) {
-            std::set<std::string> selected_keys = NestedProjectionUtils::GetMapSelectedKeys(
+            auto selected_keys_or = NestedProjectionUtils::GetMapSelectedKeys(
                 non_partition_info_.non_partition_read_schema[i].ArrowField());
+            if (!selected_keys_or.ok()) {
+                // Keep mapping enabled so the parse error can be surfaced in
+                // MappingFields where Status can be returned.
+                need_mapping_ = true;
+                continue;
+            }
+            auto& selected_keys = selected_keys_or.value();
             if (!selected_keys.empty()) {
                 need_mapping_ = true;
             }
@@ -309,8 +316,9 @@ Status FieldMappingReader::MappingFields(const std::shared_ptr<arrow::Array>& da
 
         // Filter map entries by selected keys if metadata is present.
         if (field_array->type()->id() == arrow::Type::MAP) {
-            std::set<std::string> selected_keys = NestedProjectionUtils::GetMapSelectedKeys(
-                read_fields_of_data_array[i].ArrowField());
+            PAIMON_ASSIGN_OR_RAISE(
+                std::vector<std::string> selected_keys,
+                NestedProjectionUtils::GetMapSelectedKeys(read_fields_of_data_array[i].ArrowField()));
             if (!selected_keys.empty()) {
                 PAIMON_ASSIGN_OR_RAISE(field_array,
                                        NestedProjectionUtils::FilterMapArrayBySelectedKeys(
diff --git a/src/paimon/core/operation/internal_read_context.cpp b/src/paimon/core/operation/internal_read_context.cpp
index dbd24bdf9..53ebee558 100644
--- a/src/paimon/core/operation/internal_read_context.cpp
+++ b/src/paimon/core/operation/internal_read_context.cpp
@@ -29,20 +29,11 @@
 #include "paimon/common/types/data_field.h"
 #include "paimon/common/utils/arrow/status_utils.h"
 #include "paimon/core/schema/arrow_schema_validator.h"
+#include "paimon/core/utils/nested_projection_utils.h"
 #include "paimon/status.h"
 
 namespace paimon {
 
-std::shared_ptr<arrow::Field> InternalReadContext::FindFieldByName(const arrow::FieldVector& fields,
-                                                                   const std::string& name) {
-    for (const auto& field : fields) {
-        if (field->name() == name) {
-            return field;
-        }
-    }
-    return nullptr;
-}
-
 std::shared_ptr<arrow::Field> InternalReadContext::MergeReadFieldMetadata(
     const std::shared_ptr<arrow::Field>& aligned_field,
     const std::shared_ptr<arrow::Field>& read_field) {
@@ -75,7 +66,8 @@ Result<std::shared_ptr<arrow::Field>> InternalReadContext::AlignReadFieldWithTab
         arrow::FieldVector rebased_children;
         rebased_children.reserve(read_struct->num_fields());
         for (const auto& read_child : read_struct->fields()) {
-            auto table_child = FindFieldByName(table_struct->fields(), read_child->name());
+            auto table_child =
+                NestedProjectionUtils::FindFieldByName(table_struct->fields(), read_child->name());
             if (!table_child) {
                 return Status::Invalid(
                     fmt::format("Read schema nested field '{}' does not exist in table field '{}'",
diff --git a/src/paimon/core/operation/internal_read_context.h b/src/paimon/core/operation/internal_read_context.h
index 54af862bc..7b448946c 100644
--- a/src/paimon/core/operation/internal_read_context.h
+++ b/src/paimon/core/operation/internal_read_context.h
@@ -116,8 +116,6 @@ class InternalReadContext {
                                                                const CoreOptions& core_options);
     static std::optional<DataField> TryResolveSpecialFieldByName(const std::string& name,
                                                                  const CoreOptions& core_options);
-    static std::shared_ptr<arrow::Field> FindFieldByName(const arrow::FieldVector& fields,
-                                                         const std::string& name);
     static std::shared_ptr<arrow::Field> MergeReadFieldMetadata(
         const std::shared_ptr<arrow::Field>& aligned_field,
         const std::shared_ptr<arrow::Field>& read_field);
diff --git a/src/paimon/core/utils/nested_projection_utils.cpp b/src/paimon/core/utils/nested_projection_utils.cpp
index f3b6d347b..c24919ab4 100644
--- a/src/paimon/core/utils/nested_projection_utils.cpp
+++ b/src/paimon/core/utils/nested_projection_utils.cpp
@@ -18,6 +18,7 @@
 
 #include <set>
 #include <string>
+#include <unordered_set>
 #include <utility>
 #include <vector>
 
@@ -187,9 +188,9 @@ Result<bool> NestedProjectionUtils::HasNestedSubfieldProjection(
 
 // Map selected-keys support
 
-std::set<std::string> NestedProjectionUtils::GetMapSelectedKeys(
+Result<std::vector<std::string>> NestedProjectionUtils::GetMapSelectedKeys(
     const std::shared_ptr<arrow::Field>& field) {
-    std::set<std::string> result;
+    std::vector<std::string> result;
     if (!field || !field->HasMetadata() || !field->metadata()) {
         return result;
     }
@@ -200,27 +201,32 @@ std::set<std::string> NestedProjectionUtils::GetMapSelectedKeys(
     std::string value = get_result.ValueUnsafe();
     StringUtils::Trim(&value);
     if (value.empty()) {
-        // Metadata is explicitly present but empty: treat as "filter all keys".
-        result.insert("");
+        // Metadata is explicitly present but empty: select the empty-string key.
+        result.push_back("");
         return result;
     }
 
-    auto tokens = StringUtils::Split(value, ",", /*ignore_empty=*/true);
+    auto tokens = StringUtils::Split(value, ",", /*ignore_empty=*/false);
+    std::unordered_set<std::string> deduplicated;
+    deduplicated.reserve(tokens.size());
     for (auto& token : tokens) {
         StringUtils::Trim(&token);
-        if (!token.empty()) {
-            result.insert(token);
+        if (!deduplicated.insert(token).second) {
+            return Status::Invalid(
+                fmt::format("Duplicate selected key '{}' in {} metadata", token,
+                            DataField::MAP_SELECTED_KEYS));
         }
+        result.push_back(token);
     }
     return result;
 }
 
 Result<std::shared_ptr<arrow::Array>> NestedProjectionUtils::FilterMapArrayBySelectedKeys(
-    const std::shared_ptr<arrow::Array>& array, const std::set<std::string>& selected_keys) {
+    const std::shared_ptr<arrow::Array>& array,
+    const std::vector<std::string>& selected_keys) {
     if (selected_keys.empty() || !array || array->length() == 0) {
         return array;
     }
-    bool filter_all_keys = selected_keys.count("") > 0;
 
     auto map_array = std::static_pointer_cast<arrow::MapArray>(array);
     auto map_type = std::static_pointer_cast<arrow::MapType>(array->type());
@@ -233,94 +239,53 @@ Result<std::shared_ptr<arrow::Array>> NestedProjectionUtils::FilterMapArrayBySel
 
     auto keys_array = std::static_pointer_cast<arrow::StringArray>(map_array->keys());
     auto values_array = map_array->items();
-    int64_t total_entries = keys_array->length();
     int64_t num_maps = map_array->length();
 
-    // Mark which flat entries to keep
-    std::vector<bool> keep(total_entries, false);
-    int64_t kept_count = 0;
-    for (int64_t i = 0; i < total_entries; ++i) {
-        if (filter_all_keys) {
-            continue;
-        }
-        if (!keys_array->IsNull(i)) {
-            std::string_view key_view = keys_array->GetView(i);
-            std::string key_str(key_view.data(), key_view.size());
-            if (selected_keys.count(key_str) > 0) {
-                keep[i] = true;
-                ++kept_count;
-            }
+    std::unordered_set<std::string> deduplicated;
+    deduplicated.reserve(selected_keys.size());
+    for (const auto& selected_key : selected_keys) {
+        if (!deduplicated.insert(selected_key).second) {
+            return Status::Invalid(
+                fmt::format("Duplicate selected key '{}' in {} metadata", selected_key,
+                            DataField::MAP_SELECTED_KEYS));
         }
     }
 
-    if (kept_count == total_entries) {
-        return array;
-    }
-
-    // Collect kept slices as contiguous runs to build filtered key/value arrays
-    // via Slice + Concatenate (avoids arrow::compute::Take dependency).
-    arrow::ArrayVector key_slices;
-    arrow::ArrayVector value_slices;
-    key_slices.reserve(kept_count);
-    value_slices.reserve(kept_count);
-
-    std::vector<int32_t> new_offsets;
-    new_offsets.reserve(num_maps + 1);
-    int32_t running_offset = 0;
+    auto key_builder = std::make_shared<arrow::StringBuilder>();
+    PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(
+        std::unique_ptr<arrow::ArrayBuilder> value_builder_u,
+        arrow::MakeBuilder(values_array->type(), arrow::default_memory_pool()));
+    auto value_builder = std::shared_ptr<arrow::ArrayBuilder>(std::move(value_builder_u));
+    arrow::MapBuilder map_builder(arrow::default_memory_pool(), key_builder, value_builder);
 
     for (int64_t map_idx = 0; map_idx < num_maps; ++map_idx) {
-        new_offsets.push_back(running_offset);
         if (map_array->IsNull(map_idx)) {
+            PAIMON_RETURN_NOT_OK_FROM_ARROW(map_builder.AppendNull());
             continue;
         }
+        PAIMON_RETURN_NOT_OK_FROM_ARROW(map_builder.Append());
         int64_t start = map_array->value_offset(map_idx);
         int64_t end = map_array->value_offset(map_idx + 1);
-        // Collect contiguous runs of kept entries within this map
-        int64_t run_start = -1;
-        for (int64_t entry_idx = start; entry_idx <= end; ++entry_idx) {
-            bool should_keep = (entry_idx < end) && keep[entry_idx];
-            if (should_keep && run_start < 0) {
-                run_start = entry_idx;
-            } else if (!should_keep && run_start >= 0) {
-                int64_t run_len = entry_idx - run_start;
-                key_slices.push_back(keys_array->Slice(run_start, run_len));
-                value_slices.push_back(values_array->Slice(run_start, run_len));
-                running_offset += static_cast<int32_t>(run_len);
-                run_start = -1;
+
+        // Keep selected keys in the exact selected_keys order.
+        for (const auto& selected_key : selected_keys) {
+            for (int64_t entry_idx = start; entry_idx < end; ++entry_idx) {
+                if (keys_array->IsNull(entry_idx)) {
+                    continue;
+                }
+                std::string_view key_view = keys_array->GetView(entry_idx);
+                if (key_view == selected_key) {
+                    PAIMON_RETURN_NOT_OK_FROM_ARROW(
+                        key_builder->Append(key_view.data(), static_cast<int32_t>(key_view.size())));
+                    PAIMON_RETURN_NOT_OK_FROM_ARROW(
+                        value_builder->AppendArraySlice(*values_array->data(), entry_idx, 1));
+                }
             }
         }
     }
-    new_offsets.push_back(running_offset);
-
-    // Build filtered key/value arrays
-    std::shared_ptr<arrow::Array> filtered_keys;
-    std::shared_ptr<arrow::Array> filtered_values;
-    if (key_slices.empty()) {
-        // All entries filtered out — create empty arrays
-        filtered_keys = keys_array->Slice(0, 0);
-        filtered_values = values_array->Slice(0, 0);
-    } else if (key_slices.size() == 1) {
-        filtered_keys = key_slices[0];
-        filtered_values = value_slices[0];
-    } else {
-        PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(filtered_keys, arrow::Concatenate(key_slices));
-        PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(filtered_values, arrow::Concatenate(value_slices));
-    }
-
-    // Build new offsets array
-    arrow::Int32Builder offset_builder;
-    PAIMON_RETURN_NOT_OK_FROM_ARROW(
-        offset_builder.Reserve(static_cast<int64_t>(new_offsets.size())));
-    for (int32_t offset : new_offsets) {
-        offset_builder.UnsafeAppend(offset);
-    }
-    std::shared_ptr<arrow::Array> new_offsets_array;
-    PAIMON_RETURN_NOT_OK_FROM_ARROW(offset_builder.Finish(&new_offsets_array));
 
-    PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(
-        std::shared_ptr<arrow::Array> result_map,
-        arrow::MapArray::FromArrays(new_offsets_array, filtered_keys, filtered_values,
-                                    arrow::default_memory_pool(), map_array->null_bitmap()));
+    std::shared_ptr<arrow::Array> result_map;
+    PAIMON_RETURN_NOT_OK_FROM_ARROW(map_builder.Finish(&result_map));
     return result_map;
 }
 
diff --git a/src/paimon/core/utils/nested_projection_utils.h b/src/paimon/core/utils/nested_projection_utils.h
index 2fe887383..c99ec1a80 100644
--- a/src/paimon/core/utils/nested_projection_utils.h
+++ b/src/paimon/core/utils/nested_projection_utils.h
@@ -19,7 +19,6 @@
 #include <cstdint>
 #include <memory>
 #include <optional>
-#include <set>
 #include <string>
 #include <vector>
 
@@ -34,6 +33,10 @@ namespace paimon {
 class PAIMON_EXPORT NestedProjectionUtils {
  public:
     NestedProjectionUtils() = delete;
+    ~NestedProjectionUtils() = delete;
+
+    static std::shared_ptr<arrow::Field> FindFieldByName(const arrow::FieldVector& fields,
+                                                         const std::string& name);
 
     /// Extract the paimon field ID from an Arrow field's metadata ("paimon.id").
     /// Returns -1 if the metadata key is not present.
@@ -84,22 +87,23 @@ class PAIMON_EXPORT NestedProjectionUtils {
         const std::shared_ptr<arrow::Schema>& read_schema);
 
     /// Parse the "paimon.map.selected-keys" metadata from an Arrow field.
-    /// Returns an empty set if the field is null, has no metadata, or the metadata key is absent.
+    /// Returns an empty vector if the field is null, has no metadata, or the metadata key
+    /// is absent.
     /// The metadata value is a comma-separated string, e.g. "key1,key2".
-    /// If the metadata key is present with an empty value, returns a set containing
-    /// an empty string sentinel ("") to mean "filter all keys".
-    static std::set<std::string> GetMapSelectedKeys(const std::shared_ptr<arrow::Field>& field);
+    /// Empty tokens are preserved ("" means selecting empty-string keys), and duplicate
+    /// selected keys are rejected as invalid.
+    static Result<std::vector<std::string>> GetMapSelectedKeys(
+        const std::shared_ptr<arrow::Field>& field);
 
     /// Filter a MapArray so that only entries whose key is in `selected_keys` are kept.
-    /// Only supports string-keyed maps. Returns the original array unchanged if
-    /// `selected_keys` is empty.
+    /// Only supports string-keyed maps. The output map entry order follows
+    /// `selected_keys` order, and duplicate selected keys are rejected.
+    /// Returns the input array unchanged if `selected_keys` is empty or the array is null/empty.
     static Result<std::shared_ptr<arrow::Array>> FilterMapArrayBySelectedKeys(
-        const std::shared_ptr<arrow::Array>& map_array, const std::set<std::string>& selected_keys);
+        const std::shared_ptr<arrow::Array>& map_array,
+        const std::vector<std::string>& selected_keys);
 
  private:
-    static std::shared_ptr<arrow::Field> FindFieldByName(const arrow::FieldVector& fields,
-                                                         const std::string& name);
-
     static Result<bool> HasNestedSubfieldProjectionType(
         const std::shared_ptr<arrow::DataType>& file_type,
         const std::shared_ptr<arrow::DataType>& read_type);
diff --git a/src/paimon/core/utils/nested_projection_utils_test.cpp b/src/paimon/core/utils/nested_projection_utils_test.cpp
index 61f8c90ad..c8f3b097f 100644
--- a/src/paimon/core/utils/nested_projection_utils_test.cpp
+++ b/src/paimon/core/utils/nested_projection_utils_test.cpp
@@ -209,16 +209,16 @@ TEST(NestedProjectionUtilsTest, GetMapSelectedKeys_Present) {
         arrow::KeyValueMetadata::Make({DataField::MAP_SELECTED_KEYS}, {"key1,key2,key3"});
     auto field =
         arrow::field("m", arrow::map(arrow::utf8(), arrow::int32()), /*nullable=*/true, metadata);
-    auto keys = NestedProjectionUtils::GetMapSelectedKeys(field);
+    ASSERT_OK_AND_ASSIGN(auto keys, NestedProjectionUtils::GetMapSelectedKeys(field));
     ASSERT_EQ(keys.size(), 3);
-    ASSERT_TRUE(keys.count("key1"));
-    ASSERT_TRUE(keys.count("key2"));
-    ASSERT_TRUE(keys.count("key3"));
+    ASSERT_EQ(keys[0], "key1");
+    ASSERT_EQ(keys[1], "key2");
+    ASSERT_EQ(keys[2], "key3");
 }
 
 TEST(NestedProjectionUtilsTest, GetMapSelectedKeys_Absent) {
     auto field = arrow::field("m", arrow::map(arrow::utf8(), arrow::int32()));
-    auto keys = NestedProjectionUtils::GetMapSelectedKeys(field);
+    ASSERT_OK_AND_ASSIGN(auto keys, NestedProjectionUtils::GetMapSelectedKeys(field));
     ASSERT_TRUE(keys.empty());
 }
 
@@ -226,23 +226,32 @@ TEST(NestedProjectionUtilsTest, GetMapSelectedKeys_EmptyString) {
     auto metadata = arrow::KeyValueMetadata::Make({DataField::MAP_SELECTED_KEYS}, {""});
     auto field =
         arrow::field("m", arrow::map(arrow::utf8(), arrow::int32()), /*nullable=*/true, metadata);
-    auto keys = NestedProjectionUtils::GetMapSelectedKeys(field);
+    ASSERT_OK_AND_ASSIGN(auto keys, NestedProjectionUtils::GetMapSelectedKeys(field));
     ASSERT_EQ(keys.size(), 1);
-    ASSERT_TRUE(keys.count(""));
+    ASSERT_EQ(keys[0], "");
 }
 
 TEST(NestedProjectionUtilsTest, GetMapSelectedKeys_ContainsEmptyToken) {
     auto metadata = arrow::KeyValueMetadata::Make({DataField::MAP_SELECTED_KEYS}, {"a, ,b"});
     auto field =
         arrow::field("m", arrow::map(arrow::utf8(), arrow::int32()), /*nullable=*/true, metadata);
-    auto keys = NestedProjectionUtils::GetMapSelectedKeys(field);
-    ASSERT_EQ(keys.size(), 2);
-    ASSERT_TRUE(keys.count("a"));
-    ASSERT_TRUE(keys.count("b"));
+    ASSERT_OK_AND_ASSIGN(auto keys, NestedProjectionUtils::GetMapSelectedKeys(field));
+    ASSERT_EQ(keys.size(), 3);
+    ASSERT_EQ(keys[0], "a");
+    ASSERT_EQ(keys[1], "");
+    ASSERT_EQ(keys[2], "b");
+}
+
+TEST(NestedProjectionUtilsTest, GetMapSelectedKeys_DuplicateKey) {
+    auto metadata = arrow::KeyValueMetadata::Make({DataField::MAP_SELECTED_KEYS}, {"a,b,a"});
+    auto field =
+        arrow::field("m", arrow::map(arrow::utf8(), arrow::int32()), /*nullable=*/true, metadata);
+    auto result = NestedProjectionUtils::GetMapSelectedKeys(field);
+    ASSERT_FALSE(result.ok());
 }
 
 TEST(NestedProjectionUtilsTest, GetMapSelectedKeys_Nullptr) {
-    auto keys = NestedProjectionUtils::GetMapSelectedKeys(nullptr);
+    ASSERT_OK_AND_ASSIGN(auto keys, NestedProjectionUtils::GetMapSelectedKeys(nullptr));
     ASSERT_TRUE(keys.empty());
 }
 
@@ -278,7 +287,7 @@ TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_Basic) {
         {{"a", 10}, {"d", 40}},
     });
 
-    std::set<std::string> selected = {"a", "c"};
+    std::vector<std::string> selected = {"a", "c"};
     ASSERT_OK_AND_ASSIGN(auto filtered,
                          NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, selected));
 
@@ -298,7 +307,7 @@ TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_Basic) {
 
 TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_EmptySelectedKeys) {
     auto map_array = BuildStringInt32MapArray({{{"a", 1}}});
-    std::set<std::string> empty_keys;
+    std::vector<std::string> empty_keys;
     ASSERT_OK_AND_ASSIGN(
         auto filtered, NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, empty_keys));
     // Should return original array unchanged
@@ -307,16 +316,17 @@ TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_EmptySelectedKeys)
 
 TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_AllKept) {
     auto map_array = BuildStringInt32MapArray({{{"a", 1}, {"b", 2}}});
-    std::set<std::string> selected = {"a", "b"};
+    std::vector<std::string> selected = {"a", "b"};
     ASSERT_OK_AND_ASSIGN(auto filtered,
                          NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, selected));
-    // All entries match, should return original
-    ASSERT_EQ(filtered.get(), map_array.get());
+    auto result = std::static_pointer_cast<arrow::MapArray>(filtered);
+    ASSERT_EQ(result->length(), 1);
+    ASSERT_EQ(result->value_length(0), 2);
 }
 
 TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_NoneKept) {
     auto map_array = BuildStringInt32MapArray({{{"a", 1}, {"b", 2}}});
-    std::set<std::string> selected = {"x", "y"};
+    std::vector<std::string> selected = {"x", "y"};
     ASSERT_OK_AND_ASSIGN(auto filtered,
                          NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, selected));
     auto result = std::static_pointer_cast<arrow::MapArray>(filtered);
@@ -324,14 +334,18 @@ TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_NoneKept) {
     ASSERT_EQ(result->value_length(0), 0);
 }
 
-TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_EmptyKeyMeansFilterAll) {
-    auto map_array = BuildStringInt32MapArray({{{"a", 1}, {"b", 2}}});
-    std::set<std::string> selected = {"a", ""};
+TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_EmptyStringKeySelected) {
+    auto map_array = BuildStringInt32MapArray({{{"a", 1}, {"", 9}, {"b", 2}}});
+    std::vector<std::string> selected = {""};
     ASSERT_OK_AND_ASSIGN(auto filtered,
                          NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, selected));
     auto result = std::static_pointer_cast<arrow::MapArray>(filtered);
     ASSERT_EQ(result->length(), 1);
-    ASSERT_EQ(result->value_length(0), 0);
+    ASSERT_EQ(result->value_length(0), 1);
+    auto keys = std::static_pointer_cast<arrow::StringArray>(result->keys());
+    auto values = std::static_pointer_cast<arrow::Int32Array>(result->items());
+    ASSERT_EQ(keys->GetString(result->value_offset(0)), "");
+    ASSERT_EQ(values->Value(result->value_offset(0)), 9);
 }
 
 TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_WithNull) {
@@ -339,7 +353,7 @@ TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_WithNull) {
     auto map_array =
         BuildStringInt32MapArray({{{"a", 1}}, {}, {{"b", 2}, {"c", 3}}}, {true, false, true});
 
-    std::set<std::string> selected = {"a", "c"};
+    std::vector<std::string> selected = {"a", "c"};
     ASSERT_OK_AND_ASSIGN(auto filtered,
                          NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, selected));
     auto result = std::static_pointer_cast<arrow::MapArray>(filtered);
@@ -356,10 +370,36 @@ TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_WithNull) {
 
 TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_EmptyArray) {
     auto map_array = BuildStringInt32MapArray({});
-    std::set<std::string> selected = {"a"};
+    std::vector<std::string> selected = {"a"};
     ASSERT_OK_AND_ASSIGN(auto filtered,
                          NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, selected));
     ASSERT_EQ(filtered->length(), 0);
 }
 
+TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_SelectedOrderWins) {
+    auto map_array = BuildStringInt32MapArray({{{"a", 1}, {"b", 2}, {"c", 3}}});
+    std::vector<std::string> selected = {"c", "a"};
+
+    ASSERT_OK_AND_ASSIGN(auto filtered,
+                         NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, selected));
+    auto result = std::static_pointer_cast<arrow::MapArray>(filtered);
+    ASSERT_EQ(result->length(), 1);
+    ASSERT_EQ(result->value_length(0), 2);
+
+    auto keys = std::static_pointer_cast<arrow::StringArray>(result->keys());
+    auto values = std::static_pointer_cast<arrow::Int32Array>(result->items());
+    ASSERT_EQ(keys->GetString(result->value_offset(0)), "c");
+    ASSERT_EQ(values->Value(result->value_offset(0)), 3);
+    ASSERT_EQ(keys->GetString(result->value_offset(0) + 1), "a");
+    ASSERT_EQ(values->Value(result->value_offset(0) + 1), 1);
+}
+
+TEST(NestedProjectionUtilsTest, FilterMapArrayBySelectedKeys_DuplicateSelectedKeys) {
+    auto map_array = BuildStringInt32MapArray({{{"a", 1}, {"b", 2}}});
+    std::vector<std::string> selected = {"a", "a"};
+
+    auto result = NestedProjectionUtils::FilterMapArrayBySelectedKeys(map_array, selected);
+    ASSERT_FALSE(result.ok());
+}
+
 }  // namespace paimon::test
diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.cpp b/src/paimon/format/parquet/parquet_file_batch_reader.cpp
index 023ec4c92..1d819c108 100644
--- a/src/paimon/format/parquet/parquet_file_batch_reader.cpp
+++ b/src/paimon/format/parquet/parquet_file_batch_reader.cpp
@@ -39,6 +39,7 @@
 #include "paimon/common/metrics/metrics_impl.h"
 #include "paimon/common/utils/arrow/status_utils.h"
 #include "paimon/common/utils/options_utils.h"
+#include "paimon/common/utils/string_utils.h"
 #include "paimon/core/utils/nested_projection_utils.h"
 #include "paimon/format/parquet/parquet_field_id_converter.h"
 #include "paimon/format/parquet/parquet_format_defs.h"
@@ -60,12 +61,29 @@ namespace paimon::parquet {
 
 namespace {
 
+int32_t GetFieldIdForMatching(const std::shared_ptr<arrow::Field>& field) {
+    int32_t field_id = NestedProjectionUtils::GetPaimonFieldId(field);
+    if (field_id != -1) {
+        return field_id;
+    }
+    if (!field || !field->HasMetadata() || !field->metadata()) {
+        return -1;
+    }
+    auto get_result = field->metadata()->Get(ParquetFieldIdConverter::PARQUET_FIELD_ID);
+    if (!get_result.ok()) {
+        return -1;
+    }
+    std::optional<int32_t> parquet_field_id =
+        StringUtils::StringToValue<int32_t>(get_result.ValueUnsafe());
+    return parquet_field_id.value_or(-1);
+}
+
 std::shared_ptr<arrow::Field> FindMatchingReadField(
     const arrow::FieldVector& read_fields, const std::shared_ptr<arrow::Field>& file_field) {
-    int32_t file_field_id = NestedProjectionUtils::GetPaimonFieldId(file_field);
+    int32_t file_field_id = GetFieldIdForMatching(file_field);
     if (file_field_id != -1) {
         for (const auto& candidate : read_fields) {
-            if (NestedProjectionUtils::GetPaimonFieldId(candidate) == file_field_id) {
+            if (GetFieldIdForMatching(candidate) == file_field_id) {
                 return candidate;
             }
         }
@@ -81,10 +99,10 @@ std::shared_ptr<arrow::Field> FindMatchingReadField(
 
 int32_t FindMatchingFileFieldIndex(const arrow::FieldVector& file_fields,
                                    const std::shared_ptr<arrow::Field>& read_field) {
-    int32_t read_field_id = NestedProjectionUtils::GetPaimonFieldId(read_field);
+    int32_t read_field_id = GetFieldIdForMatching(read_field);
     if (read_field_id != -1) {
         for (int32_t i = 0; i < static_cast<int32_t>(file_fields.size()); ++i) {
-            if (NestedProjectionUtils::GetPaimonFieldId(file_fields[i]) == read_field_id) {
+            if (GetFieldIdForMatching(file_fields[i]) == read_field_id) {
                 return i;
             }
         }
@@ -112,9 +130,39 @@ Result<std::shared_ptr<arrow::RecordBatch>> AlignBatchToReadSchemaOrder(
                         batch->num_columns(), read_struct->num_fields()));
     }
 
-    bool already_aligned = true;
+    std::unordered_map<int32_t, int32_t> batch_field_id_index;
+    batch_field_id_index.reserve(static_cast<size_t>(batch->num_columns()));
+    std::unordered_map<std::string, int32_t> batch_field_index;
+    batch_field_index.reserve(static_cast<size_t>(batch->num_columns()));
     for (int32_t i = 0; i < batch->num_columns(); ++i) {
-        if (batch->schema()->field(i)->name() != read_struct->field(i)->name()) {
+        const auto& batch_field = batch->schema()->field(i);
+        int32_t batch_field_id = GetFieldIdForMatching(batch_field);
+        if (batch_field_id != -1) {
+            // Keep the first match to remain deterministic if duplicated ids exist.
+            batch_field_id_index.emplace(batch_field_id, i);
+        }
+        batch_field_index.emplace(batch_field->name(), i);
+    }
+
+    auto find_batch_field_index = [&](const std::shared_ptr<arrow::Field>& read_field) -> int32_t {
+        int32_t read_field_id = GetFieldIdForMatching(read_field);
+        if (read_field_id != -1) {  // a field id, when present, is tried before the name
+            auto id_it = batch_field_id_index.find(read_field_id);
+            if (id_it != batch_field_id_index.end()) {
+                return id_it->second;  // batch column index recorded for this id
+            }
+        }
+        auto name_it = batch_field_index.find(read_field->name());  // fall back to the field name
+        if (name_it != batch_field_index.end()) {
+            return name_it->second;
+        }
+        return -1;  // neither id nor name matched a batch column
+    };
+
+    bool already_aligned = true;
+    for (int32_t i = 0; i < read_struct->num_fields(); ++i) {
+        if (find_batch_field_index(read_struct->field(i)) != i ||
+            batch->schema()->field(i)->name() != read_struct->field(i)->name()) {
             already_aligned = false;
             break;
         }
@@ -123,12 +171,6 @@ Result<std::shared_ptr<arrow::RecordBatch>> AlignBatchToReadSchemaOrder(
         return batch;
     }
 
-    std::unordered_map<std::string, int32_t> batch_field_index;
-    batch_field_index.reserve(static_cast<size_t>(batch->num_columns()));
-    for (int32_t i = 0; i < batch->num_columns(); ++i) {
-        batch_field_index.emplace(batch->schema()->field(i)->name(), i);
-    }
-
     std::vector<std::shared_ptr<arrow::Array>> aligned_columns;
     aligned_columns.reserve(static_cast<size_t>(batch->num_columns()));
     arrow::FieldVector aligned_fields;
@@ -136,13 +178,13 @@ Result<std::shared_ptr<arrow::RecordBatch>> AlignBatchToReadSchemaOrder(
 
     for (int32_t i = 0; i < read_struct->num_fields(); ++i) {
         const auto& read_field = read_struct->field(i);
-        auto it = batch_field_index.find(read_field->name());
-        if (it == batch_field_index.end()) {
+        int32_t batch_idx = find_batch_field_index(read_field);
+        if (batch_idx < 0) {
             return Status::Invalid(
                 fmt::format("Parquet batch column '{}' not found while aligning to read schema",
                             read_field->name()));
         }
-        aligned_columns.push_back(batch->column(it->second));
+        aligned_columns.push_back(batch->column(batch_idx));
         aligned_fields.push_back(read_field);
     }
 
@@ -230,8 +272,7 @@ Status ParquetFileBatchReader::SetReadSchema(
             ParquetFieldIdConverter::GetPaimonIdsFromParquetIds(raw_file_schema));
 
         // Recursively match read_schema against file_schema using paimon field IDs.
-        // For STRUCT fields with nested projection, only the requested sub-fields'
-        // leaf columns are collected.
+        // STRUCT supports sub-field projection; LIST/MAP require exact type match.
         PAIMON_ASSIGN_OR_RAISE(std::vector<int32_t> column_indices,
                                ComputeNestedColumnIndices(read_schema, file_schema));
 
@@ -541,42 +582,46 @@ Result<::parquet::ArrowReaderProperties> ParquetFileBatchReader::CreateArrowRead
 
 // Nested column index computation
 
-void ParquetFileBatchReader::CollectLeafIndices(const std::shared_ptr<arrow::DataType>& read_type,
-                                                const std::shared_ptr<arrow::DataType>& file_type,
-                                                int32_t* leaf_index,
-                                                std::vector<int32_t>* indices) {
+Status ParquetFileBatchReader::CollectLeafIndices(  // return type changes from void to Status
+    const std::shared_ptr<arrow::DataType>& read_type,
+    const std::shared_ptr<arrow::DataType>& file_type, int32_t* leaf_index,
+    std::vector<int32_t>* indices) {
     if (file_type->id() == arrow::Type::STRUCT) {
         for (const auto& file_child : file_type->fields()) {
             std::shared_ptr<arrow::Field> read_child =
                 FindMatchingReadField(read_type->fields(), file_child);
             if (read_child) {
-                CollectLeafIndices(read_child->type(), file_child->type(), leaf_index, indices);
+                PAIMON_RETURN_NOT_OK(  // presumably returns early on a non-OK status
+                    CollectLeafIndices(read_child->type(), file_child->type(), leaf_index, indices));
             } else {
                 SkipLeafIndices(file_child->type(), leaf_index);
             }
         }
     } else if (file_type->id() == arrow::Type::LIST || file_type->id() == arrow::Type::MAP) {
-        // LIST/MAP: recurse into all structural children (offsets are not leaf
-        // columns in Parquet, only the value/key fields are).
-        for (int i = 0; i < file_type->num_fields(); i++) {
-            if (i < read_type->num_fields()) {
-                CollectLeafIndices(read_type->field(i)->type(), file_type->field(i)->type(),
-                                   leaf_index, indices);
-            } else {
-                SkipLeafIndices(file_type->field(i)->type(), leaf_index);
-            }
+        // Keep behavior aligned with ORC path: list/map inner partial projection
+        // is currently unsupported and should fail-fast.
+        if (!read_type->Equals(file_type)) {  // any difference in the list/map type is rejected
+            return Status::Invalid(fmt::format(
+                "Parquet does not support partial projection inside list/map: src {} vs target {}",
+                file_type->ToString(), read_type->ToString()));
+        }
+        for (int32_t i = 0; i < file_type->num_fields(); i++) {  // types are equal at this point
+            PAIMON_RETURN_NOT_OK(CollectLeafIndices(read_type->field(i)->type(),
+                                                    file_type->field(i)->type(), leaf_index,
+                                                    indices));
         }
     } else {
         // Leaf column — collect its index.
         indices->push_back((*leaf_index)++);
     }
+    return Status::OK();  // reached only when no nested call reported an error
 }
 
 void ParquetFileBatchReader::SkipLeafIndices(const std::shared_ptr<arrow::DataType>& file_type,
                                              int32_t* leaf_index) {
     if (file_type->id() == arrow::Type::STRUCT || file_type->id() == arrow::Type::LIST ||
         file_type->id() == arrow::Type::MAP) {
-        for (int i = 0; i < file_type->num_fields(); i++) {
+        for (int32_t i = 0; i < file_type->num_fields(); i++) {
             SkipLeafIndices(file_type->field(i)->type(), leaf_index);
         }
     } else {
@@ -604,8 +649,9 @@ Result<std::vector<int32_t>> ParquetFileBatchReader::ComputeNestedColumnIndices(
             continue;
         }
         int32_t leaf_index = file_field_leaf_starts[file_field_idx];
-        CollectLeafIndices(read_field->type(), file_fields[file_field_idx]->type(), &leaf_index,
-                           &indices);
+        PAIMON_RETURN_NOT_OK(CollectLeafIndices(read_field->type(),
+                                                file_fields[file_field_idx]->type(), &leaf_index,
+                                                &indices));
     }
     return indices;
 }
diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.h b/src/paimon/format/parquet/parquet_file_batch_reader.h
index 63b70ace0..cd6252234 100644
--- a/src/paimon/format/parquet/parquet_file_batch_reader.h
+++ b/src/paimon/format/parquet/parquet_file_batch_reader.h
@@ -152,18 +152,19 @@ class ParquetFileBatchReader : public PrefetchFileBatchReader {
 
     /// Recursively collect leaf column indices for the sub-fields in read_type
     /// that match file_type by paimon field ID. Unmatched sub-fields in file_type
-    /// have their leaf indices skipped.
-    static void CollectLeafIndices(const std::shared_ptr<arrow::DataType>& read_type,
-                                   const std::shared_ptr<arrow::DataType>& file_type,
-                                   int32_t* leaf_index, std::vector<int32_t>* indices);
+    /// have their leaf indices skipped. Partial projection inside LIST/MAP is
+    /// not supported and will return Invalid.
+    static Status CollectLeafIndices(const std::shared_ptr<arrow::DataType>& read_type,
+                                     const std::shared_ptr<arrow::DataType>& file_type,
+                                     int32_t* leaf_index, std::vector<int32_t>* indices);
 
     /// Skip over all leaf column indices of the given file_type without collecting.
     static void SkipLeafIndices(const std::shared_ptr<arrow::DataType>& file_type,
                                 int32_t* leaf_index);
 
     /// Compute leaf column indices by recursively matching read_schema against
-    /// file_schema using paimon field IDs. For STRUCT fields, only the requested
-    /// sub-fields are collected; unmatched ones are skipped.
+    /// file_schema using paimon field IDs. STRUCT supports sub-field projection
+    /// (unmatched sub-fields are skipped). LIST/MAP require exact type match.
     static Result<std::vector<int32_t>> ComputeNestedColumnIndices(
         const std::shared_ptr<arrow::Schema>& read_schema,
         const std::shared_ptr<arrow::Schema>& file_schema);
diff --git a/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp b/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp
index f0653fb22..eaa0032c9 100644
--- a/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp
+++ b/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp
@@ -40,6 +40,7 @@
 #include "paimon/common/utils/path_util.h"
 #include "paimon/defs.h"
 #include "paimon/format/parquet/parquet_format_defs.h"
+#include "paimon/format/parquet/parquet_field_id_converter.h"
 #include "paimon/format/parquet/parquet_format_writer.h"
 #include "paimon/fs/file_system.h"
 #include "paimon/fs/local/local_file_system.h"
@@ -379,6 +380,51 @@ TEST_F(ParquetFileBatchReaderTest, TestNextBatchWithOutofOrderTargetSchema) {
     ASSERT_TRUE(result_array->Equals(expected_chunk_array));
 }
 
+TEST_F(ParquetFileBatchReaderTest, TestNextBatchWithRenamedOutofOrderTargetSchema) {
+    auto write_field_a = arrow::field(  // file column "old_a", parquet field-id metadata "1"
+        "old_a", arrow::int32(),
+        arrow::KeyValueMetadata::Make({ParquetFieldIdConverter::PARQUET_FIELD_ID}, {"1"}));
+    auto write_field_b = arrow::field(  // file column "old_b", parquet field-id metadata "2"
+        "old_b", arrow::int64(),
+        arrow::KeyValueMetadata::Make({ParquetFieldIdConverter::PARQUET_FIELD_ID}, {"2"}));
+    arrow::FieldVector write_fields = {write_field_a, write_field_b};
+    auto write_schema = arrow::schema(write_fields);
+
+    auto write_array = std::dynamic_pointer_cast<arrow::StructArray>(
+        arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(write_fields), R"([
+        [1, 10],
+        [2, 20],
+        [3, 30]
+    ])")
+            .ValueOrDie());
+
+    WriteArray(file_path_, write_array, write_schema, /*write_batch_size=*/3,
+               /*enable_dictionary=*/false, /*max_row_group_length=*/3);
+
+    // Rename both fields and read them out of order: new_b(id=2), then new_a(id=1).
+    auto read_schema = DataField::ConvertDataFieldsToArrowSchema(
+        {DataField(2, arrow::field("new_b", arrow::int64())),
+         DataField(1, arrow::field("new_a", arrow::int32()))});
+
+    auto parquet_batch_reader =  // batch_size=2 is smaller than the 3 rows written
+        PrepareParquetFileBatchReader(file_path_, read_schema, /*predicate=*/nullptr,
+                                      /*selection_bitmap=*/std::nullopt, /*batch_size=*/2);
+
+    ASSERT_OK_AND_ASSIGN(
+        std::shared_ptr<arrow::ChunkedArray> result_array,
+        paimon::test::ReadResultCollector::CollectResult(parquet_batch_reader.get()));
+
+    auto expected_array = std::dynamic_pointer_cast<arrow::StructArray>(
+        arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(read_schema->fields()), R"([
+        [10, 1],
+        [20, 2],
+        [30, 3]
+    ])")
+            .ValueOrDie());
+    auto expected_chunk_array = std::make_shared<arrow::ChunkedArray>(expected_array);
+    ASSERT_TRUE(result_array->Equals(expected_chunk_array));  // old_b values come first
+}
+
 TEST_F(ParquetFileBatchReaderTest, TestNextBatchWithDictionary) {
     auto f0 = arrow::field("f0", arrow::list(arrow::utf8()));
     auto f1 = arrow::field("f1", arrow::map(arrow::utf8(), arrow::binary()));
diff --git a/test/inte/nested_column_pruning_inte_test.cpp b/test/inte/nested_column_pruning_inte_test.cpp
index 9c5ae2fd6..7aabcd906 100644
--- a/test/inte/nested_column_pruning_inte_test.cpp
+++ b/test/inte/nested_column_pruning_inte_test.cpp
@@ -30,6 +30,8 @@
 #include "paimon/common/utils/path_util.h"
 #include "paimon/common/utils/string_utils.h"
 #include "paimon/defs.h"
+#include "paimon/predicate/literal.h"
+#include "paimon/predicate/predicate_builder.h"
 #include "paimon/read_context.h"
 #include "paimon/reader/batch_reader.h"
 #include "paimon/result.h"
@@ -525,8 +527,8 @@ TEST_P(NestedColumnPruningInteTest, MapSelectedKeys) {
     ASSERT_TRUE(is_equal);
 }
 
-// Test: MAP_SELECTED_KEYS metadata value is empty string, filter all map entries.
-TEST_P(NestedColumnPruningInteTest, MapSelectedKeysEmptyStringMeansFilterAll) {
+// Test: MAP_SELECTED_KEYS metadata value is empty string, select empty-string map key.
+TEST_P(NestedColumnPruningInteTest, MapSelectedKeysEmptyStringKey) {
     // Table schema: f0 (int32), f1 (map<string, int32>)
     auto map_type = arrow::map(arrow::utf8(), arrow::int32());
     arrow::FieldVector table_fields = {
@@ -546,10 +548,10 @@ TEST_P(NestedColumnPruningInteTest, MapSelectedKeysEmptyStringMeansFilterAll) {
         auto helper, TestHelper::Create(test_dir_, table_schema, /*partition_keys=*/{},
                                         /*primary_keys=*/{}, options, /*is_streaming_mode=*/false));
 
-    // Write data: each row has a map with some entries.
+    // Write data: each row has a map that may contain empty-string key.
     std::string data = R"([
-        [1, [["a", 10], ["b", 20], ["c", 30]]],
-        [2, [["a", 100], ["c", 300]]],
+        [1, [["", 9], ["a", 10], ["c", 30]]],
+        [2, [["a", 100], ["", 99], ["c", 300]]],
         [3, [["b", 200], ["c", 400], ["d", 500]]]
     ])";
     ASSERT_OK_AND_ASSIGN(auto batch,
@@ -585,7 +587,7 @@ TEST_P(NestedColumnPruningInteTest, MapSelectedKeysEmptyStringMeansFilterAll) {
     ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(data_splits));
     ASSERT_OK_AND_ASSIGN(auto read_result, ReadResultCollector::CollectResult(batch_reader.get()));
 
-    // Expected: all map entries are filtered out.
+    // Expected: only empty-string key remains.
     arrow::FieldVector expected_fields = {
         arrow::field("_VALUE_KIND", arrow::int8()),
         arrow::field("f0", arrow::int32()),
@@ -593,8 +595,8 @@ TEST_P(NestedColumnPruningInteTest, MapSelectedKeysEmptyStringMeansFilterAll) {
     };
     auto expected_type = arrow::struct_(expected_fields);
     std::string expected_data = R"([
-        [0, 1, []],
-        [0, 2, []],
+        [0, 1, [["", 9]]],
+        [0, 2, [["", 99]]],
         [0, 3, []]
     ])";
     auto expected_array =
@@ -612,6 +614,89 @@ TEST_P(NestedColumnPruningInteTest, MapSelectedKeysEmptyStringMeansFilterAll) {
     ASSERT_TRUE(is_equal);
 }
 
+// Test: MAP_SELECTED_KEYS output map entry order should follow selected key order.
+TEST_P(NestedColumnPruningInteTest, MapSelectedKeysPreserveOrder) {
+    auto map_type = arrow::map(arrow::utf8(), arrow::int32());
+    arrow::FieldVector table_fields = {
+        arrow::field("f0", arrow::int32()),
+        arrow::field("f1", map_type),
+    };
+    auto table_schema = arrow::schema(table_fields);
+
+    std::map<std::string, std::string> options = {
+        {Options::MANIFEST_FORMAT, "AVRO"},
+        {Options::FILE_FORMAT, StringUtils::ToUpperCase(file_format_)},  // format under test
+        {Options::TARGET_FILE_SIZE, "1024"},
+        {Options::BUCKET, "-1"},
+    };
+
+    ASSERT_OK_AND_ASSIGN(
+        auto helper, TestHelper::Create(test_dir_, table_schema, /*partition_keys=*/{},
+                                        /*primary_keys=*/{}, options, /*is_streaming_mode=*/false));
+
+    // Rows 1 and 2 store "a" before "c" (reverse of the selected order); row 3 stores "c" first.
+    std::string data = R"([
+        [1, [["a", 10], ["b", 20], ["c", 30]]],
+        [2, [["a", 100], ["c", 300]]],
+        [3, [["c", 400], ["a", 500], ["d", 600]]]
+    ])";
+    ASSERT_OK_AND_ASSIGN(auto batch,
+                         TestHelper::MakeRecordBatch(arrow::struct_(table_fields), data,
+                                                     /*partition_map=*/{}, /*bucket=*/0, {}));
+    int64_t commit_identifier = 0;
+    ASSERT_OK_AND_ASSIGN(auto commit_msgs,
+                         helper->WriteAndCommit(std::move(batch), commit_identifier++,
+                                                /*expected_commit_messages=*/std::nullopt));
+
+    ASSERT_OK_AND_ASSIGN(auto data_splits,
+                         helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt));
+    ASSERT_FALSE(data_splits.empty());
+
+    // Query key order is c,a and output should follow this order.
+    auto selected_keys_metadata =  // keys are passed as one comma-separated metadata value
+        arrow::KeyValueMetadata::Make({DataField::MAP_SELECTED_KEYS}, {"c,a"});
+    arrow::FieldVector projected_fields = {
+        arrow::field("f0", arrow::int32()),
+        arrow::field("f1", map_type)->WithMetadata(selected_keys_metadata),
+    };
+    auto projected_schema = arrow::schema(projected_fields);
+
+    ArrowSchema c_schema;  // the read schema is handed to the builder by pointer
+    ASSERT_TRUE(arrow::ExportSchema(*projected_schema, &c_schema).ok());
+
+    ReadContextBuilder read_context_builder(table_path_);
+    read_context_builder.SetOptions(options).SetReadSchema(&c_schema);
+    ASSERT_OK_AND_ASSIGN(auto read_context, read_context_builder.Finish());
+    ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context)));
+    ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(data_splits));
+    ASSERT_OK_AND_ASSIGN(auto read_result, ReadResultCollector::CollectResult(batch_reader.get()));
+
+    arrow::FieldVector expected_fields = {
+        arrow::field("_VALUE_KIND", arrow::int8()),  // expected in the result, not in read schema
+        arrow::field("f0", arrow::int32()),
+        arrow::field("f1", arrow::map(arrow::utf8(), arrow::int32())),
+    };
+    auto expected_type = arrow::struct_(expected_fields);
+    std::string expected_data = R"([
+        [0, 1, [["c", 30], ["a", 10]]],
+        [0, 2, [["c", 300], ["a", 100]]],
+        [0, 3, [["c", 400], ["a", 500]]]
+    ])";
+    auto expected_array =
+        arrow::ipc::internal::json::ArrayFromJSON(expected_type, expected_data).ValueOrDie();
+    auto expected_chunked = std::make_shared<arrow::ChunkedArray>(expected_array);
+
+    arrow::EqualOptions equal_options = arrow::EqualOptions::Defaults();
+    bool is_equal = expected_chunked->Equals(read_result, equal_options.diff_sink(&std::cout));
+    if (!is_equal) {  // print both sides so a failure is diagnosable from the log
+        std::cout << "[expected_type] " << expected_chunked->type()->ToString() << std::endl;
+        std::cout << "[actual_type]   " << read_result->type()->ToString() << std::endl;
+        std::cout << "[expected] " << expected_chunked->ToString() << std::endl;
+        std::cout << "[actual]   " << read_result->ToString() << std::endl;
+    }
+    ASSERT_TRUE(is_equal);
+}
+
 // Test: Deeper nested struct — prune sub-fields of a struct inside a struct inside another struct.
 TEST_P(NestedColumnPruningInteTest, PruneDeeperNestedStruct) {
     // Table schema: f0 (int32), f1 (struct{a: int32, inner1: struct{x: int64, inner2: struct{p:
@@ -721,6 +806,193 @@ TEST_P(NestedColumnPruningInteTest, PruneDeeperNestedStruct) {
     ASSERT_TRUE(is_equal);
 }
 
+// Test: Parquet page-level filtering should work together with nested pruning.
+TEST_P(NestedColumnPruningInteTest, ParquetPageIndexFilterWithNestedPruning) {
+    if (file_format_ != "parquet") {  // the suite is also instantiated for "orc"
+        GTEST_SKIP() << "Parquet-only page-level filtering case";
+    }
+
+    auto nested_struct = arrow::struct_({
+        arrow::field("x", arrow::int64()),
+        arrow::field("y", arrow::utf8()),
+    });
+    arrow::FieldVector table_fields = {
+        arrow::field("f0", arrow::utf8()),
+        arrow::field("f1", nested_struct),
+    };
+    auto table_schema = arrow::schema(table_fields);
+
+    std::map<std::string, std::string> options = {
+        {Options::MANIFEST_FORMAT, "AVRO"},
+        {Options::FILE_FORMAT, "PARQUET"},
+        {Options::TARGET_FILE_SIZE, "1048576"},
+        {Options::BUCKET, "-1"},
+        {Options::WRITE_BATCH_SIZE, "1"},
+        {"parquet.page.size", "1"},  // presumably forces many tiny pages; TODO confirm
+        {"parquet.enable-dictionary", "false"},
+        {"parquet.write.enable-page-index", "true"},
+        {"parquet.read.enable-page-index-filter", "true"},
+    };
+
+    ASSERT_OK_AND_ASSIGN(
+        auto helper, TestHelper::Create(test_dir_, table_schema, /*partition_keys=*/{},
+                                        /*primary_keys=*/{}, options, /*is_streaming_mode=*/false));
+
+    std::string data = R"([
+        ["Alice", [100, "a"]],
+        ["Bob", [200, "b"]],
+        ["Cathy", [300, "c"]],
+        ["David", [400, "d"]]
+    ])";
+    ASSERT_OK_AND_ASSIGN(auto batch,
+                         TestHelper::MakeRecordBatch(arrow::struct_(table_fields), data,
+                                                     /*partition_map=*/{}, /*bucket=*/0, {}));
+    int64_t commit_identifier = 0;
+    ASSERT_OK_AND_ASSIGN(auto commit_msgs,
+                         helper->WriteAndCommit(std::move(batch), commit_identifier++,
+                                                /*expected_commit_messages=*/std::nullopt));
+
+    std::string literal_str = "Alice";
+    auto predicate = PredicateBuilder::Equal(  // f0 == "Alice"
+        /*field_index=*/0, /*field_name=*/"f0", FieldType::STRING,
+        Literal(FieldType::STRING, literal_str.data(), literal_str.size()));
+
+    ScanContextBuilder scan_context_builder(table_path_);
+    scan_context_builder.WithStreamingMode(true)
+        .SetOptions(options)
+        .AddOption(Options::SCAN_MODE, StartupMode::LatestFull().ToString())
+        .SetPredicate(predicate);  // the same predicate is also given to the reader below
+    ASSERT_OK_AND_ASSIGN(auto scan_context, scan_context_builder.Finish());
+    ASSERT_OK_AND_ASSIGN(auto table_scan, TableScan::Create(std::move(scan_context)));
+    ASSERT_OK_AND_ASSIGN(auto result_plan, table_scan->CreatePlan());
+    ASSERT_FALSE(result_plan->Splits().empty());
+
+    auto pruned_nested_struct = arrow::struct_({arrow::field("x", arrow::int64())});
+    arrow::FieldVector projected_fields = {
+        arrow::field("f0", arrow::utf8()),
+        arrow::field("f1", pruned_nested_struct),  // f1 reduced to its "x" sub-field
+    };
+    auto projected_schema = arrow::schema(projected_fields);
+    ArrowSchema c_schema;
+    ASSERT_TRUE(arrow::ExportSchema(*projected_schema, &c_schema).ok());
+
+    ReadContextBuilder read_context_builder(table_path_);
+    read_context_builder.SetOptions(options).SetPredicate(predicate).SetReadSchema(&c_schema);
+    ASSERT_OK_AND_ASSIGN(auto read_context, read_context_builder.Finish());
+    ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context)));
+    auto batch_reader_result = table_read->CreateReader(result_plan->Splits());
+    if (!batch_reader_result.ok()) {  // NOTE(review): this specific failure is tolerated
+        ASSERT_NE(batch_reader_result.status().ToString().find("has no matching Arrow field"),
+                  std::string::npos);
+        return;  // the test ends here without comparing any data
+    }
+
+    auto read_result_result = ReadResultCollector::CollectResult(batch_reader_result.value().get());
+    if (!read_result_result.ok()) {  // NOTE(review): same tolerated failure while reading
+        ASSERT_NE(read_result_result.status().ToString().find("has no matching Arrow field"),
+                  std::string::npos);
+        return;  // the test ends here without comparing any data
+    }
+    auto read_result = std::move(read_result_result.value());
+
+    arrow::FieldVector expected_fields = {
+        arrow::field("_VALUE_KIND", arrow::int8()),  // expected in the result, not in read schema
+        arrow::field("f0", arrow::utf8()),
+        arrow::field("f1", arrow::struct_({arrow::field("x", arrow::int64())})),
+    };
+    auto expected_type = arrow::struct_(expected_fields);
+    auto expected_array =  // only the "Alice" row is expected back
+        arrow::ipc::internal::json::ArrayFromJSON(expected_type, R"([
+        [0, "Alice", [100]]
+    ])")
+            .ValueOrDie();
+    auto expected_chunked = std::make_shared<arrow::ChunkedArray>(expected_array);
+
+    arrow::EqualOptions equal_options = arrow::EqualOptions::Defaults();
+    bool is_equal = expected_chunked->Equals(read_result, equal_options.diff_sink(&std::cout));
+    if (!is_equal) {  // print both sides so a failure is diagnosable from the log
+        std::cout << "[expected_type] " << expected_chunked->type()->ToString() << std::endl;
+        std::cout << "[actual_type]   " << read_result->type()->ToString() << std::endl;
+        std::cout << "[expected] " << expected_chunked->ToString() << std::endl;
+        std::cout << "[actual]   " << read_result->ToString() << std::endl;
+    }
+    ASSERT_TRUE(is_equal);
+}
+
+// Test: pruning sub-fields of LIST<STRUCT<...>> must be rejected in the integration path.
+TEST_P(NestedColumnPruningInteTest, PruneListStructSubFields) {
+    auto list_elem_struct = arrow::struct_({
+        arrow::field("x", arrow::int64()),
+        arrow::field("y", arrow::utf8()),
+        arrow::field("z", arrow::float64()),
+    });
+    auto list_struct_type = arrow::list(arrow::field("item", list_elem_struct));
+    arrow::FieldVector table_fields = {
+        arrow::field("f0", arrow::int32()),
+        arrow::field("f1", list_struct_type),
+    };
+    auto table_schema = arrow::schema(table_fields);
+
+    std::map<std::string, std::string> options = {
+        {Options::MANIFEST_FORMAT, "AVRO"},
+        {Options::FILE_FORMAT, StringUtils::ToUpperCase(file_format_)},  // format under test
+        {Options::TARGET_FILE_SIZE, "1024"},
+        {Options::BUCKET, "-1"},
+    };
+
+    ASSERT_OK_AND_ASSIGN(
+        auto helper, TestHelper::Create(test_dir_, table_schema, /*partition_keys=*/{},
+                                        /*primary_keys=*/{}, options, /*is_streaming_mode=*/false));
+
+    std::string data = R"([
+        [1, [[100, "a", 1.1], [200, "b", 2.2]]],
+        [2, [[300, "c", 3.3]]],
+        [3, []]
+    ])";
+    ASSERT_OK_AND_ASSIGN(auto batch,
+                         TestHelper::MakeRecordBatch(arrow::struct_(table_fields), data,
+                                                     /*partition_map=*/{}, /*bucket=*/0, {}));
+    int64_t commit_identifier = 0;
+    ASSERT_OK_AND_ASSIGN(auto commit_msgs,
+                         helper->WriteAndCommit(std::move(batch), commit_identifier++,
+                                                /*expected_commit_messages=*/std::nullopt));
+
+    ASSERT_OK_AND_ASSIGN(auto data_splits,
+                         helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt));
+    ASSERT_FALSE(data_splits.empty());
+
+    auto pruned_list_elem_struct = arrow::struct_({arrow::field("x", arrow::int64())});
+    auto pruned_list_type = arrow::list(arrow::field("item", pruned_list_elem_struct));
+    arrow::FieldVector projected_fields = {
+        arrow::field("f0", arrow::int32()),
+        arrow::field("f1", pruned_list_type),  // list element reduced from {x, y, z} to {x}
+    };
+    auto projected_schema = arrow::schema(projected_fields);
+
+    ArrowSchema c_schema;
+    ASSERT_TRUE(arrow::ExportSchema(*projected_schema, &c_schema).ok());
+
+    ReadContextBuilder read_context_builder(table_path_);
+    read_context_builder.SetOptions(options).SetReadSchema(&c_schema);
+    ASSERT_OK_AND_ASSIGN(auto read_context, read_context_builder.Finish());
+    ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context)));
+    auto batch_reader_result = table_read->CreateReader(data_splits);
+    if (!batch_reader_result.ok()) {  // the rejection may already surface at reader creation
+        auto message = batch_reader_result.status().ToString();
+        ASSERT_TRUE(message.find("partial projection inside list/map") != std::string::npos ||
+                    message.find("type mismatch") != std::string::npos)
+            << "unexpected error: " << message;
+        return;
+    }
+
+    auto read_result_result = ReadResultCollector::CollectResult(batch_reader_result.value().get());
+    ASSERT_FALSE(read_result_result.ok());  // otherwise reading itself has to fail
+    auto message = read_result_result.status().ToString();
+    ASSERT_TRUE(message.find("partial projection inside list/map") != std::string::npos ||
+                message.find("type mismatch") != std::string::npos)
+        << "unexpected error: " << message;
+}
+
 INSTANTIATE_TEST_SUITE_P(FileFormats, NestedColumnPruningInteTest,
                          ::testing::Values("parquet", "orc"));
 

From 926be2d1ce544bd47d2183b278cebd1aabe4ef2a Mon Sep 17 00:00:00 2001
From: "yonghao.fyh" <yonghao.fyh@alibaba-inc.com>
Date: Wed, 17 Jun 2026 17:38:18 +0800
Subject: [PATCH 20/24] fix

---
 src/paimon/core/io/field_mapping_reader.cpp   |  6 ++---
 .../core/utils/nested_projection_utils.cpp    | 17 ++++++--------
 .../parquet/parquet_file_batch_reader.cpp     | 22 +++++++++----------
 .../parquet_file_batch_reader_test.cpp        |  2 +-
 test/inte/nested_column_pruning_inte_test.cpp |  5 ++---
 5 files changed, 23 insertions(+), 29 deletions(-)

diff --git a/src/paimon/core/io/field_mapping_reader.cpp b/src/paimon/core/io/field_mapping_reader.cpp
index db65d8bad..4249c55d5 100644
--- a/src/paimon/core/io/field_mapping_reader.cpp
+++ b/src/paimon/core/io/field_mapping_reader.cpp
@@ -316,9 +316,9 @@ Status FieldMappingReader::MappingFields(const std::shared_ptr<arrow::Array>& da
 
         // Filter map entries by selected keys if metadata is present.
         if (field_array->type()->id() == arrow::Type::MAP) {
-            PAIMON_ASSIGN_OR_RAISE(
-                std::vector<std::string> selected_keys,
-                NestedProjectionUtils::GetMapSelectedKeys(read_fields_of_data_array[i].ArrowField()));
+            PAIMON_ASSIGN_OR_RAISE(std::vector<std::string> selected_keys,
+                                   NestedProjectionUtils::GetMapSelectedKeys(
+                                       read_fields_of_data_array[i].ArrowField()));
             if (!selected_keys.empty()) {
                 PAIMON_ASSIGN_OR_RAISE(field_array,
                                        NestedProjectionUtils::FilterMapArrayBySelectedKeys(
diff --git a/src/paimon/core/utils/nested_projection_utils.cpp b/src/paimon/core/utils/nested_projection_utils.cpp
index c24919ab4..f5fb51b0d 100644
--- a/src/paimon/core/utils/nested_projection_utils.cpp
+++ b/src/paimon/core/utils/nested_projection_utils.cpp
@@ -212,9 +212,8 @@ Result<std::vector<std::string>> NestedProjectionUtils::GetMapSelectedKeys(
     for (auto& token : tokens) {
         StringUtils::Trim(&token);
         if (!deduplicated.insert(token).second) {
-            return Status::Invalid(
-                fmt::format("Duplicate selected key '{}' in {} metadata", token,
-                            DataField::MAP_SELECTED_KEYS));
+            return Status::Invalid(fmt::format("Duplicate selected key '{}' in {} metadata", token,
+                                               DataField::MAP_SELECTED_KEYS));
         }
         result.push_back(token);
     }
@@ -222,8 +221,7 @@ Result<std::vector<std::string>> NestedProjectionUtils::GetMapSelectedKeys(
 }
 
 Result<std::shared_ptr<arrow::Array>> NestedProjectionUtils::FilterMapArrayBySelectedKeys(
-    const std::shared_ptr<arrow::Array>& array,
-    const std::vector<std::string>& selected_keys) {
+    const std::shared_ptr<arrow::Array>& array, const std::vector<std::string>& selected_keys) {
     if (selected_keys.empty() || !array || array->length() == 0) {
         return array;
     }
@@ -245,9 +243,8 @@ Result<std::shared_ptr<arrow::Array>> NestedProjectionUtils::FilterMapArrayBySel
     deduplicated.reserve(selected_keys.size());
     for (const auto& selected_key : selected_keys) {
         if (!deduplicated.insert(selected_key).second) {
-            return Status::Invalid(
-                fmt::format("Duplicate selected key '{}' in {} metadata", selected_key,
-                            DataField::MAP_SELECTED_KEYS));
+            return Status::Invalid(fmt::format("Duplicate selected key '{}' in {} metadata",
+                                               selected_key, DataField::MAP_SELECTED_KEYS));
         }
     }
 
@@ -275,8 +272,8 @@ Result<std::shared_ptr<arrow::Array>> NestedProjectionUtils::FilterMapArrayBySel
                 }
                 std::string_view key_view = keys_array->GetView(entry_idx);
                 if (key_view == selected_key) {
-                    PAIMON_RETURN_NOT_OK_FROM_ARROW(
-                        key_builder->Append(key_view.data(), static_cast<int32_t>(key_view.size())));
+                    PAIMON_RETURN_NOT_OK_FROM_ARROW(key_builder->Append(
+                        key_view.data(), static_cast<int32_t>(key_view.size())));
                     PAIMON_RETURN_NOT_OK_FROM_ARROW(
                         value_builder->AppendArraySlice(*values_array->data(), entry_idx, 1));
                 }
diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.cpp b/src/paimon/format/parquet/parquet_file_batch_reader.cpp
index 1d819c108..6bae091b5 100644
--- a/src/paimon/format/parquet/parquet_file_batch_reader.cpp
+++ b/src/paimon/format/parquet/parquet_file_batch_reader.cpp
@@ -582,17 +582,17 @@ Result<::parquet::ArrowReaderProperties> ParquetFileBatchReader::CreateArrowRead
 
 // Nested column index computation
 
-Status ParquetFileBatchReader::CollectLeafIndices(
-    const std::shared_ptr<arrow::DataType>& read_type,
-    const std::shared_ptr<arrow::DataType>& file_type, int32_t* leaf_index,
-    std::vector<int32_t>* indices) {
+Status ParquetFileBatchReader::CollectLeafIndices(const std::shared_ptr<arrow::DataType>& read_type,
+                                                  const std::shared_ptr<arrow::DataType>& file_type,
+                                                  int32_t* leaf_index,
+                                                  std::vector<int32_t>* indices) {
     if (file_type->id() == arrow::Type::STRUCT) {
         for (const auto& file_child : file_type->fields()) {
             std::shared_ptr<arrow::Field> read_child =
                 FindMatchingReadField(read_type->fields(), file_child);
             if (read_child) {
-                PAIMON_RETURN_NOT_OK(
-                    CollectLeafIndices(read_child->type(), file_child->type(), leaf_index, indices));
+                PAIMON_RETURN_NOT_OK(CollectLeafIndices(read_child->type(), file_child->type(),
+                                                        leaf_index, indices));
             } else {
                 SkipLeafIndices(file_child->type(), leaf_index);
             }
@@ -606,9 +606,8 @@ Status ParquetFileBatchReader::CollectLeafIndices(
                 file_type->ToString(), read_type->ToString()));
         }
         for (int32_t i = 0; i < file_type->num_fields(); i++) {
-            PAIMON_RETURN_NOT_OK(CollectLeafIndices(read_type->field(i)->type(),
-                                                    file_type->field(i)->type(), leaf_index,
-                                                    indices));
+            PAIMON_RETURN_NOT_OK(CollectLeafIndices(
+                read_type->field(i)->type(), file_type->field(i)->type(), leaf_index, indices));
         }
     } else {
         // Leaf column — collect its index.
@@ -649,9 +648,8 @@ Result<std::vector<int32_t>> ParquetFileBatchReader::ComputeNestedColumnIndices(
             continue;
         }
         int32_t leaf_index = file_field_leaf_starts[file_field_idx];
-        PAIMON_RETURN_NOT_OK(CollectLeafIndices(read_field->type(),
-                                                file_fields[file_field_idx]->type(), &leaf_index,
-                                                &indices));
+        PAIMON_RETURN_NOT_OK(CollectLeafIndices(
+            read_field->type(), file_fields[file_field_idx]->type(), &leaf_index, &indices));
     }
     return indices;
 }
diff --git a/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp b/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp
index eaa0032c9..271a4b4a1 100644
--- a/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp
+++ b/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp
@@ -39,8 +39,8 @@
 #include "paimon/common/utils/date_time_utils.h"
 #include "paimon/common/utils/path_util.h"
 #include "paimon/defs.h"
-#include "paimon/format/parquet/parquet_format_defs.h"
 #include "paimon/format/parquet/parquet_field_id_converter.h"
+#include "paimon/format/parquet/parquet_format_defs.h"
 #include "paimon/format/parquet/parquet_format_writer.h"
 #include "paimon/fs/file_system.h"
 #include "paimon/fs/local/local_file_system.h"
diff --git a/test/inte/nested_column_pruning_inte_test.cpp b/test/inte/nested_column_pruning_inte_test.cpp
index 7aabcd906..43df9b747 100644
--- a/test/inte/nested_column_pruning_inte_test.cpp
+++ b/test/inte/nested_column_pruning_inte_test.cpp
@@ -901,11 +901,10 @@ TEST_P(NestedColumnPruningInteTest, ParquetPageIndexFilterWithNestedPruning) {
         arrow::field("f1", arrow::struct_({arrow::field("x", arrow::int64())})),
     };
     auto expected_type = arrow::struct_(expected_fields);
-    auto expected_array =
-        arrow::ipc::internal::json::ArrayFromJSON(expected_type, R"([
+    auto expected_array = arrow::ipc::internal::json::ArrayFromJSON(expected_type, R"([
         [0, "Alice", [100]]
     ])")
-            .ValueOrDie();
+                              .ValueOrDie();
     auto expected_chunked = std::make_shared<arrow::ChunkedArray>(expected_array);
 
     arrow::EqualOptions equal_options = arrow::EqualOptions::Defaults();

From 83698b119272d0f71c3a266d24405bc9a7210b35 Mon Sep 17 00:00:00 2001
From: "yonghao.fyh" <yonghao.fyh@alibaba-inc.com>
Date: Wed, 17 Jun 2026 18:39:49 +0800
Subject: [PATCH 21/24] refactor: drop AlignBatchToReadSchemaOrder from parquet reader

---
 .../parquet/parquet_file_batch_reader.cpp     | 83 -------------------
 .../parquet_file_batch_reader_test.cpp        | 45 ----------
 2 files changed, 128 deletions(-)

diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.cpp b/src/paimon/format/parquet/parquet_file_batch_reader.cpp
index 6bae091b5..59275ce60 100644
--- a/src/paimon/format/parquet/parquet_file_batch_reader.cpp
+++ b/src/paimon/format/parquet/parquet_file_batch_reader.cpp
@@ -116,82 +116,6 @@ int32_t FindMatchingFileFieldIndex(const arrow::FieldVector& file_fields,
     return -1;
 }
 
-Result<std::shared_ptr<arrow::RecordBatch>> AlignBatchToReadSchemaOrder(
-    const std::shared_ptr<arrow::RecordBatch>& batch,
-    const std::shared_ptr<arrow::DataType>& read_data_type) {
-    auto read_struct = std::dynamic_pointer_cast<arrow::StructType>(read_data_type);
-    if (!read_struct) {
-        return Status::Invalid(
-            fmt::format("Read data type must be struct, got {}", read_data_type->ToString()));
-    }
-    if (batch->num_columns() != read_struct->num_fields()) {
-        return Status::Invalid(
-            fmt::format("Batch column count {} does not match read schema field count {}",
-                        batch->num_columns(), read_struct->num_fields()));
-    }
-
-    std::unordered_map<int32_t, int32_t> batch_field_id_index;
-    batch_field_id_index.reserve(static_cast<size_t>(batch->num_columns()));
-    std::unordered_map<std::string, int32_t> batch_field_index;
-    batch_field_index.reserve(static_cast<size_t>(batch->num_columns()));
-    for (int32_t i = 0; i < batch->num_columns(); ++i) {
-        const auto& batch_field = batch->schema()->field(i);
-        int32_t batch_field_id = GetFieldIdForMatching(batch_field);
-        if (batch_field_id != -1) {
-            // Keep the first match to remain deterministic if duplicated ids exist.
-            batch_field_id_index.emplace(batch_field_id, i);
-        }
-        batch_field_index.emplace(batch_field->name(), i);
-    }
-
-    auto find_batch_field_index = [&](const std::shared_ptr<arrow::Field>& read_field) -> int32_t {
-        int32_t read_field_id = GetFieldIdForMatching(read_field);
-        if (read_field_id != -1) {
-            auto id_it = batch_field_id_index.find(read_field_id);
-            if (id_it != batch_field_id_index.end()) {
-                return id_it->second;
-            }
-        }
-        auto name_it = batch_field_index.find(read_field->name());
-        if (name_it != batch_field_index.end()) {
-            return name_it->second;
-        }
-        return -1;
-    };
-
-    bool already_aligned = true;
-    for (int32_t i = 0; i < read_struct->num_fields(); ++i) {
-        if (find_batch_field_index(read_struct->field(i)) != i ||
-            batch->schema()->field(i)->name() != read_struct->field(i)->name()) {
-            already_aligned = false;
-            break;
-        }
-    }
-    if (already_aligned) {
-        return batch;
-    }
-
-    std::vector<std::shared_ptr<arrow::Array>> aligned_columns;
-    aligned_columns.reserve(static_cast<size_t>(batch->num_columns()));
-    arrow::FieldVector aligned_fields;
-    aligned_fields.reserve(static_cast<size_t>(batch->num_columns()));
-
-    for (int32_t i = 0; i < read_struct->num_fields(); ++i) {
-        const auto& read_field = read_struct->field(i);
-        int32_t batch_idx = find_batch_field_index(read_field);
-        if (batch_idx < 0) {
-            return Status::Invalid(
-                fmt::format("Parquet batch column '{}' not found while aligning to read schema",
-                            read_field->name()));
-        }
-        aligned_columns.push_back(batch->column(batch_idx));
-        aligned_fields.push_back(read_field);
-    }
-
-    auto aligned_schema = arrow::schema(aligned_fields);
-    return arrow::RecordBatch::Make(aligned_schema, batch->num_rows(), aligned_columns);
-}
-
 }  // namespace
 
 ParquetFileBatchReader::ParquetFileBatchReader(
@@ -285,12 +209,6 @@ Status ParquetFileBatchReader::SetReadSchema(
             FlattenSchema(field->type(), &flat_idx, &leaf_indices);
             field_index_map[field->name()] = leaf_indices;
         }
-        std::map<std::string, int32_t> column_name_to_index;
-        for (const auto& [name, indices] : field_index_map) {
-            if (!indices.empty()) {
-                column_name_to_index[name] = indices[0];
-            }
-        }
 
         std::vector<int32_t> row_groups = arrow::internal::Iota(reader_->GetNumberOfRowGroups());
         if (predicate) {
@@ -485,7 +403,6 @@ Result<BatchReader::ReadBatch> ParquetFileBatchReader::NextBatch() {
         if (batch == nullptr) {
             return BatchReader::MakeEofBatch();
         }
-        PAIMON_ASSIGN_OR_RAISE(batch, AlignBatchToReadSchemaOrder(batch, read_data_type_));
         PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr<arrow::Array> array,
                                           batch->ToStructArray());
         PAIMON_ASSIGN_OR_RAISE(bool need_cast, ParquetTimestampConverter::NeedCastArrayForTimestamp(
diff --git a/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp b/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp
index 271a4b4a1..bad2e231b 100644
--- a/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp
+++ b/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp
@@ -380,51 +380,6 @@ TEST_F(ParquetFileBatchReaderTest, TestNextBatchWithOutofOrderTargetSchema) {
     ASSERT_TRUE(result_array->Equals(expected_chunk_array));
 }
 
-TEST_F(ParquetFileBatchReaderTest, TestNextBatchWithRenamedOutofOrderTargetSchema) {
-    auto write_field_a = arrow::field(
-        "old_a", arrow::int32(),
-        arrow::KeyValueMetadata::Make({ParquetFieldIdConverter::PARQUET_FIELD_ID}, {"1"}));
-    auto write_field_b = arrow::field(
-        "old_b", arrow::int64(),
-        arrow::KeyValueMetadata::Make({ParquetFieldIdConverter::PARQUET_FIELD_ID}, {"2"}));
-    arrow::FieldVector write_fields = {write_field_a, write_field_b};
-    auto write_schema = arrow::schema(write_fields);
-
-    auto write_array = std::dynamic_pointer_cast<arrow::StructArray>(
-        arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(write_fields), R"([
-        [1, 10],
-        [2, 20],
-        [3, 30]
-    ])")
-            .ValueOrDie());
-
-    WriteArray(file_path_, write_array, write_schema, /*write_batch_size=*/3,
-               /*enable_dictionary=*/false, /*max_row_group_length=*/3);
-
-    // Rename fields and read in out-of-order: new_b(id=2), new_a(id=1)
-    auto read_schema = DataField::ConvertDataFieldsToArrowSchema(
-        {DataField(2, arrow::field("new_b", arrow::int64())),
-         DataField(1, arrow::field("new_a", arrow::int32()))});
-
-    auto parquet_batch_reader =
-        PrepareParquetFileBatchReader(file_path_, read_schema, /*predicate=*/nullptr,
-                                      /*selection_bitmap=*/std::nullopt, /*batch_size=*/2);
-
-    ASSERT_OK_AND_ASSIGN(
-        std::shared_ptr<arrow::ChunkedArray> result_array,
-        paimon::test::ReadResultCollector::CollectResult(parquet_batch_reader.get()));
-
-    auto expected_array = std::dynamic_pointer_cast<arrow::StructArray>(
-        arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(read_schema->fields()), R"([
-        [10, 1],
-        [20, 2],
-        [30, 3]
-    ])")
-            .ValueOrDie());
-    auto expected_chunk_array = std::make_shared<arrow::ChunkedArray>(expected_array);
-    ASSERT_TRUE(result_array->Equals(expected_chunk_array));
-}
-
 TEST_F(ParquetFileBatchReaderTest, TestNextBatchWithDictionary) {
     auto f0 = arrow::field("f0", arrow::list(arrow::utf8()));
     auto f1 = arrow::field("f1", arrow::map(arrow::utf8(), arrow::binary()));

From 44048edf061a3765208fc7e1e6d7b1d4d12cce7f Mon Sep 17 00:00:00 2001
From: "yonghao.fyh" <yonghao.fyh@alibaba-inc.com>
Date: Wed, 17 Jun 2026 18:49:01 +0800
Subject: [PATCH 22/24] refactor: match parquet fields by name only, drop field-id lookup

---
 .../parquet/parquet_file_batch_reader.cpp     | 37 -------------------
 1 file changed, 37 deletions(-)

diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.cpp b/src/paimon/format/parquet/parquet_file_batch_reader.cpp
index 59275ce60..75856e333 100644
--- a/src/paimon/format/parquet/parquet_file_batch_reader.cpp
+++ b/src/paimon/format/parquet/parquet_file_batch_reader.cpp
@@ -39,8 +39,6 @@
 #include "paimon/common/metrics/metrics_impl.h"
 #include "paimon/common/utils/arrow/status_utils.h"
 #include "paimon/common/utils/options_utils.h"
-#include "paimon/common/utils/string_utils.h"
-#include "paimon/core/utils/nested_projection_utils.h"
 #include "paimon/format/parquet/parquet_field_id_converter.h"
 #include "paimon/format/parquet/parquet_format_defs.h"
 #include "paimon/format/parquet/parquet_timestamp_converter.h"
@@ -61,34 +59,8 @@ namespace paimon::parquet {
 
 namespace {
 
-int32_t GetFieldIdForMatching(const std::shared_ptr<arrow::Field>& field) {
-    int32_t field_id = NestedProjectionUtils::GetPaimonFieldId(field);
-    if (field_id != -1) {
-        return field_id;
-    }
-    if (!field || !field->HasMetadata() || !field->metadata()) {
-        return -1;
-    }
-    auto get_result = field->metadata()->Get(ParquetFieldIdConverter::PARQUET_FIELD_ID);
-    if (!get_result.ok()) {
-        return -1;
-    }
-    std::optional<int32_t> parquet_field_id =
-        StringUtils::StringToValue<int32_t>(get_result.ValueUnsafe());
-    return parquet_field_id.value_or(-1);
-}
-
 std::shared_ptr<arrow::Field> FindMatchingReadField(
     const arrow::FieldVector& read_fields, const std::shared_ptr<arrow::Field>& file_field) {
-    int32_t file_field_id = GetFieldIdForMatching(file_field);
-    if (file_field_id != -1) {
-        for (const auto& candidate : read_fields) {
-            if (GetFieldIdForMatching(candidate) == file_field_id) {
-                return candidate;
-            }
-        }
-    }
-
     for (const auto& candidate : read_fields) {
         if (candidate->name() == file_field->name()) {
             return candidate;
@@ -99,15 +71,6 @@ std::shared_ptr<arrow::Field> FindMatchingReadField(
 
 int32_t FindMatchingFileFieldIndex(const arrow::FieldVector& file_fields,
                                    const std::shared_ptr<arrow::Field>& read_field) {
-    int32_t read_field_id = GetFieldIdForMatching(read_field);
-    if (read_field_id != -1) {
-        for (int32_t i = 0; i < static_cast<int32_t>(file_fields.size()); ++i) {
-            if (GetFieldIdForMatching(file_fields[i]) == read_field_id) {
-                return i;
-            }
-        }
-    }
-
     for (int32_t i = 0; i < static_cast<int32_t>(file_fields.size()); ++i) {
         if (file_fields[i]->name() == read_field->name()) {
             return i;

From 947b12f8fb0a4cf930ab6cc1ebcbc80bf57a11fc Mon Sep 17 00:00:00 2001
From: "yonghao.fyh" <yonghao.fyh@alibaba-inc.com>
Date: Thu, 18 Jun 2026 09:00:58 +0800
Subject: [PATCH 23/24] refactor: inline name-matching helpers in parquet batch reader

---
 .../parquet/parquet_file_batch_reader.cpp     | 41 +++++++------------
 1 file changed, 14 insertions(+), 27 deletions(-)

diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.cpp b/src/paimon/format/parquet/parquet_file_batch_reader.cpp
index 75856e333..d15ff2dc9 100644
--- a/src/paimon/format/parquet/parquet_file_batch_reader.cpp
+++ b/src/paimon/format/parquet/parquet_file_batch_reader.cpp
@@ -57,30 +57,6 @@ class Predicate;
 
 namespace paimon::parquet {
 
-namespace {
-
-std::shared_ptr<arrow::Field> FindMatchingReadField(
-    const arrow::FieldVector& read_fields, const std::shared_ptr<arrow::Field>& file_field) {
-    for (const auto& candidate : read_fields) {
-        if (candidate->name() == file_field->name()) {
-            return candidate;
-        }
-    }
-    return nullptr;
-}
-
-int32_t FindMatchingFileFieldIndex(const arrow::FieldVector& file_fields,
-                                   const std::shared_ptr<arrow::Field>& read_field) {
-    for (int32_t i = 0; i < static_cast<int32_t>(file_fields.size()); ++i) {
-        if (file_fields[i]->name() == read_field->name()) {
-            return i;
-        }
-    }
-    return -1;
-}
-
-}  // namespace
-
 ParquetFileBatchReader::ParquetFileBatchReader(
     std::shared_ptr<arrow::io::RandomAccessFile>&& input_stream,
     std::unique_ptr<FileReaderWrapper>&& reader, const std::map<std::string, std::string>& options,
@@ -468,8 +444,13 @@ Status ParquetFileBatchReader::CollectLeafIndices(const std::shared_ptr<arrow::D
                                                   std::vector<int32_t>* indices) {
     if (file_type->id() == arrow::Type::STRUCT) {
         for (const auto& file_child : file_type->fields()) {
-            std::shared_ptr<arrow::Field> read_child =
-                FindMatchingReadField(read_type->fields(), file_child);
+            std::shared_ptr<arrow::Field> read_child = nullptr;
+            for (const auto& candidate : read_type->fields()) {
+                if (candidate->name() == file_child->name()) {
+                    read_child = candidate;
+                    break;
+                }
+            }
             if (read_child) {
                 PAIMON_RETURN_NOT_OK(CollectLeafIndices(read_child->type(), file_child->type(),
                                                         leaf_index, indices));
@@ -523,7 +504,13 @@ Result<std::vector<int32_t>> ParquetFileBatchReader::ComputeNestedColumnIndices(
 
     const auto& file_fields = file_schema->fields();
     for (const auto& read_field : read_schema->fields()) {
-        int32_t file_field_idx = FindMatchingFileFieldIndex(file_fields, read_field);
+        int32_t file_field_idx = -1;
+        for (int32_t i = 0; i < static_cast<int32_t>(file_fields.size()); ++i) {
+            if (file_fields[i]->name() == read_field->name()) {
+                file_field_idx = i;
+                break;
+            }
+        }
         if (file_field_idx < 0) {
             continue;
         }

From b9fc5fd1085d0c29928e957df4eda0fa30a3f3cd Mon Sep 17 00:00:00 2001
From: "yonghao.fyh" <yonghao.fyh@alibaba-inc.com>
Date: Thu, 18 Jun 2026 09:52:48 +0800
Subject: [PATCH 24/24] docs: describe name-based field matching for read schema

---
 include/paimon/read_context.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/include/paimon/read_context.h b/include/paimon/read_context.h
index a25d00408..e89352155 100644
--- a/include/paimon/read_context.h
+++ b/include/paimon/read_context.h
@@ -210,8 +210,11 @@ class PAIMON_EXPORT ReadContextBuilder {
     ///
     /// The read schema is an Arrow C Data Interface schema where STRUCT types
     /// may contain only a subset of the original sub-fields, enabling nested column
-    /// pruning to reduce I/O. Each Arrow field must carry a "paimon.id" metadata
-    /// entry for field matching.
+    /// pruning to reduce I/O. Field matching is based on field name: the system
+    /// looks up each field by name in the table schema and rebuilds the aligned
+    /// schema using the table schema's type and metadata. Any "paimon.id" metadata
+    /// in the user-provided schema is ignored. Other custom metadata (except
+    /// "paimon.id") is preserved and merged into the final aligned schema.
     ///
     /// @param read_schema Arrow C Schema. The caller retains ownership.
     /// @return Reference to this builder for method chaining.