From 94c434a61e9e1cb96823d6d8fd5c1748085bf7f8 Mon Sep 17 00:00:00 2001
From: zhouhongfeng <zhouhongfeng.zhf@alibaba-inc.com>
Date: Tue, 16 Jun 2026 18:34:21 +0800
Subject: [PATCH 1/6] test: add test cases for nested columns

---
 .../page_filtered_row_group_reader_test.cpp   | 274 ++++++++++++++++++
 1 file changed, 274 insertions(+)

diff --git a/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp b/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp
index a963efaba..795ae142b 100644
--- a/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp
+++ b/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp
@@ -873,5 +873,279 @@ TEST_F(PageFilteredRowGroupReaderTest, ComputePageRangesWithDictionaryEncoding)
     auto partial_concat = arrow::Concatenate(result_partial->chunks()).ValueOrDie();
     ASSERT_TRUE(partial_concat->Equals(expected_struct));
 }
+/// Helper: build a StructArray with a top-level int32 "id" column and a nested struct column
+/// "info" containing two int32 fields: "x" and "y".
+/// id[i] = i, info.x[i] = i * 100, info.y[i] = i * 100 + 1, for i in [0, N).
+///
+/// Arrow schema: { id: int32, info: struct<x: int32, y: int32> }
+/// Parquet leaf columns: [id (index 0), info.x (index 1), info.y (index 2)]
+static std::shared_ptr<arrow::StructArray> MakeNestedStructData(int32_t num_rows) {
+    arrow::Int32Builder id_builder, x_builder, y_builder;
+    EXPECT_TRUE(id_builder.Reserve(num_rows).ok());
+    EXPECT_TRUE(x_builder.Reserve(num_rows).ok());
+    EXPECT_TRUE(y_builder.Reserve(num_rows).ok());
+    for (int32_t i = 0; i < num_rows; ++i) {
+        id_builder.UnsafeAppend(i);
+        x_builder.UnsafeAppend(i * 100);
+        y_builder.UnsafeAppend(i * 100 + 1);
+    }
+    auto id_array = id_builder.Finish().ValueOrDie();
+    auto x_array = x_builder.Finish().ValueOrDie();
+    auto y_array = y_builder.Finish().ValueOrDie();
+
+    auto field_x = arrow::field("x", arrow::int32());
+    auto field_y = arrow::field("y", arrow::int32());
+    auto inner_struct =
+        arrow::StructArray::Make({x_array, y_array}, {field_x, field_y}).ValueOrDie();
+
+    auto field_id = arrow::field("id", arrow::int32());
+    auto field_info = arrow::field("info", arrow::struct_({field_x, field_y}));
+    return arrow::StructArray::Make({id_array, inner_struct}, {field_id, field_info}).ValueOrDie();
+}
+
+/// Test: page-level filtering on a file with nested struct columns.
+///
+/// This test exposes the bug where BuildPageFilteredSchema fails to correctly map
+/// Parquet leaf column indices to Arrow fields for nested types, and
+/// ReadFilteredRowGroup cannot correctly assemble nested column results.
+///
+/// Schema: { id: int32, info: struct<x: int32, y: int32> }
+/// Parquet leaf columns: [id=0, info.x=1, info.y=2]
+/// 100 rows, 10 per page, 1 row group.
+/// Predicate: id >= 50 → pages 0-4 skipped, pages 5-9 read → 50 rows expected.
+/// The read schema requests both "id" and "info" columns.
+TEST_F(PageFilteredRowGroupReaderTest, NestedStructColumnPageFilter) {
+    std::string file_name = dir_->Str() + "/nested_struct_filter.parquet";
+    auto data = MakeNestedStructData(100);
+    WriteTestFile(file_name, data, /*write_batch_size=*/10, /*max_row_group_length=*/100);
+
+    auto field_x = arrow::field("x", arrow::int32());
+    auto field_y = arrow::field("y", arrow::int32());
+    auto read_schema = arrow::schema({arrow::field("id", arrow::int32()),
+                                      arrow::field("info", arrow::struct_({field_x, field_y}))});
+
+    auto predicate = PredicateBuilder::GreaterOrEqual(
+        /*field_index=*/0, /*field_name=*/"id", FieldType::INT, Literal(50));
+
+    std::shared_ptr<arrow::ChunkedArray> result;
+    ReadWithPredicateImpl(file_name, read_schema, predicate, &result);
+
+    // Should get rows 50-99 = 50 rows
+    ASSERT_TRUE(result);
+    ASSERT_EQ(50, result->length());
+
+    // Build expected result: rows 50-99 from the original data
+    auto expected = data->Slice(50, 50);
+    ASSERT_TRUE(expected->Equals(result->chunk(0)));
+}
+
+/// Test: page-level filtering reading only the nested struct column (without the predicate column).
+///
+/// This verifies that when reading a subset of columns that includes only a nested column,
+/// the schema mapping and column assembly work correctly.
+///
+/// Schema: { id: int32, info: struct<x: int32, y: int32> }
+/// Read schema: { info: struct<x: int32, y: int32> } (only the nested column)
+/// Predicate on "id": id >= 50.
+TEST_F(PageFilteredRowGroupReaderTest, NestedStructColumnOnlyReadNestedField) {
+    std::string file_name = dir_->Str() + "/nested_struct_only_nested.parquet";
+    auto data = MakeNestedStructData(100);
+    WriteTestFile(file_name, data, /*write_batch_size=*/10, /*max_row_group_length=*/100);
+
+    auto field_x = arrow::field("x", arrow::int32());
+    auto field_y = arrow::field("y", arrow::int32());
+    // Read only the nested "info" column
+    auto read_schema = arrow::schema({arrow::field("info", arrow::struct_({field_x, field_y}))});
+
+    // Predicate is on "id" (field_index=0 in file schema, not in read schema)
+    auto predicate = PredicateBuilder::GreaterOrEqual(
+        /*field_index=*/0, /*field_name=*/"id", FieldType::INT, Literal(50));
+
+    std::shared_ptr<arrow::ChunkedArray> result;
+    ReadWithPredicateImpl(file_name, read_schema, predicate, &result);
+
+    // Should get rows 50-99 = 50 rows
+    ASSERT_TRUE(result);
+    ASSERT_EQ(50, result->length());
+
+    // Build expected: only the "info" field from rows 50-99
+    auto sliced = std::dynamic_pointer_cast<arrow::StructArray>(data->Slice(50, 50));
+    ASSERT_TRUE(sliced);
+    // Extract only the "info" column (field index 1) and wrap as struct with single field
+    auto expected =
+        arrow::StructArray::Make({sliced->field(1)},
+                                 {arrow::field("info", arrow::struct_({field_x, field_y}))})
+            .ValueOrDie();
+    ASSERT_TRUE(expected->Equals(result->chunk(0)));
+}
+
+/// Helper: build a StructArray with an int32 "id" column and a list<int32> "tags" column.
+/// id[i] = i, tags[i] = [i*10, i*10+1], for i in [0, N).
+///
+/// Arrow schema: { id: int32, tags: list<item: int32> }
+/// Parquet leaf columns: [id (index 0), tags.item (index 1)]
+static std::shared_ptr<arrow::StructArray> MakeListColumnData(int32_t num_rows) {
+    arrow::Int32Builder id_builder;
+    EXPECT_TRUE(id_builder.Reserve(num_rows).ok());
+    for (int32_t i = 0; i < num_rows; ++i) {
+        id_builder.UnsafeAppend(i);
+    }
+    auto id_array = id_builder.Finish().ValueOrDie();
+
+    auto value_builder = std::make_shared<arrow::Int32Builder>();
+    arrow::ListBuilder list_builder(arrow::default_memory_pool(), value_builder);
+    for (int32_t i = 0; i < num_rows; ++i) {
+        EXPECT_TRUE(list_builder.Append().ok());
+        EXPECT_TRUE(value_builder->Append(i * 10).ok());
+        EXPECT_TRUE(value_builder->Append(i * 10 + 1).ok());
+    }
+    auto list_array = list_builder.Finish().ValueOrDie();
+
+    auto field_id = arrow::field("id", arrow::int32());
+    auto field_tags = arrow::field("tags", arrow::list(arrow::field("item", arrow::int32())));
+    return arrow::StructArray::Make({id_array, list_array}, {field_id, field_tags}).ValueOrDie();
+}
+
+/// Helper: build a StructArray with an int32 "id" column and a map<utf8, int32> "props" column.
+/// id[i] = i, props[i] = {"k_i": i * 100}, for i in [0, N).
+///
+/// Arrow schema: { id: int32, props: map<utf8, int32> }
+/// Parquet leaf columns: [id (index 0), props.key (index 1), props.value (index 2)]
+static std::shared_ptr<arrow::StructArray> MakeMapColumnData(int32_t num_rows) {
+    arrow::Int32Builder id_builder;
+    EXPECT_TRUE(id_builder.Reserve(num_rows).ok());
+    for (int32_t i = 0; i < num_rows; ++i) {
+        id_builder.UnsafeAppend(i);
+    }
+    auto id_array = id_builder.Finish().ValueOrDie();
+
+    auto key_builder = std::make_shared<arrow::StringBuilder>();
+    auto value_builder = std::make_shared<arrow::Int32Builder>();
+    arrow::MapBuilder map_builder(arrow::default_memory_pool(), key_builder, value_builder);
+    for (int32_t i = 0; i < num_rows; ++i) {
+        EXPECT_TRUE(map_builder.Append().ok());
+        std::string key = "k_" + std::to_string(i);
+        EXPECT_TRUE(key_builder->Append(key).ok());
+        EXPECT_TRUE(value_builder->Append(i * 100).ok());
+    }
+    auto map_array = map_builder.Finish().ValueOrDie();
+
+    auto field_id = arrow::field("id", arrow::int32());
+    auto field_props = arrow::field("props", arrow::map(arrow::utf8(), arrow::int32()));
+    return arrow::StructArray::Make({id_array, map_array}, {field_id, field_props}).ValueOrDie();
+}
+
+/// Test: page-level filtering on a file with a list column.
+///
+/// Schema: { id: int32, tags: list<item: int32> }
+/// 100 rows, 10 per page, 1 row group.
+/// Predicate: id >= 50 → pages 0-4 skipped, pages 5-9 read → 50 rows expected.
+TEST_F(PageFilteredRowGroupReaderTest, NestedListColumnPageFilter) {
+    std::string file_name = dir_->Str() + "/nested_list_filter.parquet";
+    auto data = MakeListColumnData(100);
+    WriteTestFile(file_name, data, /*write_batch_size=*/10, /*max_row_group_length=*/100);
+
+    auto read_schema =
+        arrow::schema({arrow::field("id", arrow::int32()),
+                       arrow::field("tags", arrow::list(arrow::field("item", arrow::int32())))});
+
+    auto predicate = PredicateBuilder::GreaterOrEqual(
+        /*field_index=*/0, /*field_name=*/"id", FieldType::INT, Literal(50));
+
+    std::shared_ptr<arrow::ChunkedArray> result;
+    ReadWithPredicateImpl(file_name, read_schema, predicate, &result);
+
+    ASSERT_TRUE(result);
+    ASSERT_EQ(50, result->length());
+
+    // Build expected result: rows 50-99 from the original data
+    auto expected = data->Slice(50, 50);
+    ASSERT_TRUE(expected->Equals(result->chunk(0)));
+}
+
+/// Test: page-level filtering on a file with a map column.
+///
+/// Schema: { id: int32, props: map<utf8, int32> }
+/// 100 rows, 10 per page, 1 row group.
+/// Predicate: id >= 50 → pages 0-4 skipped, pages 5-9 read → 50 rows expected.
+TEST_F(PageFilteredRowGroupReaderTest, NestedMapColumnPageFilter) {
+    std::string file_name = dir_->Str() + "/nested_map_filter.parquet";
+    auto data = MakeMapColumnData(100);
+    WriteTestFile(file_name, data, /*write_batch_size=*/10, /*max_row_group_length=*/100);
+
+    auto read_schema =
+        arrow::schema({arrow::field("id", arrow::int32()),
+                       arrow::field("props", arrow::map(arrow::utf8(), arrow::int32()))});
+
+    auto predicate = PredicateBuilder::GreaterOrEqual(
+        /*field_index=*/0, /*field_name=*/"id", FieldType::INT, Literal(50));
+
+    std::shared_ptr<arrow::ChunkedArray> result;
+    ReadWithPredicateImpl(file_name, read_schema, predicate, &result);
+
+    ASSERT_TRUE(result);
+    ASSERT_EQ(50, result->length());
+
+    // Build expected result: rows 50-99 from the original data
+    auto expected = data->Slice(50, 50);
+    ASSERT_TRUE(expected->Equals(result->chunk(0)));
+}
+
+/// Test: page-level filtering with multiple adjacent nested columns (struct + list).
+///
+/// Schema: { id: int32, info: struct<x: int32, y: int32>, tags: list<item: int32> }
+/// This tests the boundary handling when two nested fields are adjacent in the schema.
+/// Predicate: id >= 50.
+TEST_F(PageFilteredRowGroupReaderTest, MultipleAdjacentNestedColumns) {
+    std::string file_name = dir_->Str() + "/multi_nested.parquet";
+
+    // Build data with id, info (struct), tags (list)
+    arrow::Int32Builder id_builder, x_builder, y_builder;
+    ASSERT_TRUE(id_builder.Reserve(100).ok());
+    ASSERT_TRUE(x_builder.Reserve(100).ok());
+    ASSERT_TRUE(y_builder.Reserve(100).ok());
+    auto value_builder = std::make_shared<arrow::Int32Builder>();
+    arrow::ListBuilder list_builder(arrow::default_memory_pool(), value_builder);
+
+    for (int32_t i = 0; i < 100; ++i) {
+        id_builder.UnsafeAppend(i);
+        x_builder.UnsafeAppend(i * 100);
+        y_builder.UnsafeAppend(i * 100 + 1);
+        ASSERT_TRUE(list_builder.Append().ok());
+        ASSERT_TRUE(value_builder->Append(i * 10).ok());
+    }
+    auto id_array = id_builder.Finish().ValueOrDie();
+    auto x_array = x_builder.Finish().ValueOrDie();
+    auto y_array = y_builder.Finish().ValueOrDie();
+    auto list_array = list_builder.Finish().ValueOrDie();
+
+    auto field_x = arrow::field("x", arrow::int32());
+    auto field_y = arrow::field("y", arrow::int32());
+    auto inner_struct =
+        arrow::StructArray::Make({x_array, y_array}, {field_x, field_y}).ValueOrDie();
+
+    auto field_id = arrow::field("id", arrow::int32());
+    auto field_info = arrow::field("info", arrow::struct_({field_x, field_y}));
+    auto field_tags = arrow::field("tags", arrow::list(arrow::field("item", arrow::int32())));
+    auto data = arrow::StructArray::Make({id_array, inner_struct, list_array},
+                                         {field_id, field_info, field_tags})
+                    .ValueOrDie();
+
+    WriteTestFile(file_name, data, /*write_batch_size=*/10, /*max_row_group_length=*/100);
+
+    auto read_schema = arrow::schema({field_id, field_info, field_tags});
+    auto predicate = PredicateBuilder::GreaterOrEqual(
+        /*field_index=*/0, /*field_name=*/"id", FieldType::INT, Literal(50));
+
+    std::shared_ptr<arrow::ChunkedArray> result;
+    ReadWithPredicateImpl(file_name, read_schema, predicate, &result);
+
+    ASSERT_TRUE(result);
+    ASSERT_EQ(50, result->length());
+
+    // Build expected result: rows 50-99 from the original data
+    auto expected = data->Slice(50, 50);
+    ASSERT_TRUE(expected->Equals(result->chunk(0)));
+}
 
 }  // namespace paimon::parquet::test

From 30ea1d84998cab8443c456f5282b8b924274370f Mon Sep 17 00:00:00 2001
From: zhouhongfeng <zhouhongfeng.zhf@alibaba-inc.com>
Date: Wed, 17 Jun 2026 10:14:52 +0800
Subject: [PATCH 2/6] walkaround: fallback nested field to RowGroup reading

---
 .../parquet/page_filtered_row_group_reader_test.cpp    | 10 +++++-----
 .../format/parquet/parquet_file_batch_reader.cpp       |  6 +++++-
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp b/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp
index 795ae142b..6cd558252 100644
--- a/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp
+++ b/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp
@@ -917,7 +917,7 @@ static std::shared_ptr<arrow::StructArray> MakeNestedStructData(int32_t num_rows
 TEST_F(PageFilteredRowGroupReaderTest, NestedStructColumnPageFilter) {
     std::string file_name = dir_->Str() + "/nested_struct_filter.parquet";
     auto data = MakeNestedStructData(100);
-    WriteTestFile(file_name, data, /*write_batch_size=*/10, /*max_row_group_length=*/100);
+    WriteTestFile(file_name, data, /*write_batch_size=*/10, /*max_row_group_length=*/50);
 
     auto field_x = arrow::field("x", arrow::int32());
     auto field_y = arrow::field("y", arrow::int32());
@@ -950,7 +950,7 @@ TEST_F(PageFilteredRowGroupReaderTest, NestedStructColumnPageFilter) {
 TEST_F(PageFilteredRowGroupReaderTest, NestedStructColumnOnlyReadNestedField) {
     std::string file_name = dir_->Str() + "/nested_struct_only_nested.parquet";
     auto data = MakeNestedStructData(100);
-    WriteTestFile(file_name, data, /*write_batch_size=*/10, /*max_row_group_length=*/100);
+    WriteTestFile(file_name, data, /*write_batch_size=*/10, /*max_row_group_length=*/50);
 
     auto field_x = arrow::field("x", arrow::int32());
     auto field_y = arrow::field("y", arrow::int32());
@@ -1043,7 +1043,7 @@ static std::shared_ptr<arrow::StructArray> MakeMapColumnData(int32_t num_rows) {
 TEST_F(PageFilteredRowGroupReaderTest, NestedListColumnPageFilter) {
     std::string file_name = dir_->Str() + "/nested_list_filter.parquet";
     auto data = MakeListColumnData(100);
-    WriteTestFile(file_name, data, /*write_batch_size=*/10, /*max_row_group_length=*/100);
+    WriteTestFile(file_name, data, /*write_batch_size=*/10, /*max_row_group_length=*/50);
 
     auto read_schema =
         arrow::schema({arrow::field("id", arrow::int32()),
@@ -1071,7 +1071,7 @@ TEST_F(PageFilteredRowGroupReaderTest, NestedListColumnPageFilter) {
 TEST_F(PageFilteredRowGroupReaderTest, NestedMapColumnPageFilter) {
     std::string file_name = dir_->Str() + "/nested_map_filter.parquet";
     auto data = MakeMapColumnData(100);
-    WriteTestFile(file_name, data, /*write_batch_size=*/10, /*max_row_group_length=*/100);
+    WriteTestFile(file_name, data, /*write_batch_size=*/10, /*max_row_group_length=*/50);
 
     auto read_schema =
         arrow::schema({arrow::field("id", arrow::int32()),
@@ -1131,7 +1131,7 @@ TEST_F(PageFilteredRowGroupReaderTest, MultipleAdjacentNestedColumns) {
                                          {field_id, field_info, field_tags})
                     .ValueOrDie();
 
-    WriteTestFile(file_name, data, /*write_batch_size=*/10, /*max_row_group_length=*/100);
+    WriteTestFile(file_name, data, /*write_batch_size=*/10, /*max_row_group_length=*/50);
 
     auto read_schema = arrow::schema({field_id, field_info, field_tags});
     auto predicate = PredicateBuilder::GreaterOrEqual(
diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.cpp b/src/paimon/format/parquet/parquet_file_batch_reader.cpp
index 7533cb99a..2d11fb658 100644
--- a/src/paimon/format/parquet/parquet_file_batch_reader.cpp
+++ b/src/paimon/format/parquet/parquet_file_batch_reader.cpp
@@ -39,6 +39,7 @@
 #include "paimon/common/metrics/metrics_impl.h"
 #include "paimon/common/utils/arrow/status_utils.h"
 #include "paimon/common/utils/options_utils.h"
+#include "paimon/core/schema/arrow_schema_validator.h"
 #include "paimon/format/parquet/parquet_field_id_converter.h"
 #include "paimon/format/parquet/parquet_format_defs.h"
 #include "paimon/format/parquet/parquet_timestamp_converter.h"
@@ -129,8 +130,11 @@ Status ParquetFileBatchReader::SetReadSchema(
 
         PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Schema> file_schema, reader_->GetSchema());
         std::unordered_map<std::string, std::vector<int32_t>> field_index_map;
+        bool has_nested_field = false;
         int32_t i = 0;
         for (const auto& field : file_schema->fields()) {
+            has_nested_field =
+                has_nested_field || ArrowSchemaValidator::IsNestedType(field->type());
             std::vector<int32_t> v;
             FlattenSchema(field->type(), &i, &v);
             field_index_map[field->name()] = v;
@@ -166,7 +170,7 @@ Status ParquetFileBatchReader::SetReadSchema(
                 bool enable_page_index_filter,
                 OptionsUtils::GetValueFromMap<bool>(options_, PARQUET_READ_ENABLE_PAGE_INDEX_FILTER,
                                                     DEFAULT_PARQUET_READ_ENABLE_PAGE_INDEX_FILTER));
-            if (enable_page_index_filter) {
+            if (enable_page_index_filter && !has_nested_field) {
                 // Build column name to index map for page-level filtering.
                 // For leaf columns, indices[0] is the correct leaf column index in Parquet.
                 // For nested types (struct/list/map), FlattenSchema produces multiple leaf indices,

From bed9fdf5b698b8320af04edcdc7ef45794472f48 Mon Sep 17 00:00:00 2001
From: zhouhongfeng <zhouhongfeng.zhf@alibaba-inc.com>
Date: Wed, 17 Jun 2026 10:20:02 +0800
Subject: [PATCH 3/6] style: add comments

---
 src/paimon/format/parquet/parquet_file_batch_reader.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.cpp b/src/paimon/format/parquet/parquet_file_batch_reader.cpp
index 2d11fb658..588a45f7c 100644
--- a/src/paimon/format/parquet/parquet_file_batch_reader.cpp
+++ b/src/paimon/format/parquet/parquet_file_batch_reader.cpp
@@ -170,6 +170,8 @@ Status ParquetFileBatchReader::SetReadSchema(
                 bool enable_page_index_filter,
                 OptionsUtils::GetValueFromMap<bool>(options_, PARQUET_READ_ENABLE_PAGE_INDEX_FILTER,
                                                     DEFAULT_PARQUET_READ_ENABLE_PAGE_INDEX_FILTER));
+            // walkaround: page index filter does not support nested fields for now, skip page index
+            // filter if there is any nested field in the schema
             if (enable_page_index_filter && !has_nested_field) {
                 // Build column name to index map for page-level filtering.
                 // For leaf columns, indices[0] is the correct leaf column index in Parquet.

From dfe419d6476e4d95d61799a350b520f4b44aef3d Mon Sep 17 00:00:00 2001
From: zhouhongfeng <zhouhongfeng.zhf@alibaba-inc.com>
Date: Wed, 17 Jun 2026 11:03:03 +0800
Subject: [PATCH 4/6] fix: judge has_nested_field on read-schema

---
 src/paimon/format/parquet/parquet_file_batch_reader.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.cpp b/src/paimon/format/parquet/parquet_file_batch_reader.cpp
index 588a45f7c..701e420db 100644
--- a/src/paimon/format/parquet/parquet_file_batch_reader.cpp
+++ b/src/paimon/format/parquet/parquet_file_batch_reader.cpp
@@ -131,10 +131,12 @@ Status ParquetFileBatchReader::SetReadSchema(
         PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Schema> file_schema, reader_->GetSchema());
         std::unordered_map<std::string, std::vector<int32_t>> field_index_map;
         bool has_nested_field = false;
-        int32_t i = 0;
-        for (const auto& field : file_schema->fields()) {
+        for (const auto& field : read_schema->fields()) {
             has_nested_field =
                 has_nested_field || ArrowSchemaValidator::IsNestedType(field->type());
+        }
+        int32_t i = 0;
+        for (const auto& field : file_schema->fields()) {
             std::vector<int32_t> v;
             FlattenSchema(field->type(), &i, &v);
             field_index_map[field->name()] = v;

From 2ceaf600bcd48dd1dd54224cfa548f3623717a91 Mon Sep 17 00:00:00 2001
From: zhouhongfeng <zhouhongfeng.zhf@alibaba-inc.com>
Date: Wed, 17 Jun 2026 14:29:45 +0800
Subject: [PATCH 5/6] fix: make tests clearer

---
 .../page_filtered_row_group_reader_test.cpp   | 48 +++++++++----------
 .../parquet/parquet_file_batch_reader.cpp     |  6 ++-
 2 files changed, 28 insertions(+), 26 deletions(-)

diff --git a/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp b/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp
index 6cd558252..221ed7569 100644
--- a/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp
+++ b/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp
@@ -903,7 +903,7 @@ static std::shared_ptr<arrow::StructArray> MakeNestedStructData(int32_t num_rows
     return arrow::StructArray::Make({id_array, inner_struct}, {field_id, field_info}).ValueOrDie();
 }
 
-/// Test: page-level filtering on a file with nested struct columns.
+/// Test: rowgroup-level filtering on a file with nested struct columns.
 ///
 /// This test exposes the bug where BuildPageFilteredSchema fails to correctly map
 /// Parquet leaf column indices to Arrow fields for nested types, and
@@ -912,7 +912,7 @@ static std::shared_ptr<arrow::StructArray> MakeNestedStructData(int32_t num_rows
 /// Schema: { id: int32, info: struct<x: int32, y: int32> }
 /// Parquet leaf columns: [id=0, info.x=1, info.y=2]
 /// 100 rows, 10 per page, 1 row group.
-/// Predicate: id >= 50 → pages 0-4 skipped, pages 5-9 read → 50 rows expected.
+/// Predicate: id >= 70 → row groups 0 skipped, row groups 1 read → 50 rows expected.
 /// The read schema requests both "id" and "info" columns.
 TEST_F(PageFilteredRowGroupReaderTest, NestedStructColumnPageFilter) {
     std::string file_name = dir_->Str() + "/nested_struct_filter.parquet";
@@ -925,7 +925,7 @@ TEST_F(PageFilteredRowGroupReaderTest, NestedStructColumnPageFilter) {
                                       arrow::field("info", arrow::struct_({field_x, field_y}))});
 
     auto predicate = PredicateBuilder::GreaterOrEqual(
-        /*field_index=*/0, /*field_name=*/"id", FieldType::INT, Literal(50));
+        /*field_index=*/0, /*field_name=*/"id", FieldType::INT, Literal(70));
 
     std::shared_ptr<arrow::ChunkedArray> result;
     ReadWithPredicateImpl(file_name, read_schema, predicate, &result);
@@ -939,27 +939,29 @@ TEST_F(PageFilteredRowGroupReaderTest, NestedStructColumnPageFilter) {
     ASSERT_TRUE(expected->Equals(result->chunk(0)));
 }
 
-/// Test: page-level filtering reading only the nested struct column (without the predicate column).
+/// Test: rowgroup-level filtering reading the nested struct column along with the predicate column.
 ///
-/// This verifies that when reading a subset of columns that includes only a nested column,
-/// the schema mapping and column assembly work correctly.
+/// This verifies that when reading a subset of columns that includes a nested column
+/// and the predicate column, the schema mapping and column assembly work correctly.
 ///
 /// Schema: { id: int32, info: struct<x: int32, y: int32> }
-/// Read schema: { info: struct<x: int32, y: int32> } (only the nested column)
+/// Read schema: { id: int32, info: struct<x: int32, y: int32> }
 /// Predicate on "id": id >= 50.
 TEST_F(PageFilteredRowGroupReaderTest, NestedStructColumnOnlyReadNestedField) {
     std::string file_name = dir_->Str() + "/nested_struct_only_nested.parquet";
     auto data = MakeNestedStructData(100);
     WriteTestFile(file_name, data, /*write_batch_size=*/10, /*max_row_group_length=*/50);
 
+    auto field_id = arrow::field("id", arrow::int32());
     auto field_x = arrow::field("x", arrow::int32());
     auto field_y = arrow::field("y", arrow::int32());
-    // Read only the nested "info" column
-    auto read_schema = arrow::schema({arrow::field("info", arrow::struct_({field_x, field_y}))});
+    auto field_info = arrow::field("info", arrow::struct_({field_x, field_y}));
+    // Read both "id" (needed for predicate evaluation) and "info" columns
+    auto read_schema = arrow::schema({field_id, field_info});
 
-    // Predicate is on "id" (field_index=0 in file schema, not in read schema)
+    // Predicate is on "id" (field_index=0 in file schema)
     auto predicate = PredicateBuilder::GreaterOrEqual(
-        /*field_index=*/0, /*field_name=*/"id", FieldType::INT, Literal(50));
+        /*field_index=*/0, /*field_name=*/"id", FieldType::INT, Literal(70));
 
     std::shared_ptr<arrow::ChunkedArray> result;
     ReadWithPredicateImpl(file_name, read_schema, predicate, &result);
@@ -968,13 +970,11 @@ TEST_F(PageFilteredRowGroupReaderTest, NestedStructColumnOnlyReadNestedField) {
     ASSERT_TRUE(result);
     ASSERT_EQ(50, result->length());
 
-    // Build expected: only the "info" field from rows 50-99
+    // Build expected: "id" and "info" fields from rows 50-99
     auto sliced = std::dynamic_pointer_cast<arrow::StructArray>(data->Slice(50, 50));
     ASSERT_TRUE(sliced);
-    // Extract only the "info" column (field index 1) and wrap as struct with single field
     auto expected =
-        arrow::StructArray::Make({sliced->field(1)},
-                                 {arrow::field("info", arrow::struct_({field_x, field_y}))})
+        arrow::StructArray::Make({sliced->field(0), sliced->field(1)}, {field_id, field_info})
             .ValueOrDie();
     ASSERT_TRUE(expected->Equals(result->chunk(0)));
 }
@@ -1035,11 +1035,11 @@ static std::shared_ptr<arrow::StructArray> MakeMapColumnData(int32_t num_rows) {
     return arrow::StructArray::Make({id_array, map_array}, {field_id, field_props}).ValueOrDie();
 }
 
-/// Test: page-level filtering on a file with a list column.
+/// Test: rowgroup-level filtering on a file with a list column.
 ///
 /// Schema: { id: int32, tags: list<item: int32> }
 /// 100 rows, 10 per page, 1 row group.
-/// Predicate: id >= 50 → pages 0-4 skipped, pages 5-9 read → 50 rows expected.
+/// Predicate: id >= 50 → row groups 0 skipped, row groups 1 read → 50 rows expected.
 TEST_F(PageFilteredRowGroupReaderTest, NestedListColumnPageFilter) {
     std::string file_name = dir_->Str() + "/nested_list_filter.parquet";
     auto data = MakeListColumnData(100);
@@ -1050,7 +1050,7 @@ TEST_F(PageFilteredRowGroupReaderTest, NestedListColumnPageFilter) {
                        arrow::field("tags", arrow::list(arrow::field("item", arrow::int32())))});
 
     auto predicate = PredicateBuilder::GreaterOrEqual(
-        /*field_index=*/0, /*field_name=*/"id", FieldType::INT, Literal(50));
+        /*field_index=*/0, /*field_name=*/"id", FieldType::INT, Literal(70));
 
     std::shared_ptr<arrow::ChunkedArray> result;
     ReadWithPredicateImpl(file_name, read_schema, predicate, &result);
@@ -1063,11 +1063,11 @@ TEST_F(PageFilteredRowGroupReaderTest, NestedListColumnPageFilter) {
     ASSERT_TRUE(expected->Equals(result->chunk(0)));
 }
 
-/// Test: page-level filtering on a file with a map column.
+/// Test: rowgroup filtering on a file with a map column.
 ///
 /// Schema: { id: int32, props: map<utf8, int32> }
 /// 100 rows, 10 per page, 1 row group.
-/// Predicate: id >= 50 → pages 0-4 skipped, pages 5-9 read → 50 rows expected.
+/// Predicate: id >= 70 → row groups 0 skipped, row groups 1 read → 50 rows expected.
 TEST_F(PageFilteredRowGroupReaderTest, NestedMapColumnPageFilter) {
     std::string file_name = dir_->Str() + "/nested_map_filter.parquet";
     auto data = MakeMapColumnData(100);
@@ -1078,7 +1078,7 @@ TEST_F(PageFilteredRowGroupReaderTest, NestedMapColumnPageFilter) {
                        arrow::field("props", arrow::map(arrow::utf8(), arrow::int32()))});
 
     auto predicate = PredicateBuilder::GreaterOrEqual(
-        /*field_index=*/0, /*field_name=*/"id", FieldType::INT, Literal(50));
+        /*field_index=*/0, /*field_name=*/"id", FieldType::INT, Literal(70));
 
     std::shared_ptr<arrow::ChunkedArray> result;
     ReadWithPredicateImpl(file_name, read_schema, predicate, &result);
@@ -1091,11 +1091,11 @@ TEST_F(PageFilteredRowGroupReaderTest, NestedMapColumnPageFilter) {
     ASSERT_TRUE(expected->Equals(result->chunk(0)));
 }
 
-/// Test: page-level filtering with multiple adjacent nested columns (struct + list).
+/// Test: rowgroup-level filtering with multiple adjacent nested columns (struct + list).
 ///
 /// Schema: { id: int32, info: struct<x: int32, y: int32>, tags: list<item: int32> }
 /// This tests the boundary handling when two nested fields are adjacent in the schema.
-/// Predicate: id >= 50.
+/// Predicate: id >= 70 → row groups 0 skipped, row groups 1 read → 50 rows expected.
 TEST_F(PageFilteredRowGroupReaderTest, MultipleAdjacentNestedColumns) {
     std::string file_name = dir_->Str() + "/multi_nested.parquet";
 
@@ -1135,7 +1135,7 @@ TEST_F(PageFilteredRowGroupReaderTest, MultipleAdjacentNestedColumns) {
 
     auto read_schema = arrow::schema({field_id, field_info, field_tags});
     auto predicate = PredicateBuilder::GreaterOrEqual(
-        /*field_index=*/0, /*field_name=*/"id", FieldType::INT, Literal(50));
+        /*field_index=*/0, /*field_name=*/"id", FieldType::INT, Literal(70));
 
     std::shared_ptr<arrow::ChunkedArray> result;
     ReadWithPredicateImpl(file_name, read_schema, predicate, &result);
diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.cpp b/src/paimon/format/parquet/parquet_file_batch_reader.cpp
index 701e420db..121f65d9b 100644
--- a/src/paimon/format/parquet/parquet_file_batch_reader.cpp
+++ b/src/paimon/format/parquet/parquet_file_batch_reader.cpp
@@ -132,8 +132,10 @@ Status ParquetFileBatchReader::SetReadSchema(
         std::unordered_map<std::string, std::vector<int32_t>> field_index_map;
         bool has_nested_field = false;
         for (const auto& field : read_schema->fields()) {
-            has_nested_field =
-                has_nested_field || ArrowSchemaValidator::IsNestedType(field->type());
+            if (ArrowSchemaValidator::IsNestedType(field->type())) {
+                has_nested_field = true;
+                break;
+            }
         }
         int32_t i = 0;
         for (const auto& field : file_schema->fields()) {

From 9f0ff523f6f06de9586c2bc2418d124a5fba25d8 Mon Sep 17 00:00:00 2001
From: zhouhongfeng <zhouhongfeng.zhf@alibaba-inc.com>
Date: Wed, 17 Jun 2026 17:16:21 +0800
Subject: [PATCH 6/6] style: update comments and tests names

---
 .../page_filtered_row_group_reader_test.cpp   | 38 +++++++++----------
 1 file changed, 17 insertions(+), 21 deletions(-)

diff --git a/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp b/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp
index 221ed7569..a3cb98691 100644
--- a/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp
+++ b/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp
@@ -911,10 +911,10 @@ static std::shared_ptr<arrow::StructArray> MakeNestedStructData(int32_t num_rows
 ///
 /// Schema: { id: int32, info: struct<x: int32, y: int32> }
 /// Parquet leaf columns: [id=0, info.x=1, info.y=2]
-/// 100 rows, 10 per page, 1 row group.
+/// 100 rows, 10 per page, 2 row groups.
 /// Predicate: id >= 70 → row groups 0 skipped, row groups 1 read → 50 rows expected.
 /// The read schema requests both "id" and "info" columns.
-TEST_F(PageFilteredRowGroupReaderTest, NestedStructColumnPageFilter) {
+TEST_F(PageFilteredRowGroupReaderTest, NestedStructColumnRowGroupFilter) {
     std::string file_name = dir_->Str() + "/nested_struct_filter.parquet";
     auto data = MakeNestedStructData(100);
     WriteTestFile(file_name, data, /*write_batch_size=*/10, /*max_row_group_length=*/50);
@@ -939,15 +939,15 @@ TEST_F(PageFilteredRowGroupReaderTest, NestedStructColumnPageFilter) {
     ASSERT_TRUE(expected->Equals(result->chunk(0)));
 }
 
-/// Test: rowgroup-level filtering reading the nested struct column along with the predicate column.
+/// Test: Page-level filtering reading the nested struct column along with the predicate column.
 ///
 /// This verifies that when reading a subset of columns that includes a nested column
 /// and the predicate column, the schema mapping and column assembly work correctly.
 ///
 /// Schema: { id: int32, info: struct<x: int32, y: int32> }
 /// Read schema: { id: int32, info: struct<x: int32, y: int32> }
-/// Predicate on "id": id >= 50.
-TEST_F(PageFilteredRowGroupReaderTest, NestedStructColumnOnlyReadNestedField) {
+/// Predicate on "id": id >= 70.
+TEST_F(PageFilteredRowGroupReaderTest, NestedStructColumnOnlyReadIdField) {
     std::string file_name = dir_->Str() + "/nested_struct_only_nested.parquet";
     auto data = MakeNestedStructData(100);
     WriteTestFile(file_name, data, /*write_batch_size=*/10, /*max_row_group_length=*/50);
@@ -956,8 +956,8 @@ TEST_F(PageFilteredRowGroupReaderTest, NestedStructColumnOnlyReadNestedField) {
     auto field_x = arrow::field("x", arrow::int32());
     auto field_y = arrow::field("y", arrow::int32());
     auto field_info = arrow::field("info", arrow::struct_({field_x, field_y}));
-    // Read both "id" (needed for predicate evaluation) and "info" columns
-    auto read_schema = arrow::schema({field_id, field_info});
+    // Read "id" column only
+    auto read_schema = arrow::schema({field_id});
 
     // Predicate is on "id" (field_index=0 in file schema)
     auto predicate = PredicateBuilder::GreaterOrEqual(
@@ -966,17 +966,13 @@ TEST_F(PageFilteredRowGroupReaderTest, NestedStructColumnOnlyReadNestedField) {
     std::shared_ptr<arrow::ChunkedArray> result;
     ReadWithPredicateImpl(file_name, read_schema, predicate, &result);
 
-    // Should get rows 50-99 = 50 rows
+    // Should get rows 70-99 = 30 rows
     ASSERT_TRUE(result);
-    ASSERT_EQ(50, result->length());
+    ASSERT_EQ(30, result->length());
 
-    // Build expected: "id" and "info" fields from rows 50-99
-    auto sliced = std::dynamic_pointer_cast<arrow::StructArray>(data->Slice(50, 50));
-    ASSERT_TRUE(sliced);
-    auto expected =
-        arrow::StructArray::Make({sliced->field(0), sliced->field(1)}, {field_id, field_info})
-            .ValueOrDie();
-    ASSERT_TRUE(expected->Equals(result->chunk(0)));
+    auto result_struct = std::dynamic_pointer_cast<arrow::StructArray>(result->chunk(0));
+    ASSERT_TRUE(result_struct);
+    ASSERT_TRUE(data->field(0)->Slice(70, 30)->Equals(result_struct->field(0)));
 }
 
 /// Helper: build a StructArray with an int32 "id" column and a list<int32> "tags" column.
@@ -1038,9 +1034,9 @@ static std::shared_ptr<arrow::StructArray> MakeMapColumnData(int32_t num_rows) {
 /// Test: rowgroup-level filtering on a file with a list column.
 ///
 /// Schema: { id: int32, tags: list<item: int32> }
-/// 100 rows, 10 per page, 1 row group.
-/// Predicate: id >= 50 → row groups 0 skipped, row groups 1 read → 50 rows expected.
-TEST_F(PageFilteredRowGroupReaderTest, NestedListColumnPageFilter) {
+/// 100 rows, 10 per page, 2 row groups.
+/// Predicate: id >= 70 → row groups 0 skipped, row groups 1 read → 50 rows expected.
+TEST_F(PageFilteredRowGroupReaderTest, NestedListColumnRowGroupFilter) {
     std::string file_name = dir_->Str() + "/nested_list_filter.parquet";
     auto data = MakeListColumnData(100);
     WriteTestFile(file_name, data, /*write_batch_size=*/10, /*max_row_group_length=*/50);
@@ -1066,9 +1062,9 @@ TEST_F(PageFilteredRowGroupReaderTest, NestedListColumnPageFilter) {
 /// Test: rowgroup filtering on a file with a map column.
 ///
 /// Schema: { id: int32, props: map<utf8, int32> }
-/// 100 rows, 10 per page, 1 row group.
+/// 100 rows, 10 per page, 2 row groups.
 /// Predicate: id >= 70 → row groups 0 skipped, row groups 1 read → 50 rows expected.
-TEST_F(PageFilteredRowGroupReaderTest, NestedMapColumnPageFilter) {
+TEST_F(PageFilteredRowGroupReaderTest, NestedMapColumnRowGroupFilter) {
     std::string file_name = dir_->Str() + "/nested_map_filter.parquet";
     auto data = MakeMapColumnData(100);
     WriteTestFile(file_name, data, /*write_batch_size=*/10, /*max_row_group_length=*/50);