From 57036d753f988e12b9e7c813188b1c25f8228916 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Wed, 10 Jun 2026 22:26:14 -0700 Subject: [PATCH] API, Parquet: Map geometry and geography to Parquet logical types Map Iceberg geometry and geography to and from Parquet's geometry / geography logical type annotations on a BINARY column, passing the resolved CRS and edge algorithm through directly (Iceberg and Parquet use the same algorithm names; the read side resolves names with EdgeAlgorithm.fromName, the same conversion used when parsing geography type strings, and maps unset annotation parameters to the Iceberg defaults). To make the plain geography type round-trip through writers that omit default parameters (an unset Parquet crs / algorithm defaults to OGC:CRS84 / SPHERICAL), GeographyType now treats an explicit default algorithm as equal to an omitted one: equals, hashCode, and toString use the resolved getters crs() / algorithm() instead of the raw nullable fields, matching how the CRS already resolves through its getter. Schema mapping only; the value read/write path and metrics handling are follow-ups. Co-authored-by: Isaac --- .../java/org/apache/iceberg/types/Types.java | 9 ++- .../org/apache/iceberg/types/TestTypes.java | 26 ++++++ .../iceberg/parquet/MessageTypeToType.java | 21 +++++ .../iceberg/parquet/TypeToMessageType.java | 26 ++++++ .../parquet/TestParquetSchemaUtil.java | 81 +++++++++++++++++++ 5 files changed, 159 insertions(+), 4 deletions(-) diff --git a/api/src/main/java/org/apache/iceberg/types/Types.java b/api/src/main/java/org/apache/iceberg/types/Types.java index ec6076b04fa0..24077b13e892 100644 --- a/api/src/main/java/org/apache/iceberg/types/Types.java +++ b/api/src/main/java/org/apache/iceberg/types/Types.java @@ -680,18 +680,19 @@ public boolean equals(Object o) { } GeographyType that = (GeographyType) o; - return Objects.equals(crs, that.crs) && Objects.equals(algorithm, that.algorithm); + // compare the resolved CRS and algorithm so an explicit default is equal to an omitted one + return crs().equals(that.crs()) && algorithm() == that.algorithm(); } @Override public int hashCode() { - return Objects.hash(GeographyType.class, crs, algorithm); + return Objects.hash(GeographyType.class, crs(), algorithm()); } @Override public String toString() { - if (algorithm != null) { - return String.format("geography(%s, %s)", crs != null ? crs : DEFAULT_CRS, algorithm); + if (algorithm() != DEFAULT_ALGORITHM) { + return String.format("geography(%s, %s)", crs(), algorithm()); } else if (crs != null) { return String.format("geography(%s)", crs); } else { diff --git a/api/src/test/java/org/apache/iceberg/types/TestTypes.java b/api/src/test/java/org/apache/iceberg/types/TestTypes.java index fa5ed4304d3c..c765790569c8 100644 --- a/api/src/test/java/org/apache/iceberg/types/TestTypes.java +++ b/api/src/test/java/org/apache/iceberg/types/TestTypes.java @@ -172,6 +172,32 @@ public void testGeospatialTypeToString() { .isEqualTo("geography(srid:4326, karney)"); assertThat(Types.GeographyType.of(null, EdgeAlgorithm.KARNEY).toString()) .isEqualTo("geography(OGC:CRS84, karney)"); + assertThat( + Types.GeographyType.of(Types.GeographyType.DEFAULT_CRS, EdgeAlgorithm.SPHERICAL) + .toString()) + .isEqualTo("geography"); + assertThat(Types.GeographyType.of("srid:4326", EdgeAlgorithm.SPHERICAL).toString()) + .isEqualTo("geography(srid:4326)"); + } + + @Test + public void testGeospatialTypeDefaultNormalization() { + // the default CRS and edge algorithm normalize so that equivalent type specs are equal + assertThat(Types.GeometryType.of(Types.GeometryType.DEFAULT_CRS)) + .isEqualTo(Types.GeometryType.crs84()); + assertThat(Types.GeographyType.of(Types.GeographyType.DEFAULT_CRS)) + .isEqualTo(Types.GeographyType.crs84()); + assertThat(Types.GeographyType.of(Types.GeographyType.DEFAULT_CRS, EdgeAlgorithm.SPHERICAL)) + .isEqualTo(Types.GeographyType.crs84()) + .hasSameHashCodeAs(Types.GeographyType.crs84()); + assertThat(Types.GeographyType.of("srid:4326", EdgeAlgorithm.SPHERICAL)) + .isEqualTo(Types.GeographyType.of("srid:4326")) + .hasSameHashCodeAs(Types.GeographyType.of("srid:4326")); + assertThat(Types.GeographyType.of("srid:4326", EdgeAlgorithm.SPHERICAL).algorithm()) + .isEqualTo(EdgeAlgorithm.SPHERICAL); + assertThat(Types.GeographyType.of("srid:4326", EdgeAlgorithm.KARNEY)) + .isNotEqualTo(Types.GeographyType.of("srid:4326")); + assertThat(Types.GeographyType.of("srid:4326")).isNotEqualTo(Types.GeographyType.crs84()); } @Test diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/MessageTypeToType.java b/parquet/src/main/java/org/apache/iceberg/parquet/MessageTypeToType.java index 98023bafcb8f..6b985a8bab43 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/MessageTypeToType.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/MessageTypeToType.java @@ -29,9 +29,11 @@ import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.EdgeAlgorithm; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; import org.apache.iceberg.types.Types.TimestampType; +import org.apache.parquet.column.schema.EdgeInterpolationAlgorithm; import org.apache.parquet.schema.GroupType; import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.MessageType; @@ -254,6 +256,25 @@ public Optional visit(LogicalTypeAnnotation.JsonLogicalTypeAnnotation json public Optional visit(LogicalTypeAnnotation.BsonLogicalTypeAnnotation bsonType) { return Optional.of(Types.BinaryType.get()); } + + @Override + public Optional visit(LogicalTypeAnnotation.GeometryLogicalTypeAnnotation geometryType) { + String crs = geometryType.getCrs(); + return Optional.of(Types.GeometryType.of(crs != null ? crs : Types.GeometryType.DEFAULT_CRS)); + } + + @Override + public Optional visit( + LogicalTypeAnnotation.GeographyLogicalTypeAnnotation geographyType) { + String crs = geographyType.getCrs(); + EdgeInterpolationAlgorithm algorithm = geographyType.getAlgorithm(); + return Optional.of( + Types.GeographyType.of( + crs != null ? crs : Types.GeographyType.DEFAULT_CRS, + algorithm != null + ? EdgeAlgorithm.fromName(algorithm.name()) + : Types.GeographyType.DEFAULT_ALGORITHM)); + } } private void addAlias(String name, int fieldId) { diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/TypeToMessageType.java b/parquet/src/main/java/org/apache/iceberg/parquet/TypeToMessageType.java index d648cbf0694b..f05001f5f43d 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/TypeToMessageType.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/TypeToMessageType.java @@ -30,12 +30,15 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.avro.AvroSchemaUtil; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.types.EdgeAlgorithm; import org.apache.iceberg.types.Type.NestedType; import org.apache.iceberg.types.Type.PrimitiveType; import org.apache.iceberg.types.Type.TypeID; import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.types.Types.DecimalType; import org.apache.iceberg.types.Types.FixedType; +import org.apache.iceberg.types.Types.GeographyType; +import org.apache.iceberg.types.Types.GeometryType; import org.apache.iceberg.types.Types.ListType; import org.apache.iceberg.types.Types.MapType; import org.apache.iceberg.types.Types.NestedField; @@ -43,6 +46,7 @@ import org.apache.iceberg.types.Types.TimestampNanoType; import org.apache.iceberg.types.Types.TimestampType; import org.apache.iceberg.variants.Variant; +import org.apache.parquet.column.schema.EdgeInterpolationAlgorithm; import org.apache.parquet.schema.GroupType; import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit; @@ -280,11 +284,33 @@ public Type primitive( .id(id) .named(name); + case GEOMETRY: + GeometryType geometry = (GeometryType) primitive; + return Types.primitive(BINARY, repetition) + .as(LogicalTypeAnnotation.geometryType(geometry.crs())) + .id(id) + .named(name); + + case GEOGRAPHY: + GeographyType geography = (GeographyType) primitive; + return Types.primitive(BINARY, repetition) + .as( + LogicalTypeAnnotation.geographyType( + geography.crs(), toParquet(geography.algorithm()))) + .id(id) + .named(name); + default: throw new UnsupportedOperationException("Unsupported type for Parquet: " + primitive); } } + private static EdgeInterpolationAlgorithm toParquet(EdgeAlgorithm algorithm) { + // Iceberg and Parquet use the same algorithm names (SPHERICAL, VINCENTY, THOMAS, ANDOYER, + // KARNEY) so the algorithm is mapped by name + return EdgeInterpolationAlgorithm.valueOf(algorithm.name()); + } + private static LogicalTypeAnnotation decimalAnnotation(int precision, int scale) { return LogicalTypeAnnotation.decimalType(scale, precision); } diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetSchemaUtil.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetSchemaUtil.java index 51bfc1e811f4..b5163cb175b9 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetSchemaUtil.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetSchemaUtil.java @@ -22,13 +22,17 @@ import static org.apache.iceberg.types.Types.NestedField.required; import static org.assertj.core.api.Assertions.assertThat; +import java.util.List; import java.util.concurrent.atomic.AtomicInteger; import org.apache.iceberg.Schema; import org.apache.iceberg.mapping.MappingUtil; import org.apache.iceberg.mapping.NameMapping; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.EdgeAlgorithm; import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.types.Types; import org.apache.iceberg.variants.Variant; +import org.apache.parquet.column.schema.EdgeInterpolationAlgorithm; import org.apache.parquet.schema.GroupType; import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.MessageType; @@ -338,6 +342,83 @@ public void testVariantTypesWithoutAssigningIds() { .isEqualTo(expectedSchema.asStruct()); } + @Test + public void testGeospatialTypeRoundTrip() { + List fields = + Lists.newArrayList( + required(1, "geom_default", Types.GeometryType.crs84()), + optional(2, "geom_3857", Types.GeometryType.of("EPSG:3857")), + required(3, "geog_default", Types.GeographyType.crs84())); + // cover the by-name algorithm mapping for every edge algorithm in both directions + int nextId = fields.size() + 1; + for (EdgeAlgorithm algorithm : EdgeAlgorithm.values()) { + fields.add( + optional(nextId++, "geog_" + algorithm, Types.GeographyType.of("EPSG:4326", algorithm))); + } + + Schema schema = new Schema(fields); + MessageType messageType = ParquetSchemaUtil.convert(schema, "geo_table"); + Schema actualSchema = ParquetSchemaUtil.convert(messageType); + assertThat(actualSchema.asStruct()) + .as("Schema must round-trip through Parquet geometry/geography logical types") + .isEqualTo(schema.asStruct()); + } + + @Test + public void testGeospatialAnnotationsWithOmittedParameters() { + // unset CRS and algorithm parameters must map to Iceberg's defaults, and explicit values + // (including explicit default values) must be preserved + MessageType messageType = + org.apache.parquet.schema.Types.buildMessage() + .required(PrimitiveTypeName.BINARY) + .as(LogicalTypeAnnotation.geometryType(null)) + .id(1) + .named("geom_bare") + .optional(PrimitiveTypeName.BINARY) + .as(LogicalTypeAnnotation.geometryType("OGC:CRS84")) + .id(2) + .named("geom_explicit_default") + .required(PrimitiveTypeName.BINARY) + .as(LogicalTypeAnnotation.geographyType(null, null)) + .id(3) + .named("geog_bare") + .optional(PrimitiveTypeName.BINARY) + .as(LogicalTypeAnnotation.geographyType("EPSG:4326", null)) + .id(4) + .named("geog_crs_only") + .optional(PrimitiveTypeName.BINARY) + .as(LogicalTypeAnnotation.geographyType(null, EdgeInterpolationAlgorithm.ANDOYER)) + .id(5) + .named("geog_algorithm_only") + .optional(PrimitiveTypeName.BINARY) + .as( + LogicalTypeAnnotation.geographyType( + "EPSG:4326", EdgeInterpolationAlgorithm.SPHERICAL)) + .id(6) + .named("geog_explicit_spherical") + .named("geo_table"); + + Schema expectedSchema = + new Schema( + required(1, "geom_bare", Types.GeometryType.crs84()), + optional(2, "geom_explicit_default", Types.GeometryType.crs84()), + required(3, "geog_bare", Types.GeographyType.crs84()), + optional(4, "geog_crs_only", Types.GeographyType.of("EPSG:4326")), + optional( + 5, + "geog_algorithm_only", + Types.GeographyType.of(Types.GeographyType.DEFAULT_CRS, EdgeAlgorithm.ANDOYER)), + optional( + 6, + "geog_explicit_spherical", + Types.GeographyType.of("EPSG:4326", EdgeAlgorithm.SPHERICAL))); + + Schema actualSchema = ParquetSchemaUtil.convert(messageType); + assertThat(actualSchema.asStruct()) + .as("Geometry and geography annotations must convert to the expected Iceberg types") + .isEqualTo(expectedSchema.asStruct()); + } + @Test public void testSchemaConversionForHiveStyleLists() { String parquetSchemaString =