From c3547384af7658344b129d35ba6a8693eec9999a Mon Sep 17 00:00:00 2001 From: Daniil Ivanik <61067749+divanik@users.noreply.github.com> Date: Tue, 24 Mar 2026 13:56:03 +0000 Subject: [PATCH 1/4] Merge pull request #100420 from ClickHouse/divanik/rerevert_spark_azure_fixes Resolve problems with paths and compatibility problems with Spark in Azure (v2) --- ci/docker/integration/runner/Dockerfile | 2 + src/Databases/DataLake/GlueCatalog.cpp | 28 +- .../IO/ReadBufferFromAzureBlobStorage.cpp | 2 +- src/Interpreters/IcebergMetadataLog.cpp | 4 +- src/Interpreters/IcebergMetadataLog.h | 3 +- .../Common/AvroForIcebergDeserializer.cpp | 22 +- .../Common/AvroForIcebergDeserializer.h | 5 +- .../DataLakes/Iceberg/Compaction.cpp | 84 +++-- .../DataLakes/Iceberg/FileNamesGenerator.cpp | 106 ++---- .../DataLakes/Iceberg/FileNamesGenerator.h | 60 ++-- .../Iceberg/IcebergDataObjectInfo.cpp | 20 +- .../DataLakes/Iceberg/IcebergDataObjectInfo.h | 9 +- .../DataLakes/Iceberg/IcebergIterator.cpp | 48 +-- .../DataLakes/Iceberg/IcebergMetadata.cpp | 41 +-- .../Iceberg/IcebergMetadataFilesCache.h | 4 +- .../DataLakes/Iceberg/IcebergPath.cpp | 95 +++++ .../DataLakes/Iceberg/IcebergPath.h | 119 +++++++ .../DataLakes/Iceberg/IcebergWrites.cpp | 104 +++--- .../DataLakes/Iceberg/IcebergWrites.h | 10 +- .../DataLakes/Iceberg/ManifestFile.h | 19 +- .../Iceberg/ManifestFileIterator.cpp | 35 +- .../DataLakes/Iceberg/ManifestFileIterator.h | 21 +- .../DataLakes/Iceberg/MetadataGenerator.cpp | 10 +- .../DataLakes/Iceberg/MetadataGenerator.h | 15 +- .../DataLakes/Iceberg/MultipleFileWriter.cpp | 36 +- .../DataLakes/Iceberg/MultipleFileWriter.h | 7 +- .../DataLakes/Iceberg/Mutations.cpp | 322 ++++++++--------- .../DataLakes/Iceberg/Mutations.h | 6 +- .../Iceberg/PersistentTableComponents.h | 2 + .../Iceberg/PositionDeleteTransform.cpp | 6 +- .../DataLakes/Iceberg/Snapshot.h | 2 +- .../Iceberg/StatelessMetadataFileGetter.cpp | 26 +- .../Iceberg/StatelessMetadataFileGetter.h | 4 +- .../ObjectStorage/DataLakes/Iceberg/Utils.cpp | 200 +++++------ .../ObjectStorage/DataLakes/Iceberg/Utils.h | 14 +- .../DataLakes/Paimon/PaimonClient.cpp | 4 +- .../StorageObjectStorageSource.cpp | 8 + src/Storages/VirtualColumnUtils.cpp | 18 +- src/Storages/VirtualColumnUtils.h | 3 + .../configs/config.d/query_log.xml | 6 - .../conftest.py | 8 - .../__init__.py | 0 .../configs/config.d/named_collections.xml | 8 + .../configs/users.d/users.xml | 9 + .../conftest.py | 132 +++++++ .../test_interoperability.py | 270 +++++++++++++++ .../__init__.py | 0 .../configs/config.d/cluster.xml | 8 - .../configs/config.d/named_collections.xml | 6 + .../configs/users.d/users.xml | 9 + .../conftest.py | 160 +++++++++ .../test_interoperability.py | 326 ++++++++++++++++++ .../test_writes_create_version_hint.py | 6 +- 53 files changed, 1784 insertions(+), 688 deletions(-) create mode 100644 src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergPath.cpp create mode 100644 src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergPath.h delete mode 100644 tests/integration/test_storage_iceberg_concurrent/configs/config.d/query_log.xml create mode 100644 tests/integration/test_storage_iceberg_interoperability_azure/__init__.py create mode 100644 tests/integration/test_storage_iceberg_interoperability_azure/configs/config.d/named_collections.xml create mode 100644 tests/integration/test_storage_iceberg_interoperability_azure/configs/users.d/users.xml create mode 100644 tests/integration/test_storage_iceberg_interoperability_azure/conftest.py create mode 100644 
tests/integration/test_storage_iceberg_interoperability_azure/test_interoperability.py create mode 100644 tests/integration/test_storage_iceberg_interoperability_local/__init__.py rename tests/integration/{test_storage_iceberg_concurrent => test_storage_iceberg_interoperability_local}/configs/config.d/cluster.xml (68%) create mode 100644 tests/integration/test_storage_iceberg_interoperability_local/configs/config.d/named_collections.xml create mode 100644 tests/integration/test_storage_iceberg_interoperability_local/configs/users.d/users.xml create mode 100644 tests/integration/test_storage_iceberg_interoperability_local/conftest.py create mode 100644 tests/integration/test_storage_iceberg_interoperability_local/test_interoperability.py diff --git a/ci/docker/integration/runner/Dockerfile b/ci/docker/integration/runner/Dockerfile index 6b2c679699cb..5923adc6cda5 100644 --- a/ci/docker/integration/runner/Dockerfile +++ b/ci/docker/integration/runner/Dockerfile @@ -80,6 +80,8 @@ org.apache.hudi:hudi-spark3.5-bundle_2.12:1.0.1,\ org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.4.3,\ org.apache.hadoop:hadoop-aws:3.3.4,\ com.amazonaws:aws-java-sdk-bundle:1.12.262,\ +org.apache.hadoop:hadoop-azure:3.3.4,\ +com.microsoft.azure:azure-storage:8.6.6,\ org.apache.spark:spark-avro_2.12:3.5.1"\ && /spark-3.5.5-bin-hadoop3/bin/spark-shell --packages "$packages" \ && find /root/.ivy2/ -name '*.jar' -exec ln -sf {} /spark-3.5.5-bin-hadoop3/jars/ \; diff --git a/src/Databases/DataLake/GlueCatalog.cpp b/src/Databases/DataLake/GlueCatalog.cpp index 426ea0f688ab..e9cc1fb52f80 100644 --- a/src/Databases/DataLake/GlueCatalog.cpp +++ b/src/Databases/DataLake/GlueCatalog.cpp @@ -38,19 +38,20 @@ #include -#include -#include -#include +#include +#include #include #include -#include -#include -#include -#include -#include -#include +#include +#include +#include #include #include +#include +#include +#include +#include +#include namespace DB::ErrorCodes { @@ -544,14 +545,7 @@ String GlueCatalog::resolveMetadataPathFromTableLocation(const String & table_lo try { auto [metadata_version, metadata_path, compression_method] = DB::Iceberg::getLatestOrExplicitMetadataFileAndVersion( - object_storage, - table_path, - *storage_settings, - nullptr, - getContext(), - log.get(), - std::nullopt - ); + object_storage, table_path, *storage_settings, nullptr, getContext(), log.get(), std::nullopt, DB::CompressionMethod::None); LOG_TRACE(log, "Resolved metadata path '{}' (version {}) for table location '{}'", metadata_path, metadata_version, table_location); diff --git a/src/Disks/IO/ReadBufferFromAzureBlobStorage.cpp b/src/Disks/IO/ReadBufferFromAzureBlobStorage.cpp index 70088e13bcd3..7d0982e26a3f 100644 --- a/src/Disks/IO/ReadBufferFromAzureBlobStorage.cpp +++ b/src/Disks/IO/ReadBufferFromAzureBlobStorage.cpp @@ -380,7 +380,7 @@ size_t ReadBufferFromAzureBlobStorage::readBigAt(char * to, size_t n, size_t ran ObjectMetadata ReadBufferFromAzureBlobStorage::getObjectMetadataFromTheLastRequest() const { - if (last_object_metadata.get()->has_value()) + if (!last_object_metadata.get()->has_value()) throw Exception(ErrorCodes::NOT_INITIALIZED, "No Azure object metadata available because there were no successful requests"); return last_object_metadata.get()->value(); diff --git a/src/Interpreters/IcebergMetadataLog.cpp b/src/Interpreters/IcebergMetadataLog.cpp index 2df936e77253..46ad24e53450 100644 --- a/src/Interpreters/IcebergMetadataLog.cpp +++ b/src/Interpreters/IcebergMetadataLog.cpp @@ -83,7 +83,7 @@ void insertRowToLogTable( 
String row, IcebergMetadataLogLevel row_log_level, const String & table_path, - const String & file_path, + const Iceberg::IcebergPathFromMetadata & file_path, std::optional row_in_file, std::optional pruning_status) { @@ -107,7 +107,7 @@ void insertRowToLogTable( .query_id = local_context->getCurrentQueryId(), .content_type = row_log_level, .table_path = table_path, - .file_path = file_path, + .file_path = file_path.serialize(), .metadata_content = row, .row_in_file = row_in_file, .pruning_status = pruning_status}); diff --git a/src/Interpreters/IcebergMetadataLog.h b/src/Interpreters/IcebergMetadataLog.h index 0a86cf921083..b850fb0a8500 100644 --- a/src/Interpreters/IcebergMetadataLog.h +++ b/src/Interpreters/IcebergMetadataLog.h @@ -3,6 +3,7 @@ #include #include #include +#include #include namespace DB @@ -31,7 +32,7 @@ void insertRowToLogTable( String row, IcebergMetadataLogLevel row_log_level, const String & table_path, - const String & file_path, + const Iceberg::IcebergPathFromMetadata & file_path, std::optional row_in_file, std::optional pruning_status); diff --git a/src/Storages/ObjectStorage/DataLakes/Common/AvroForIcebergDeserializer.cpp b/src/Storages/ObjectStorage/DataLakes/Common/AvroForIcebergDeserializer.cpp index 3c07bbb8ab4f..b8652ae703f4 100644 --- a/src/Storages/ObjectStorage/DataLakes/Common/AvroForIcebergDeserializer.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Common/AvroForIcebergDeserializer.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -28,7 +29,9 @@ namespace DB::Iceberg using namespace DB; AvroForIcebergDeserializer::AvroForIcebergDeserializer( - std::unique_ptr buffer_, const std::string & manifest_file_path_, const DB::FormatSettings & format_settings) + std::unique_ptr buffer_, + const IcebergPathFromMetadata & manifest_file_path_, + const DB::FormatSettings & format_settings) try : buffer(std::move(buffer_)) , manifest_file_path(manifest_file_path_) @@ -146,7 +149,8 @@ ParsedManifestFileEntryPtr AvroForIcebergDeserializer::createParsedManifestFileE } - const auto file_path_key = getValueFromRowByName(row_index, c_data_file_file_path, TypeIndex::String).safeGet(); + const auto file_path_key = IcebergPathFromMetadata::deserialize( + getValueFromRowByName(row_index, c_data_file_file_path, TypeIndex::String).safeGet()); /// NOTE: This is weird, because in manifest file partition looks like this: /// { /// ... 
@@ -247,16 +251,18 @@ ParsedManifestFileEntryPtr AvroForIcebergDeserializer::createParsedManifestFileE } case FileContentType::POSITION_DELETE: { /// reference_file_path can be absent in schema for some reason, though it is present in specification: https://iceberg.apache.org/spec/#manifests - std::optional lower_reference_data_file_path = std::nullopt; - std::optional upper_reference_data_file_path = std::nullopt; + std::optional lower_reference_data_file_path; + std::optional upper_reference_data_file_path; bool bounds_set_by_referenced_data_file = false; if (hasPath(c_data_file_referenced_data_file)) { Field reference_file_path_field = getValueFromRowByName(row_index, c_data_file_referenced_data_file); if (!reference_file_path_field.isNull()) { - lower_reference_data_file_path = reference_file_path_field.safeGet(); - upper_reference_data_file_path = reference_file_path_field.safeGet(); + lower_reference_data_file_path.emplace( + Iceberg::IcebergPathFromMetadata::deserialize(reference_file_path_field.safeGet())); + upper_reference_data_file_path.emplace( + Iceberg::IcebergPathFromMetadata::deserialize(reference_file_path_field.safeGet())); bounds_set_by_referenced_data_file = true; } } @@ -267,9 +273,9 @@ ParsedManifestFileEntryPtr AvroForIcebergDeserializer::createParsedManifestFileE { auto & [lower, upper] = it->second; if (!lower.isNull()) - lower_reference_data_file_path = lower.safeGet(); + lower_reference_data_file_path.emplace(Iceberg::IcebergPathFromMetadata::deserialize(lower.safeGet())); if (!upper.isNull()) - upper_reference_data_file_path = upper.safeGet(); + upper_reference_data_file_path.emplace(Iceberg::IcebergPathFromMetadata::deserialize(upper.safeGet())); } } return std::make_shared( diff --git a/src/Storages/ObjectStorage/DataLakes/Common/AvroForIcebergDeserializer.h b/src/Storages/ObjectStorage/DataLakes/Common/AvroForIcebergDeserializer.h index 31719fb7ca05..9b1e080cd890 100644 --- a/src/Storages/ObjectStorage/DataLakes/Common/AvroForIcebergDeserializer.h +++ b/src/Storages/ObjectStorage/DataLakes/Common/AvroForIcebergDeserializer.h @@ -9,6 +9,7 @@ #include #include #include +#include #include @@ -36,7 +37,7 @@ class AvroForIcebergDeserializer { private: std::unique_ptr buffer; - std::string manifest_file_path; + Iceberg::IcebergPathFromMetadata manifest_file_path; DB::ColumnPtr parsed_column; std::shared_ptr parsed_column_data_type; mutable std::optional cache_parsed_columns TSA_GUARDED_BY(cache_mutex); @@ -61,7 +62,7 @@ class AvroForIcebergDeserializer public: AvroForIcebergDeserializer( std::unique_ptr buffer_, - const std::string & manifest_file_path_, + const Iceberg::IcebergPathFromMetadata & manifest_file_path_, const DB::FormatSettings & format_settings); size_t rows() const; diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/Compaction.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/Compaction.cpp index a41006d83afa..4e0c2380b66d 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/Compaction.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/Compaction.cpp @@ -44,11 +44,11 @@ struct ManifestFilePlan { } - String path; - std::vector manifest_lists_path; + Iceberg::IcebergPathFromMetadata path; + std::vector manifest_lists_path; DataFileStatistics statistics; - FileNamesGenerator::Result patched_path; + Iceberg::IcebergPathFromMetadata patched_path; }; struct DataFilePlan @@ -56,7 +56,7 @@ struct DataFilePlan IcebergDataObjectInfoPtr data_object_info; std::shared_ptr manifest_list; - FileNamesGenerator::Result patched_path; + 
Iceberg::IcebergPathFromMetadata patched_path; UInt64 new_records_count = 0; }; @@ -68,10 +68,10 @@ struct Plan using PartitionPlan = std::vector>; std::vector partitions; IcebergHistory history; - std::unordered_map manifest_file_to_first_snapshot; - std::unordered_map> manifest_list_to_manifest_files; + std::unordered_map manifest_file_to_first_snapshot; + std::unordered_map> manifest_list_to_manifest_files; std::unordered_map>> snapshot_id_to_data_files; - std::unordered_map> path_to_data_file; + std::unordered_map> path_to_data_file; FileNamesGenerator generator; Poco::JSON::Object::Ptr initial_metadata_object; @@ -120,8 +120,7 @@ Plan getPlan( LoggerPtr log = getLogger("IcebergCompaction::getPlan"); Plan plan; - plan.generator = FileNamesGenerator( - persistent_table_components.table_path, persistent_table_components.table_path, false, compression_method, write_format); + plan.generator = FileNamesGenerator(persistent_table_components.path_resolver.getTableLocation(), false, compression_method, write_format); const auto [metadata_version, metadata_file_path, _] = getLatestOrExplicitMetadataFileAndVersion( object_storage, @@ -130,7 +129,8 @@ Plan getPlan( persistent_table_components.metadata_cache, context, log.get(), - persistent_table_components.table_uuid); + persistent_table_components.table_uuid, + persistent_table_components.metadata_compression_method); Poco::JSON::Object::Ptr initial_metadata_object = getMetadataJSONObject(metadata_file_path, object_storage, persistent_table_components.metadata_cache, context, log, compression_method, persistent_table_components.table_uuid); @@ -152,7 +152,7 @@ Plan getPlan( plan.initial_metadata_object = initial_metadata_object; std::vector all_positional_delete_files; - std::unordered_map> manifest_files; + std::unordered_map> manifest_files; for (const auto & snapshot : snapshots_info) { auto manifest_list = getManifestList(object_storage, persistent_table_components, context, snapshot.manifest_list_path, log); @@ -179,7 +179,8 @@ Plan getPlan( if (plan.partitions.size() <= partition_index) plan.partitions.push_back({}); - IcebergDataObjectInfoPtr data_object_info = std::make_shared(data_file, 0); + IcebergDataObjectInfoPtr data_object_info = std::make_shared( + data_file, persistent_table_components.path_resolver.resolve(data_file->parsed_entry->file_path_key), 0); std::shared_ptr data_file_ptr; if (!plan.path_to_data_file.contains(manifest_file.manifest_file_path)) { @@ -209,7 +210,8 @@ Plan getPlan( for (auto & data_file : plan.partitions[partition_index]) { if (data_file->data_object_info->info.sequence_number <= delete_file->sequence_number) - data_file->data_object_info->addPositionDeleteObject(delete_file); + data_file->data_object_info->addPositionDeleteObject( + delete_file, persistent_table_components.path_resolver.resolve(delete_file->parsed_entry->file_path_key)); } } plan.history = std::move(snapshots_info); @@ -221,6 +223,7 @@ static void writeDataFiles( Plan & initial_plan, SharedHeader sample_block, ObjectStoragePtr object_storage, + const IcebergPathResolver & path_resolver, const std::optional & format_settings, ContextPtr context, const String & write_format, @@ -259,7 +262,7 @@ static void writeDataFiles( false); auto write_buffer = object_storage->writeObject( - StoredObject(data_file->patched_path.path_in_storage), + StoredObject(path_resolver.resolve(data_file->patched_path)), WriteMode::Rewrite, std::nullopt, DBMS_DEFAULT_BUFFER_SIZE, @@ -293,7 +296,7 @@ static void writeDataFiles( } void writeMetadataFiles( - Plan & 
plan, ObjectStoragePtr object_storage, ContextPtr context, SharedHeader sample_block_, String write_format, String table_path) + Plan & plan, const IcebergPathResolver & path_resolver, ObjectStoragePtr object_storage, ContextPtr context, SharedHeader sample_block_, String write_format, String table_path) { auto log = getLogger("IcebergCompaction"); @@ -314,7 +317,7 @@ void writeMetadataFiles( MetadataGenerator metadata_generator(metadata_object); std::vector new_snapshots; - auto generated_metadata_name = plan.generator.generateMetadataName(); + auto generated_metadata_info = plan.generator.generateMetadataPathWithInfo(); std::unordered_map snapshot_id_to_snapshot; std::unordered_map snapshot_id_to_records_count; @@ -332,7 +335,7 @@ void writeMetadataFiles( auto new_snapshot = metadata_generator.generateNextMetadata( plan.generator, - generated_metadata_name.path_in_metadata, + generated_metadata_info.path, history_record.parent_id, history_record.added_files, total_records_count, @@ -348,11 +351,11 @@ void writeMetadataFiles( } Poco::JSON::Object::Ptr initial_metadata_object = plan.initial_metadata_object; - std::unordered_map manifest_file_renamings; - std::unordered_map manifest_file_sizes; + std::unordered_map manifest_file_renamings; + std::unordered_map manifest_file_sizes; { - std::unordered_map, std::unordered_set> grouped_by_manifest_files_result; + std::unordered_map, std::unordered_set> grouped_by_manifest_files_result; std::unordered_map, size_t> grouped_by_manifest_files_partitions; std::unordered_map, size_t> partition_values; @@ -362,7 +365,7 @@ void writeMetadataFiles( for (const auto & data_file : partition) { grouped_by_manifest_files_partitions[data_file->manifest_list] = i; - grouped_by_manifest_files_result[data_file->manifest_list].insert(data_file->patched_path.path_in_metadata); + grouped_by_manifest_files_result[data_file->manifest_list].insert(data_file->patched_path); partition_values[data_file->manifest_list] = i; } } @@ -391,9 +394,9 @@ void writeMetadataFiles( for (auto & [manifest_entry, data_filenames] : grouped_by_manifest_files_result) { manifest_entry->patched_path = plan.generator.generateManifestEntryName(); - manifest_file_renamings[manifest_entry->path] = manifest_entry->patched_path.path_in_metadata; + manifest_file_renamings[manifest_entry->path] = manifest_entry->patched_path; auto buffer_manifest_entry = object_storage->writeObject( - StoredObject(manifest_entry->patched_path.path_in_storage), + StoredObject(path_resolver.resolve(manifest_entry->patched_path)), WriteMode::Rewrite, std::nullopt, DBMS_DEFAULT_BUFFER_SIZE, @@ -419,18 +422,25 @@ void writeMetadataFiles( *buffer_manifest_entry, Iceberg::FileContentType::DATA); - manifest_file_sizes[manifest_entry->patched_path.path_in_metadata] += buffer_manifest_entry->count(); buffer_manifest_entry->finalize(); + auto manifest_bytes = buffer_manifest_entry->count(); + if (manifest_bytes == 0) + { + auto file_metadata = object_storage->getObjectMetadata( + path_resolver.resolve(manifest_entry->patched_path), /*with_tags=*/ false); + manifest_bytes = file_metadata.size_bytes; + } + manifest_file_sizes[manifest_entry->patched_path] += manifest_bytes; } } - std::unordered_map manifest_list_renamings; + std::unordered_map manifest_list_renamings; for (size_t i = 0; i < plan.history.size(); ++i) { if (plan.history[i].added_files == 0) continue; - manifest_list_renamings[plan.history[i].manifest_list_path] = new_snapshots[i].metadata_path; + manifest_list_renamings[plan.history[i].manifest_list_path] = 
new_snapshots[i].manifest_list_path; } for (size_t i = 0; i < plan.history.size(); ++i) @@ -441,27 +451,32 @@ void writeMetadataFiles( auto initial_manifest_list_name = plan.history[i].manifest_list_path; auto initial_manifest_entries = plan.manifest_list_to_manifest_files[initial_manifest_list_name]; auto renamed_manifest_list = manifest_list_renamings[initial_manifest_list_name]; - std::vector renamed_manifest_entries; - Int32 total_manifest_file_sizes = 0; + std::vector renamed_manifest_entries; for (const auto & initial_manifest_entry : initial_manifest_entries) { auto renamed_manifest_entry = manifest_file_renamings[initial_manifest_entry]; if (!renamed_manifest_entry.empty()) { renamed_manifest_entries.push_back(renamed_manifest_entry); - total_manifest_file_sizes += manifest_file_sizes[renamed_manifest_entry]; } } + std::vector per_manifest_sizes; + for (const auto & entry : renamed_manifest_entries) + per_manifest_sizes.push_back(manifest_file_sizes[entry]); auto buffer_manifest_list = object_storage->writeObject( - StoredObject(renamed_manifest_list), WriteMode::Rewrite, std::nullopt, DBMS_DEFAULT_BUFFER_SIZE, context->getWriteSettings()); + StoredObject(path_resolver.resolve(renamed_manifest_list)), + WriteMode::Rewrite, + std::nullopt, + DBMS_DEFAULT_BUFFER_SIZE, + context->getWriteSettings()); generateManifestList( - plan.generator, + path_resolver, metadata_object, object_storage, context, renamed_manifest_entries, new_snapshots[i].snapshot, - total_manifest_file_sizes, + per_manifest_sizes, *buffer_manifest_list, Iceberg::FileContentType::DATA, false); @@ -474,7 +489,7 @@ void writeMetadataFiles( std::string json_representation = removeEscapedSlashes(oss.str()); auto buffer_metadata = object_storage->writeObject( - StoredObject(generated_metadata_name.path_in_storage), + StoredObject(path_resolver.resolve(generated_metadata_info.path)), WriteMode::Rewrite, std::nullopt, DBMS_DEFAULT_BUFFER_SIZE, @@ -529,11 +544,12 @@ void compactIcebergTable( plan, sample_block_, object_storage_, + persistent_table_components.path_resolver, format_settings_, context_, write_format, persistent_table_components.metadata_compression_method); - writeMetadataFiles(plan, object_storage_, context_, sample_block_, write_format, persistent_table_components.table_path); + writeMetadataFiles(plan, persistent_table_components.path_resolver, object_storage_, context_, sample_block_, write_format, persistent_table_components.table_path); clearOldFiles(object_storage_, old_files); } } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/FileNamesGenerator.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/FileNamesGenerator.cpp index 9c51d95cf805..b0b1c3e5ef68 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/FileNamesGenerator.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/FileNamesGenerator.cpp @@ -1,45 +1,32 @@ #include #include +#include #if USE_AVRO -namespace DB::ErrorCodes -{ - extern const int BAD_ARGUMENTS; -} - namespace DB { FileNamesGenerator::FileNamesGenerator( - const String & table_dir_, - const String & storage_dir_, + const String & table_location_, bool use_uuid_in_metadata_, CompressionMethod compression_method_, const String & format_name_) - : table_dir(table_dir_) - , storage_dir(storage_dir_) - , data_dir(table_dir + "data/") - , metadata_dir(table_dir + "metadata/") - , storage_data_dir(storage_dir + "data/") - , storage_metadata_dir(storage_dir + "metadata/") + : table_location(table_location_) , use_uuid_in_metadata(use_uuid_in_metadata_) , 
compression_method(compression_method_) , format_name(boost::to_lower_copy(format_name_)) { + /// Normalize: ensure table_location ends with '/' + if (!table_location.empty() && table_location.back() != '/') + table_location += '/'; } FileNamesGenerator::FileNamesGenerator(const FileNamesGenerator & other) { - data_dir = other.data_dir; - metadata_dir = other.metadata_dir; - storage_data_dir = other.storage_data_dir; - storage_metadata_dir = other.storage_metadata_dir; initial_version = other.initial_version; - - table_dir = other.table_dir; - storage_dir = other.storage_dir; + table_location = other.table_location; use_uuid_in_metadata = other.use_uuid_in_metadata; compression_method = other.compression_method; format_name = other.format_name; @@ -50,14 +37,8 @@ FileNamesGenerator & FileNamesGenerator::operator=(const FileNamesGenerator & ot if (this == &other) return *this; - data_dir = other.data_dir; - metadata_dir = other.metadata_dir; - storage_data_dir = other.storage_data_dir; - storage_metadata_dir = other.storage_metadata_dir; initial_version = other.initial_version; - - table_dir = other.table_dir; - storage_dir = other.storage_dir; + table_location = other.table_location; use_uuid_in_metadata = other.use_uuid_in_metadata; compression_method = other.compression_method; format_name = other.format_name; @@ -65,85 +46,58 @@ FileNamesGenerator & FileNamesGenerator::operator=(const FileNamesGenerator & ot return *this; } -FileNamesGenerator::Result FileNamesGenerator::generateDataFileName() +Iceberg::IcebergPathFromMetadata FileNamesGenerator::generateDataFileName() { auto uuid_str = uuid_generator.createRandom().toString(); - - return Result{ - .path_in_metadata = fmt::format("{}data-{}.{}", data_dir, uuid_str, format_name), - .path_in_storage = fmt::format("{}data-{}.{}", storage_data_dir, uuid_str, format_name) - }; + return Iceberg::IcebergPathFromMetadata(fmt::format("{}data/data-{}.{}", table_location, uuid_str, format_name)); } -FileNamesGenerator::Result FileNamesGenerator::generateManifestEntryName() +Iceberg::IcebergPathFromMetadata FileNamesGenerator::generateManifestEntryName() { auto uuid_str = uuid_generator.createRandom().toString(); - - return Result{ - .path_in_metadata = fmt::format("{}{}.avro", metadata_dir, uuid_str), - .path_in_storage = fmt::format("{}{}.avro", storage_metadata_dir, uuid_str), - }; + return Iceberg::IcebergPathFromMetadata(fmt::format("{}metadata/{}.avro", table_location, uuid_str)); } -FileNamesGenerator::Result FileNamesGenerator::generateManifestListName(Int64 snapshot_id, Int32 format_version) +Iceberg::IcebergPathFromMetadata FileNamesGenerator::generateManifestListName(Int64 snapshot_id, Int32 format_version) { auto uuid_str = uuid_generator.createRandom().toString(); - - return Result{ - .path_in_metadata = fmt::format("{}snap-{}-{}-{}.avro", metadata_dir, snapshot_id, format_version, uuid_str), - .path_in_storage = fmt::format("{}snap-{}-{}-{}.avro", storage_metadata_dir, snapshot_id, format_version, uuid_str), - }; + return Iceberg::IcebergPathFromMetadata(fmt::format("{}metadata/snap-{}-{}-{}.avro", table_location, snapshot_id, format_version, uuid_str)); } -FileNamesGenerator::Result FileNamesGenerator::generateMetadataName() +GeneratedMetadataFileWithInfo FileNamesGenerator::generateMetadataPathWithInfo() { auto compression_suffix = toContentEncodingName(compression_method); if (!compression_suffix.empty()) compression_suffix = "." 
+ compression_suffix; + auto used_version = initial_version++; if (!use_uuid_in_metadata) { - auto res = Result{ - .path_in_metadata = fmt::format("{}v{}{}.metadata.json", metadata_dir, initial_version, compression_suffix), - .path_in_storage = fmt::format("{}v{}{}.metadata.json", storage_metadata_dir, initial_version, compression_suffix), - }; - initial_version++; - return res; + return GeneratedMetadataFileWithInfo{ + .path = Iceberg::IcebergPathFromMetadata( + fmt::format("{}metadata/v{}{}.metadata.json", table_location, used_version, compression_suffix)), + .version = used_version, + .compression_method = compression_method}; } else { auto uuid_str = uuid_generator.createRandom().toString(); - auto res = Result{ - .path_in_metadata = fmt::format("{}v{}-{}{}.metadata.json", metadata_dir, initial_version, uuid_str, compression_suffix), - .path_in_storage = fmt::format("{}v{}-{}{}.metadata.json", storage_metadata_dir, initial_version, uuid_str, compression_suffix), - }; - initial_version++; - return res; + return GeneratedMetadataFileWithInfo{ + .path = Iceberg::IcebergPathFromMetadata( + fmt::format("{}metadata/v{}-{}{}.metadata.json", table_location, used_version, uuid_str, compression_suffix)), + .version = used_version, + .compression_method = compression_method}; } } -FileNamesGenerator::Result FileNamesGenerator::generateVersionHint() +Iceberg::IcebergPathFromMetadata FileNamesGenerator::generateVersionHint() { - return Result{ - .path_in_metadata = fmt::format("{}version-hint.text", metadata_dir), - .path_in_storage = fmt::format("{}version-hint.text", storage_metadata_dir), - }; + return Iceberg::IcebergPathFromMetadata(fmt::format("{}metadata/version-hint.text", table_location)); } -FileNamesGenerator::Result FileNamesGenerator::generatePositionDeleteFile() +Iceberg::IcebergPathFromMetadata FileNamesGenerator::generatePositionDeleteFile() { auto uuid_str = uuid_generator.createRandom().toString(); - - return Result{ - .path_in_metadata = fmt::format("{}{}-deletes.{}", data_dir, uuid_str, format_name), - .path_in_storage = fmt::format("{}{}-deletes.{}", storage_data_dir, uuid_str, format_name) - }; -} - -String FileNamesGenerator::convertMetadataPathToStoragePath(const String & metadata_path) const -{ - if (!metadata_path.starts_with(table_dir)) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Paths in Iceberg must use a consistent format — either /your/path or s3://your/path. Use the write_full_path_in_iceberg_metadata setting to control this behavior {} {}", metadata_path, table_dir); - return storage_dir + metadata_path.substr(table_dir.size()); + return Iceberg::IcebergPathFromMetadata(fmt::format("{}data/{}-deletes.{}", table_location, uuid_str, format_name)); } } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/FileNamesGenerator.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/FileNamesGenerator.h index 07dbe97cd6f9..b26a5c86ffd4 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/FileNamesGenerator.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/FileNamesGenerator.h @@ -1,7 +1,9 @@ #pragma once +#include "config.h" + #include -#include +#include #include @@ -10,24 +12,25 @@ namespace DB #if USE_AVRO +struct GeneratedMetadataFileWithInfo +{ + Iceberg::IcebergPathFromMetadata path; + Int32 version; + CompressionMethod compression_method; +}; + +/// Generates Iceberg metadata paths (IcebergPathFromMetadata) for new files. 
+/// +/// All generated paths use table_location as prefix, ensuring they are +/// always in the correct format for writing into Iceberg metadata files. +/// To get the actual storage path for I/O, pass the result through +/// IcebergPathResolver::resolve(). class FileNamesGenerator { public: - struct Result - { - /// Path recorded in the Iceberg metadata files. - /// If `write_full_path_in_iceberg_metadata` is disabled, it will be a simple relative path (e.g., /a/b/c.avro). - /// Otherwise, it will include a prefix indicating the file system type (e.g., s3://a/b/c.avro). - String path_in_metadata; - - /// Actual path to the object in the storage (e.g., /a/b/c.avro). - String path_in_storage; - }; - FileNamesGenerator() = default; explicit FileNamesGenerator( - const String & table_dir_, - const String & storage_dir_, + const String & table_location_, bool use_uuid_in_metadata_, CompressionMethod compression_method_, const String & format_name_); @@ -35,29 +38,24 @@ class FileNamesGenerator FileNamesGenerator(const FileNamesGenerator & other); FileNamesGenerator & operator=(const FileNamesGenerator & other); - Result generateDataFileName(); - Result generateManifestEntryName(); - Result generateManifestListName(Int64 snapshot_id, Int32 format_version); - Result generateMetadataName(); - Result generateVersionHint(); - Result generatePositionDeleteFile(); - - String convertMetadataPathToStoragePath(const String & metadata_path) const; + /// All generate* methods return IcebergPathFromMetadata. + /// These paths are ready to be written into Iceberg metadata files. + /// To get a storage path for actual I/O, use IcebergPathResolver::resolve(). + Iceberg::IcebergPathFromMetadata generateDataFileName(); + Iceberg::IcebergPathFromMetadata generateManifestEntryName(); + Iceberg::IcebergPathFromMetadata generateManifestListName(Int64 snapshot_id, Int32 format_version); + GeneratedMetadataFileWithInfo generateMetadataPathWithInfo(); + Iceberg::IcebergPathFromMetadata generateVersionHint(); + Iceberg::IcebergPathFromMetadata generatePositionDeleteFile(); void setVersion(Int32 initial_version_) { initial_version = initial_version_; } void setCompressionMethod(CompressionMethod compression_method_) { compression_method = compression_method_; } private: Poco::UUIDGenerator uuid_generator; - String table_dir; - String storage_dir; - - String data_dir; - String metadata_dir; - String storage_data_dir; - String storage_metadata_dir; - bool use_uuid_in_metadata; - CompressionMethod compression_method; + String table_location; + bool use_uuid_in_metadata = false; + CompressionMethod compression_method = CompressionMethod::None; String format_name; Int32 initial_version = 0; diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergDataObjectInfo.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergDataObjectInfo.cpp index 9a963feb3d3f..e0d7d9197fb6 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergDataObjectInfo.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergDataObjectInfo.cpp @@ -34,8 +34,8 @@ extern const SettingsBool use_roaring_bitmap_iceberg_positional_deletes; #if USE_AVRO IcebergDataObjectInfo::IcebergDataObjectInfo( - Iceberg::ProcessedManifestFileEntryPtr data_manifest_file_entry_, Int32 schema_id_relevant_to_iterator_) - : ObjectInfo(RelativePathWithMetadata(data_manifest_file_entry_->file_path)) + Iceberg::ProcessedManifestFileEntryPtr data_manifest_file_entry_, const String & resolved_storage_path_, Int32 schema_id_relevant_to_iterator_) + : 
ObjectInfo(RelativePathWithMetadata(resolved_storage_path_)) , info{ data_manifest_file_entry_->parsed_entry->file_path_key, data_manifest_file_entry_->resolved_schema_id, @@ -66,7 +66,7 @@ std::shared_ptr IcebergDataObjectInfo::getPositionDeleteTransf return std::make_shared(header, self, object_storage, format_settings, parser_shared_resources, context_); } -void IcebergDataObjectInfo::addPositionDeleteObject(Iceberg::ProcessedManifestFileEntryPtr position_delete_object) +void IcebergDataObjectInfo::addPositionDeleteObject(Iceberg::ProcessedManifestFileEntryPtr position_delete_object, const String & resolved_storage_path) { if (Poco::toUpper(info.file_format) != "PARQUET") { @@ -76,13 +76,13 @@ void IcebergDataObjectInfo::addPositionDeleteObject(Iceberg::ProcessedManifestFi info.file_format); } info.position_deletes_objects.emplace_back( - position_delete_object->file_path, position_delete_object->parsed_entry->file_format, std::nullopt); + resolved_storage_path, position_delete_object->parsed_entry->file_format, std::nullopt); } -void IcebergDataObjectInfo::addEqualityDeleteObject(const Iceberg::ProcessedManifestFileEntryPtr & equality_delete_object) +void IcebergDataObjectInfo::addEqualityDeleteObject(const Iceberg::ProcessedManifestFileEntryPtr & equality_delete_object, const String & resolved_storage_path) { info.equality_deletes_objects.emplace_back( - equality_delete_object->file_path, + resolved_storage_path, equality_delete_object->parsed_entry->file_format, equality_delete_object->parsed_entry->equality_ids, equality_delete_object->resolved_schema_id); @@ -93,7 +93,7 @@ void IcebergDataObjectInfo::addEqualityDeleteObject(const Iceberg::ProcessedMani void IcebergObjectSerializableInfo::serializeForClusterFunctionProtocol(WriteBuffer & out, size_t protocol_version) const { checkVersion(protocol_version); - writeStringBinary(data_object_file_path_key, out); + writeStringBinary(data_object_file_path_key.serialize(), out); writeVarInt(underlying_format_read_schema_id, out); writeVarInt(schema_id_relevant_to_iterator, out); writeVarInt(sequence_number, out); @@ -142,7 +142,11 @@ void IcebergObjectSerializableInfo::serializeForClusterFunctionProtocol(WriteBuf void IcebergObjectSerializableInfo::deserializeForClusterFunctionProtocol(ReadBuffer & in, size_t protocol_version) { checkVersion(protocol_version); - readStringBinary(data_object_file_path_key, in); + { + String raw_path; + readStringBinary(raw_path, in); + data_object_file_path_key = IcebergPathFromMetadata::deserialize(std::move(raw_path)); + } readVarInt(underlying_format_read_schema_id, in); readVarInt(schema_id_relevant_to_iterator, in); readVarInt(sequence_number, in); diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergDataObjectInfo.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergDataObjectInfo.h index ce8e551d6e32..8931f5d702df 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergDataObjectInfo.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergDataObjectInfo.h @@ -9,13 +9,14 @@ #include #include +#include namespace DB::Iceberg { struct IcebergObjectSerializableInfo { - String data_object_file_path_key; + IcebergPathFromMetadata data_object_file_path_key; Int32 underlying_format_read_schema_id; Int32 schema_id_relevant_to_iterator; Int64 sequence_number; @@ -47,7 +48,7 @@ struct IcebergDataObjectInfo : public ObjectInfo, std::enable_shared_from_this getFileFormat() const override { return info.file_format; } - void addPositionDeleteObject(Iceberg::ProcessedManifestFileEntryPtr 
position_delete_object); + void addPositionDeleteObject(Iceberg::ProcessedManifestFileEntryPtr position_delete_object, const String & resolved_storage_path); - void addEqualityDeleteObject(const Iceberg::ProcessedManifestFileEntryPtr & equality_delete_object); + void addEqualityDeleteObject(const Iceberg::ProcessedManifestFileEntryPtr & equality_delete_object, const String & resolved_storage_path); Iceberg::IcebergObjectSerializableInfo info; }; diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.cpp index 97de3771264e..21b02a6a0ba4 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.cpp @@ -131,7 +131,7 @@ std::span defineDeletesSpan( "delete object info is {}", std::distance(beg_it, end_it), is_equality_delete ? "equality" : "position", - data_object_->file_path, + data_object_->parsed_entry->file_path_key, data_object_->dumpDeletesMatchingInfo(), (*beg_it)->dumpDeletesMatchingInfo(), (*previous_it)->dumpDeletesMatchingInfo()); @@ -142,7 +142,7 @@ std::span defineDeletesSpan( logger, "No {} delete elements for data file {}, taken data file object info: {}", is_equality_delete ? "equality" : "position", - data_object_->file_path, + data_object_->parsed_entry->file_path_key, data_object_->dumpDeletesMatchingInfo()); } return {beg_it, end_it}; @@ -173,8 +173,8 @@ std::optional SingleThreadIcebergKeysIterator::ne /// Find the next manifest file with matching content type. while (manifest_file_index < data_snapshot->manifest_list_entries.size()) { - const auto & mle = data_snapshot->manifest_list_entries[manifest_file_index++]; - if (mle.content_type != manifest_file_content_type) + const auto & manifest_list_entry = data_snapshot->manifest_list_entries[manifest_file_index++]; + if (manifest_list_entry.content_type != manifest_file_content_type) continue; auto manifest_file_cacheable_part = Iceberg::getManifestFile( @@ -182,20 +182,18 @@ std::optional SingleThreadIcebergKeysIterator::ne persistent_components, local_context, log, - mle.manifest_file_path, - mle.manifest_file_byte_size); + manifest_list_entry.manifest_file_path, + manifest_list_entry.manifest_file_byte_size); current_manifest_file_iterator = Iceberg::ManifestFileIterator::create( manifest_file_cacheable_part.deserializer, - mle.manifest_file_path, + manifest_list_entry.manifest_file_path, persistent_components.format_version, - persistent_components.table_path, + persistent_components.path_resolver, *persistent_components.schema_processor, - mle.added_sequence_number, - mle.added_snapshot_id, - persistent_components.table_location, + manifest_list_entry.added_sequence_number, + manifest_list_entry.added_snapshot_id, local_context, - mle.manifest_file_path, filter_dag, table_snapshot->schema_id); break; @@ -332,7 +330,10 @@ ObjectInfoPtr IcebergIterator::next(size_t) if (blocking_queue.pop(manifest_file_entry)) { IcebergDataObjectInfoPtr object_info - = std::make_shared(manifest_file_entry, table_state_snapshot->schema_id); + = std::make_shared( + manifest_file_entry, + persistent_components.path_resolver.resolve(manifest_file_entry->parsed_entry->file_path_key), + table_state_snapshot->schema_id); for (const auto & position_delete : defineDeletesSpan(manifest_file_entry, position_deletes_files, /* is_equality_delete */ false, logger)) { @@ -340,7 +341,8 @@ ObjectInfoPtr IcebergIterator::next(size_t) const auto & lower = 
position_delete->parsed_entry->lower_reference_data_file_path; const auto & upper = position_delete->parsed_entry->upper_reference_data_file_path; bool can_contain_data_file_deletes - = (!lower.has_value() || lower.value() <= data_file_path) && (!upper.has_value() || upper.value() >= data_file_path); + = (!lower.has_value() || *lower <= data_file_path) + && (!upper.has_value() || *upper >= data_file_path); /// Skip position deletes that do not match the data file path. if (!can_contain_data_file_deletes) { @@ -350,10 +352,10 @@ ObjectInfoPtr IcebergIterator::next(size_t) "Skipping position delete file `{}` for data file `{}` because position delete has out of bounds reference data file " "bounds: " "(lower bound: `{}`, upper bound: `{}`)", - position_delete->file_path, + position_delete->parsed_entry->file_path_key, data_file_path, - lower.value_or("[no lower bound]"), - upper.value_or("[no upper bound]")); + lower.has_value() ? lower->serialize() : "[no lower bound]", + upper.has_value() ? upper->serialize() : "[no upper bound]"); } else { @@ -362,11 +364,12 @@ ObjectInfoPtr IcebergIterator::next(size_t) logger, "Processing position delete file `{}` for data file `{}` with reference data file bounds: " "(lower bound: `{}`, upper bound: `{}`)", - position_delete->file_path, + position_delete->parsed_entry->file_path_key, data_file_path, - lower.value_or("[no lower bound]"), - upper.value_or("[no upper bound]")); - object_info->addPositionDeleteObject(position_delete); + lower.has_value() ? lower->serialize() : "[no lower bound]", + upper.has_value() ? upper->serialize() : "[no upper bound]"); + object_info->addPositionDeleteObject( + position_delete, persistent_components.path_resolver.resolve(position_delete->parsed_entry->file_path_key)); } } @@ -382,7 +385,8 @@ ObjectInfoPtr IcebergIterator::next(size_t) for (const auto & equality_delete : defineDeletesSpan(manifest_file_entry, equality_deletes_files, /* is_equality_delete */ true, logger)) { - object_info->addEqualityDeleteObject(equality_delete); + object_info->addEqualityDeleteObject( + equality_delete, persistent_components.path_resolver.resolve(equality_delete->parsed_entry->file_path_key)); } if (!object_info->info.equality_deletes_objects.empty()) diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp index 56eb10aad57a..7eb6d839a1fa 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp @@ -206,7 +206,7 @@ Iceberg::PersistentTableComponents IcebergMetadata::initializePersistentTableCom StorageObjectStorageConfigurationPtr configuration, IcebergMetadataFilesCachePtr cache_ptr, ContextPtr context_) { const auto [metadata_version, metadata_file_path, compression_method] - = getLatestOrExplicitMetadataFileAndVersion(object_storage, configuration->getPathForRead().path, configuration->getDataLakeSettings(), cache_ptr, context_, log.get(), std::nullopt, true); + = getLatestOrExplicitMetadataFileAndVersion(object_storage, configuration->getPathForRead().path, configuration->getDataLakeSettings(), cache_ptr, context_, log.get(), std::nullopt, CompressionMethod::None, true); LOG_DEBUG(log, "Latest metadata file path is {}, version {}", metadata_file_path, metadata_version); auto metadata_object = getMetadataJSONObject(metadata_file_path, object_storage, cache_ptr, context_, log, compression_method, std::nullopt); @@ -228,14 +228,16 @@ 
Iceberg::PersistentTableComponents IcebergMetadata::initializePersistentTableCom Iceberg::f_table_uuid); } } + auto table_path = configuration->getPathForRead().path; return PersistentTableComponents{ .schema_processor = std::make_shared(), .metadata_cache = cache_ptr, .format_version = format_version, .table_location = table_location, .metadata_compression_method = compression_method, - .table_path = configuration->getPathForRead().path, + .table_path = table_path, .table_uuid = table_uuid, + .path_resolver = IcebergPathResolver(table_location, table_path, configuration->getTypeName(), configuration->getNamespace()), }; } @@ -249,6 +251,7 @@ std::pair IcebergMetadata::getReleva context, log.get(), persistent_components.table_uuid, + persistent_components.metadata_compression_method, force_fetch_latest_metadata); return getState(context, metadata_file_path, metadata_version); } @@ -419,7 +422,7 @@ IcebergDataSnapshotPtr IcebergMetadata::createIcebergDataSnapshotFromSnapshotJSO ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, "Snapshot object doesn't contain a manifest list path for snapshot with id `{}`", snapshot_id); - String manifest_list_file_path = snapshot_object->getValue(f_manifest_list); + IcebergPathFromMetadata manifest_list_file_path = IcebergPathFromMetadata::deserialize(snapshot_object->getValue(f_manifest_list)); std::optional total_rows; std::optional total_bytes; std::optional total_position_deletes; @@ -445,13 +448,7 @@ IcebergDataSnapshotPtr IcebergMetadata::createIcebergDataSnapshotFromSnapshotJSO return std::make_shared( - getManifestList( - object_storage, - persistent_components, - local_context, - getProperFilePathFromMetadataInfo( - manifest_list_file_path, persistent_components.table_path, persistent_components.table_location), - log), + getManifestList(object_storage, persistent_components, local_context, manifest_list_file_path, log), snapshot_id, schema_id, total_rows, @@ -574,8 +571,8 @@ IcebergMetadata::getState(const ContextPtr & local_context, const String & metad local_context, dumpMetadataObjectToString(metadata_object), DB::IcebergMetadataLogLevel::Metadata, - persistent_components.table_path, - metadata_path, + persistent_components.path_resolver.getTableRoot(), + Iceberg::IcebergPathFromMetadata::deserialize(metadata_path), std::nullopt, std::nullopt); @@ -613,7 +610,7 @@ std::shared_ptr IcebergMetadata::getSchemaTransformer(ContextP void IcebergMetadata::mutate( const MutationCommands & commands, - StorageObjectStorageConfigurationPtr configuration, + StorageObjectStorageConfigurationPtr /*configuration*/, ContextPtr context, const StorageID & storage_id, StorageMetadataPtr metadata_snapshot, @@ -638,10 +635,7 @@ void IcebergMetadata::mutate( persistent_components, write_format, format_settings, - catalog, - configuration->getTypeName(), - configuration->getNamespace() - ); + catalog); } void IcebergMetadata::checkMutationIsPossible(const MutationCommands & commands) @@ -716,7 +710,7 @@ Pipe IcebergMetadata::executeCommand( const String & command_name, const ASTPtr & args, ObjectStoragePtr object_storage_, - StorageObjectStorageConfigurationPtr configuration_, + StorageObjectStorageConfigurationPtr /*configuration*/, std::shared_ptr catalog_, ContextPtr context, const StorageID & storage_id) @@ -749,8 +743,6 @@ Pipe IcebergMetadata::executeCommand( persistent_components, write_format, catalog_, - configuration_->getTypeName(), - configuration_->getNamespace(), storage_id.getTableName()); return expireSnapshotsResultToPipe(result); @@ -795,6 +787,8 @@ void 
IcebergMetadata::createInitial( } String location_path = configuration_ptr->getRawPath().path; + if (location_path.find("://") == String::npos && !location_path.starts_with('/')) + location_path = "/" + location_path; if (local_context->getSettingsRef()[Setting::write_full_path_in_iceberg_metadata].value) location_path = configuration_ptr->getTypeName() + "://" + configuration_ptr->getNamespace() + "/" + configuration_ptr->getRawPath().path; @@ -827,7 +821,7 @@ void IcebergMetadata::createInitial( if (configuration_ptr->getDataLakeSettings()[DataLakeStorageSetting::iceberg_use_version_hint].value) { auto filename_version_hint = configuration_ptr->getRawPath().path + "metadata/version-hint.text"; - writeMessageToFile(filename, filename_version_hint, object_storage, local_context, "*", ""); + writeMessageToFile("1", filename_version_hint, object_storage, local_context, "*", ""); } if (catalog) @@ -890,7 +884,8 @@ IcebergMetadata::IcebergHistory IcebergMetadata::getHistory(ContextPtr local_con persistent_components.metadata_cache, local_context, log.get(), - persistent_components.table_uuid); + persistent_components.table_uuid, + persistent_components.metadata_compression_method); auto metadata_object = getMetadataJSONObject(metadata_file_path, object_storage, persistent_components.metadata_cache, local_context, log, compression_method, persistent_components.table_uuid); @@ -934,7 +929,7 @@ IcebergMetadata::IcebergHistory IcebergMetadata::getHistory(ContextPtr local_con const auto snapshot = snapshots->getObject(static_cast(i)); history_record.snapshot_id = snapshot->getValue(f_metadata_snapshot_id); - history_record.manifest_list_path = snapshot->getValue(f_manifest_list); + history_record.manifest_list_path = IcebergPathFromMetadata::deserialize(snapshot->getValue(f_manifest_list)); const auto summary = snapshot->getObject(f_summary); if (summary->has(f_added_data_files)) history_record.added_files = summary->getValue(f_added_data_files); diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadataFilesCache.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadataFilesCache.h index 44309f94e60a..f52e596e2315 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadataFilesCache.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadataFilesCache.h @@ -55,7 +55,7 @@ using LatestMetadataVersionPtr = std::shared_ptr; /// And we can get `ManifestFileContent` from cache by ManifestFileEntry. 
struct ManifestFileCacheKey { - String manifest_file_path; + Iceberg::IcebergPathFromMetadata manifest_file_path; size_t manifest_file_byte_size; Int64 added_sequence_number; Int64 added_snapshot_id; @@ -110,7 +110,7 @@ struct IcebergMetadataFilesCacheCell : private boost::noncopyable size_t total_size = 0; for (const auto & entry: manifest_file_cache_keys) { - total_size += sizeof(ManifestFileCacheKey) + entry.manifest_file_path.capacity(); + total_size += sizeof(ManifestFileCacheKey) + entry.manifest_file_path.serialize().capacity(); } return total_size; } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergPath.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergPath.cpp new file mode 100644 index 000000000000..fee289f73389 --- /dev/null +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergPath.cpp @@ -0,0 +1,95 @@ +#include +#include + +#include + +namespace DB::ErrorCodes +{ +extern const int BAD_ARGUMENTS; +} + +namespace DB::Iceberg +{ + +// This function is used to get the file path inside the directory which corresponds to Iceberg table from the full blob path which is written in manifest and metadata files. +// For example, if the full blob path is s3://bucket/table_name/data/00000-1-1234567890.avro, the function will return table_name/data/00000-1-1234567890.avro +// Common path should end with "" or "/". +String IcebergPathResolver::resolve(const IcebergPathFromMetadata & metadata_path) const +{ + auto trim_forward_slash = [](std::string_view str) -> std::string_view + { + if (str.starts_with('/')) + { + return str.substr(1); + } + return str; + }; + + auto raw_path = metadata_path.serialize(); + + if (raw_path.starts_with(table_location) && table_location.ends_with(table_root)) + { + auto result = std::filesystem::path{table_root} / trim_forward_slash(raw_path.substr(table_location.size())); + return result; + } + + if (table_root.empty()) + { + throw ::DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, + "IcebergPathResolver::resolve failed first branch. raw_path='{}', table_location='{}', table_root='{}'", + raw_path, table_location, table_root); + } + + + auto pos = raw_path.find(table_root); + /// Valid situation when data and metadata files are stored in different directories. + if (pos == std::string::npos) + { + /// connection://bucket + auto prefix = table_location.substr(0, table_location.size() - table_root.size()); + if (raw_path.size() < prefix.size()) + { + throw ::DB::Exception( + DB::ErrorCodes::BAD_ARGUMENTS, + "IcebergPathResolver::resolve failed in the second branch. 
raw_path='{}', table_location='{}', table_root='{}'", + raw_path, + table_location, + table_root); + } + return std::string{raw_path.substr(prefix.size())}; + } + + size_t good_pos = std::string::npos; + while (pos != std::string::npos) + { + auto potential_position = pos + table_root.size(); + if (((potential_position + 6 <= raw_path.size()) && (std::string_view(raw_path.data() + potential_position, 6) == "/data/")) + || ((potential_position + 10 <= raw_path.size()) + && (std::string_view(raw_path.data() + potential_position, 10) == "/metadata/"))) + { + good_pos = pos; + break; + } + size_t new_pos = raw_path.find(table_root, pos + 1); + if (new_pos == std::string::npos) + { + break; + } + pos = new_pos; + } + + + if (good_pos != std::string::npos) + { + return std::string{raw_path.substr(good_pos)}; + } + else if (pos != std::string::npos) + { + return std::string{raw_path.substr(pos)}; + } + else + { + throw ::DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Expected to find '{}' in data path: '{}'", table_root, raw_path); + } +} +} diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergPath.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergPath.h new file mode 100644 index 000000000000..13072990005c --- /dev/null +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergPath.h @@ -0,0 +1,119 @@ +#pragma once + +#include +#include + +namespace DB +{ +class FileNamesGenerator; +} + +namespace DB::Iceberg +{ + +/// Strong type for paths stored in Iceberg metadata files (Avro/JSON). +/// These paths may use various URI schemes (wasb://, s3://, abfss://, hdfs://, etc.) +/// or may be absolute paths (/table/data/xxx.parquet) if written from ClickHouse. +/// +/// If we get a path from metadata, we don't modify its format to preserve compatibility +/// with different Iceberg components and versions. Instead, we treat it as opaque and pass through IcebergPathResolver to get the actual storage path for I/O operations. +/// +/// This type is intentionally NOT implicitly convertible to String +/// to prevent accidental use of metadata paths as storage paths. +/// +/// Also pay attention that paths to metadata.json files themselves may not have this form, because usually they are either +/// produced by ClickHouse by table_location traversal or taken from the corresponding Iceberg Catalog. In case metadata.json +/// is generated by ClickHouse it will be expressed by this type. +class IcebergPathFromMetadata +{ +public: + IcebergPathFromMetadata() = default; + + /// Should be used only when deserialization is inevitable (the most common case is reading manifest files, which contain metadata paths). + /// Also needed to get the file which corresponds to a line in the Chunk when used for position-delete algorithms. + static IcebergPathFromMetadata deserialize(String path_) { return IcebergPathFromMetadata(std::move(path_)); } + + /// Extract the raw path string for writing into Iceberg metadata files, + /// serialization, cache keys, virtual column values, etc. 
+ const String & serialize() const { return raw_path; } + + bool empty() const { return raw_path.empty(); } + + auto operator<=>(const IcebergPathFromMetadata & other) const { return raw_path <=> other.raw_path; } + auto operator==(const IcebergPathFromMetadata & other) const { return raw_path == other.raw_path; } + + +private: + friend class DB::FileNamesGenerator; + + explicit IcebergPathFromMetadata(String path_) + : raw_path(std::move(path_)) + { + } + String raw_path; +}; + +/// Converts Iceberg metadata paths to actual object storage paths. +/// +/// This is the ONLY way to go from a metadata path to a storage path. +class IcebergPathResolver +{ +public: + IcebergPathResolver(String table_location_, String table_root_, String blob_storage_type_name_ = {}, String blob_storage_namespace_name_ = {}) + : table_location(std::move(table_location_)) + , table_root(std::move(table_root_)) + , blob_storage_type_name(std::move(blob_storage_type_name_)) + , blob_storage_namespace_name(std::move(blob_storage_namespace_name_)) + { + auto trim_backward_slashes = [](String & str) + { + while (!str.empty() && str.back() == '/') + str.pop_back(); + }; + trim_backward_slashes(table_root); + trim_backward_slashes(table_location); + + /// Normalize: non-URI table_location should start with '/' + if (!table_location.empty() && table_location.find("://") == String::npos && table_location[0] != '/') + table_location = "/" + table_location; + } + + /// Convert a metadata path to an actual storage path for I/O operations. + String resolve(const IcebergPathFromMetadata & metadata_path) const; + + /// Convert a metadata path to a catalog-compatible path. + /// Done this way for backward compatibility reasons. + String resolveForCatalog(const IcebergPathFromMetadata & metadata_path) const + { + String catalog_filename = metadata_path.serialize(); + if (!catalog_filename.starts_with(blob_storage_type_name)) + catalog_filename = blob_storage_type_name + "://" + blob_storage_namespace_name + "/" + catalog_filename; + return catalog_filename; + } + + const String & getTableLocation() const { return table_location; } + const String & getTableRoot() const { return table_root; } + +private: + String table_location; + String table_root; + String blob_storage_type_name; + String blob_storage_namespace_name; +}; + +} + +template <> +struct std::hash +{ + size_t operator()(const DB::Iceberg::IcebergPathFromMetadata & p) const noexcept { return std::hash{}(p.serialize()); } +}; + +template <> +struct fmt::formatter : fmt::formatter +{ + auto format(const DB::Iceberg::IcebergPathFromMetadata & p, fmt::format_context & ctx) const + { + return fmt::formatter::format(p.serialize(), ctx); + } +}; diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergWrites.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergWrites.cpp index e1016bcc4327..160f8f6e5349 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergWrites.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergWrites.cpp @@ -405,7 +405,7 @@ void generateManifestFile( const std::vector & partition_columns, const std::vector & partition_values, const std::vector & partition_types, - const std::vector & data_file_names, + const std::vector & data_file_names, const std::optional & data_file_statistics, SharedHeader sample_block, Poco::JSON::Object::Ptr new_snapshot, @@ -488,7 +488,7 @@ void generateManifestFile( avro::GenericRecord & data_file = manifest.field(Iceberg::f_data_file).value(); if (version > 1)
data_file.field(Iceberg::f_content) = avro::GenericDatum(static_cast(content_type)); - data_file.field(Iceberg::f_file_path) = avro::GenericDatum(data_file_name); + data_file.field(Iceberg::f_file_path) = avro::GenericDatum(data_file_name.serialize()); data_file.field(Iceberg::f_file_format) = avro::GenericDatum(format); /// vibe coded - needs extra attention @@ -648,13 +648,13 @@ void generateManifestFile( } void generateManifestList( - const FileNamesGenerator & filename_generator, + const Iceberg::IcebergPathResolver & path_resolver, Poco::JSON::Object::Ptr metadata, ObjectStoragePtr object_storage, ContextPtr context, - const Strings & manifest_entry_names, + const std::vector & manifest_entry_names, Poco::JSON::Object::Ptr new_snapshot, - Int64 manifest_length, + const std::vector & manifest_entry_sizes, WriteBuffer & buf, Iceberg::FileContentType content_type, bool use_previous_snapshots) @@ -673,13 +673,13 @@ void generateManifestList( auto adapter = std::make_unique(buf); avro::DataFileWriter writer(std::move(adapter), schema); - for (const auto & manifest_entry_name : manifest_entry_names) + for (size_t entry_idx = 0; entry_idx < manifest_entry_names.size(); ++entry_idx) { avro::GenericDatum entry_datum(schema.root()); avro::GenericRecord & entry = entry_datum.value(); - entry.field(Iceberg::f_manifest_path) = manifest_entry_name; - entry.field(Iceberg::f_manifest_length) = manifest_length; + entry.field(Iceberg::f_manifest_path) = manifest_entry_names[entry_idx].serialize(); + entry.field(Iceberg::f_manifest_length) = manifest_entry_sizes[entry_idx]; entry.field(Iceberg::f_partition_spec_id) = metadata->getValue(Iceberg::f_default_spec_id); if (version > 1) { @@ -755,9 +755,10 @@ void generateManifestList( { if (snapshots->getObject(static_cast(i))->getValue(Iceberg::f_metadata_snapshot_id) == parent_snapshot_id) { - auto manifest_list = snapshots->getObject(static_cast(i))->getValue(Iceberg::f_manifest_list); + auto manifest_list = Iceberg::IcebergPathFromMetadata::deserialize( + snapshots->getObject(static_cast(i))->getValue(Iceberg::f_manifest_list)); - RelativePathWithMetadata relative_path_with_metadata(filename_generator.convertMetadataPathToStoragePath(manifest_list)); + RelativePathWithMetadata relative_path_with_metadata(path_resolver.resolve(manifest_list)); auto manifest_list_buf = createReadBuffer(relative_path_with_metadata, object_storage, context, getLogger("IcebergWrites")); auto input_stream = std::make_unique(*manifest_list_buf); @@ -850,8 +851,6 @@ IcebergStorageSink::IcebergStorageSink( , persistent_table_components(persistent_table_components_) , data_lake_settings(configuration_->getDataLakeSettings()) , write_format(configuration_->format) - , blob_storage_type_name(configuration_->getTypeName()) - , blob_storage_namespace_name(configuration_->getNamespace()) { auto [last_version, metadata_path, compression_method] = getLatestOrExplicitMetadataFileAndVersion( object_storage, @@ -861,6 +860,7 @@ IcebergStorageSink::IcebergStorageSink( context_, log.get(), persistent_table_components.table_uuid, + persistent_table_components.metadata_compression_method, true); metadata = getMetadataJSONObject( @@ -872,25 +872,9 @@ IcebergStorageSink::IcebergStorageSink( compression_method, persistent_table_components.table_uuid); metadata_compression_method = compression_method; - auto config_path = persistent_table_components.table_path; - if (config_path.empty() || config_path.back() != '/') - config_path += "/"; - if (!config_path.starts_with('/')) - config_path = '/' + 
config_path; - - if (!context_->getSettingsRef()[Setting::write_full_path_in_iceberg_metadata]) - { - filename_generator = FileNamesGenerator( - config_path, config_path, (catalog != nullptr && catalog->isTransactional()), metadata_compression_method, write_format); - } - else - { - auto bucket = metadata->getValue(Iceberg::f_location); - if (bucket.empty() || bucket.back() != '/') - bucket += "/"; - filename_generator = FileNamesGenerator( - bucket, config_path, (catalog != nullptr && catalog->isTransactional()), metadata_compression_method, write_format); - } + filename_generator = FileNamesGenerator( + persistent_table_components.path_resolver.getTableLocation(), + (catalog != nullptr && catalog->isTransactional()), metadata_compression_method, write_format); filename_generator.setVersion(last_version + 1); @@ -974,6 +958,7 @@ void IcebergStorageSink::consume(Chunk & chunk) context->getSettingsRef()[Setting::iceberg_insert_max_bytes_in_data_file], current_schema->getArray(Iceberg::f_fields), filename_generator, + persistent_table_components.path_resolver, object_storage, context, format_settings, @@ -1091,7 +1076,8 @@ void IcebergStorageSink::cancelBuffers() bool IcebergStorageSink::initializeMetadata() { - auto [metadata_name, storage_metadata_name] = filename_generator.generateMetadataName(); + const auto & resolver = persistent_table_components.path_resolver; + auto metadata_info = filename_generator.generateMetadataPathWithInfo(); Int64 parent_snapshot = -1; if (metadata->has(Iceberg::f_current_snapshot_id)) @@ -1100,13 +1086,22 @@ bool IcebergStorageSink::initializeMetadata() Int32 total_data_files = 0; for (const auto & [_, writer] : writer_per_partition_key) total_data_files += writer.getDataFiles().size(); - auto [new_snapshot, manifest_list_name, storage_manifest_list_name] = MetadataGenerator(metadata).generateNextMetadata( - filename_generator, metadata_name, parent_snapshot, total_data_files, total_rows, total_chunks_size, total_data_files, /* added_delete_files */0, /* num_deleted_rows */0); + auto [new_snapshot, manifest_list_path] = MetadataGenerator(metadata).generateNextMetadata( + filename_generator, + metadata_info.path, + parent_snapshot, + total_data_files, + total_rows, + total_chunks_size, + total_data_files, + /* added_delete_files */ 0, + /* num_deleted_rows */ 0); + auto storage_manifest_list_name = resolver.resolve(manifest_list_path); Strings manifest_entries_in_storage; - Strings manifest_entries; - Int64 manifest_lengths = 0; + std::vector manifest_entries; + std::vector manifest_entry_sizes; auto cleanup = [&] (bool retry_because_of_metadata_conflict) { @@ -1131,6 +1126,7 @@ bool IcebergStorageSink::initializeMetadata() context, getLogger("IcebergWrites").get(), persistent_table_components.table_uuid, + persistent_table_components.metadata_compression_method, true); LOG_DEBUG(log, "Rereading metadata file {} with version {}", metadata_path, last_version); @@ -1179,12 +1175,12 @@ bool IcebergStorageSink::initializeMetadata() { for (const auto & [partition_key, writer] : writer_per_partition_key) { - auto [manifest_entry_name, storage_manifest_entry_name] = filename_generator.generateManifestEntryName(); - manifest_entries_in_storage.push_back(storage_manifest_entry_name); - manifest_entries.push_back(manifest_entry_name); + auto manifest_entry_path = filename_generator.generateManifestEntryName(); + manifest_entries_in_storage.push_back(resolver.resolve(manifest_entry_path)); + manifest_entries.push_back(manifest_entry_path); auto buffer_manifest_entry = 
object_storage->writeObject( - StoredObject(storage_manifest_entry_name), WriteMode::Rewrite, std::nullopt, DBMS_DEFAULT_BUFFER_SIZE, context->getWriteSettings()); + StoredObject(resolver.resolve(manifest_entry_path)), WriteMode::Rewrite, std::nullopt, DBMS_DEFAULT_BUFFER_SIZE, context->getWriteSettings()); try { generateManifestFile( @@ -1202,7 +1198,12 @@ bool IcebergStorageSink::initializeMetadata() *buffer_manifest_entry, Iceberg::FileContentType::DATA); buffer_manifest_entry->finalize(); - manifest_lengths += buffer_manifest_entry->count(); + auto size = buffer_manifest_entry->count(); + if (size == 0) + { + size = object_storage->getObjectMetadata(resolver.resolve(manifest_entry_path), /*with_tags=*/false).size_bytes; + } + manifest_entry_sizes.push_back(size); } catch (...) { @@ -1217,7 +1218,7 @@ bool IcebergStorageSink::initializeMetadata() try { generateManifestList( - filename_generator, metadata, object_storage, context, manifest_entries, new_snapshot, manifest_lengths, *buffer_manifest_list, Iceberg::FileContentType::DATA); + persistent_table_components.path_resolver, metadata, object_storage, context, manifest_entries, new_snapshot, manifest_entry_sizes, *buffer_manifest_list, Iceberg::FileContentType::DATA); buffer_manifest_list->finalize(); } catch (...) @@ -1237,32 +1238,29 @@ bool IcebergStorageSink::initializeMetadata() throw Exception(ErrorCodes::BAD_ARGUMENTS, "Failpoint for cleanup enabled"); }); - LOG_DEBUG(log, "Writing new metadata file {}", storage_metadata_name); - auto hint = filename_generator.generateVersionHint(); + LOG_DEBUG(log, "Writing new metadata file {}", metadata_info.path); + auto hint_path = filename_generator.generateVersionHint(); if (!writeMetadataFileAndVersionHint( - storage_metadata_name, + persistent_table_components.path_resolver, + metadata_info, json_representation, - hint.path_in_storage, - storage_metadata_name, + hint_path, object_storage, context, - metadata_compression_method, data_lake_settings[DataLakeStorageSetting::iceberg_use_version_hint])) { - LOG_DEBUG(log, "Failed to write metadata {}, retrying", storage_metadata_name); + LOG_DEBUG(log, "Failed to write metadata {}, retrying", metadata_info.path); cleanup(true); return false; } else { - LOG_DEBUG(log, "Metadata file {} written", storage_metadata_name); + LOG_DEBUG(log, "Metadata file {} written", metadata_info.path); } if (catalog) { - String catalog_filename = metadata_name; - if (!catalog_filename.starts_with(blob_storage_type_name)) - catalog_filename = blob_storage_type_name + "://" + blob_storage_namespace_name + "/" + metadata_name; + auto catalog_filename = resolver.resolveForCatalog(metadata_info.path); const auto & [namespace_name, table_name] = DataLake::parseTableName(table_id.getTableName()); if (!catalog->updateMetadata(namespace_name, table_name, catalog_filename, new_snapshot)) diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergWrites.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergWrites.h index 71b8743bd1ce..ffaeb5d10588 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergWrites.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergWrites.h @@ -82,7 +82,7 @@ void generateManifestFile( const std::vector & partition_columns, const std::vector & partition_values, const std::vector & partition_types, - const std::vector & data_file_names, + const std::vector & data_file_names, const std::optional & data_file_statistics, SharedHeader sample_block, Poco::JSON::Object::Ptr new_snapshot, @@ -94,13 +94,13 @@ void 
generateManifestFile( const std::vector & per_file_stats = {}); void generateManifestList( - const FileNamesGenerator & filename_generator, + const Iceberg::IcebergPathResolver & path_resolver, Poco::JSON::Object::Ptr metadata, ObjectStoragePtr object_storage, ContextPtr context, - const Strings & manifest_entry_names, + const std::vector & manifest_entry_names, Poco::JSON::Object::Ptr new_snapshot, - Int64 manifest_length, + const std::vector & manifest_entry_sizes, WriteBuffer & buf, Iceberg::FileContentType content_type, bool use_previous_snapshots = true); @@ -162,8 +162,6 @@ class IcebergStorageSink : public SinkToStorage Iceberg::PersistentTableComponents persistent_table_components; const DataLakeStorageSettings & data_lake_settings; const String write_format; - const String blob_storage_type_name; - const String blob_storage_namespace_name; }; diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h index 3a1bccc53411..c56dbe5e7a74 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h @@ -21,6 +21,7 @@ struct ColumnInfo #include #include +#include #include #include @@ -76,8 +77,9 @@ struct ManifestFileCacheableInfo struct ParsedManifestFileEntry : boost::noncopyable { FileContentType content_type; - // It's the original string in the Iceberg metadata - String file_path_key; + /// The original path as stored in the Iceberg metadata. + /// Must be resolved through IcebergPathResolver before use in storage operations. + IcebergPathFromMetadata file_path_key; Int64 row_number; ManifestEntryStatus status; @@ -89,8 +91,8 @@ struct ParsedManifestFileEntry : boost::noncopyable std::unordered_map> value_bounds; String file_format; - std::optional lower_reference_data_file_path; // For position delete files only. - std::optional upper_reference_data_file_path; // For position delete files only. + std::optional lower_reference_data_file_path; // For position delete files only. + std::optional upper_reference_data_file_path; // For position delete files only. 
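As a usage sketch of the two path types referenced by these fields (all concrete values below are made up, and the snippet assumes only the IcebergPath.h interface added in this patch): a path read from a manifest entry such as file_path_key is kept opaque, resolved through the table's IcebergPathResolver exactly when I/O is needed, and serialized back whenever it must be written into metadata, logged, or used as a cache key.

#include <iostream>
#include <Storages/ObjectStorage/DataLakes/Iceberg/IcebergPath.h>

int main()
{
    using namespace DB::Iceberg;

    /// Hypothetical table layout; in the server the resolver comes from PersistentTableComponents.
    IcebergPathResolver resolver(
        /*table_location*/ "wasb://container@account.blob.core.windows.net/warehouse/tbl",
        /*table_root*/ "/warehouse/tbl");

    /// The string exactly as it is stored inside a manifest file.
    auto from_manifest = IcebergPathFromMetadata::deserialize(
        "wasb://container@account.blob.core.windows.net/warehouse/tbl/data/part-0.parquet");

    std::cout << resolver.resolve(from_manifest) << '\n';   /// storage path, used for actual reads and writes
    std::cout << from_manifest.serialize() << '\n';         /// original string, used for metadata, logs, cache keys
}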
std::optional> equality_ids; /// Data file is sorted with this sort_order_id (can be read from metadata.json) @@ -98,7 +100,7 @@ struct ParsedManifestFileEntry : boost::noncopyable ParsedManifestFileEntry( FileContentType content_type_, - String file_path_key_, + IcebergPathFromMetadata file_path_key_, Int64 row_number_, ManifestEntryStatus status_, std::optional written_sequence_number_, @@ -107,8 +109,8 @@ struct ParsedManifestFileEntry : boost::noncopyable std::unordered_map columns_infos_, std::unordered_map> value_bounds_, String file_format_, - std::optional lower_reference_data_file_path_, - std::optional upper_reference_data_file_path_, + std::optional lower_reference_data_file_path_, + std::optional upper_reference_data_file_path_, std::optional> equality_ids_, std::optional sort_order_id_) : content_type(content_type_) @@ -134,9 +136,6 @@ struct ProcessedManifestFileEntry std::shared_ptr parsed_entry; std::shared_ptr common_partition_specification; - /// Computed file path for Object Storage (resolved from parsed_entry->file_path_key) - String file_path; - // Always zero in case of format version 1 Int64 sequence_number; Int32 resolved_schema_id; diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileIterator.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileIterator.cpp index d68fa264e0f1..64a8e76dd501 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileIterator.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileIterator.cpp @@ -140,15 +140,13 @@ ManifestFileIterator::~ManifestFileIterator() = default; std::shared_ptr ManifestFileIterator::create( std::shared_ptr manifest_file_deserializer_, - const String & manifest_file_name_, + const IcebergPathFromMetadata & path_to_manifest_file_, Int32 format_version_, - const String & common_path_, + const IcebergPathResolver & path_resolver_, IcebergSchemaProcessor & schema_processor, Int64 inherited_sequence_number_, Int64 inherited_snapshot_id_, - const String & table_location_, DB::ContextPtr context_, - const String & path_to_manifest_file_, std::shared_ptr filter_dag_, Int32 table_snapshot_schema_id_) { @@ -156,7 +154,7 @@ std::shared_ptr ManifestFileIterator::create( context_, manifest_file_deserializer_->getMetadataContent(), DB::IcebergMetadataLogLevel::ManifestFileMetadata, - common_path_, + path_resolver_.getTableRoot(), path_to_manifest_file_, std::nullopt, std::nullopt); @@ -192,7 +190,7 @@ std::shared_ptr ManifestFileIterator::create( throw Exception( ErrorCodes::BAD_ARGUMENTS, "Cannot read Iceberg table: manifest file '{}' doesn't have field '{}' in its metadata", - manifest_file_name_, + path_to_manifest_file_, f_schema); Poco::Dynamic::Var json = parser.parse(*schema_json_string); @@ -236,10 +234,8 @@ std::shared_ptr ManifestFileIterator::create( return std::shared_ptr(new ManifestFileIterator( std::move(manifest_file_deserializer_), path_to_manifest_file_, - manifest_file_name_, format_version_, - common_path_, - table_location_, + path_resolver_, schema_processor, inherited_sequence_number_, inherited_snapshot_id_, @@ -254,11 +250,9 @@ std::shared_ptr ManifestFileIterator::create( ManifestFileIterator::ManifestFileIterator( std::shared_ptr manifest_file_deserializer_, - const String & path_to_manifest_file_, - const String & manifest_file_name_, + const IcebergPathFromMetadata & path_to_manifest_file_, Int32 format_version_, - const String & common_path_, - const String & table_location_, + const IcebergPathResolver & path_resolver_, IcebergSchemaProcessor & 
schema_processor, Int64 inherited_sequence_number_, Int64 inherited_snapshot_id_, @@ -271,10 +265,8 @@ ManifestFileIterator::ManifestFileIterator( Int32 table_snapshot_schema_id_) : manifest_file_deserializer(std::move(manifest_file_deserializer_)) , path_to_manifest_file(path_to_manifest_file_) - , manifest_file_name(manifest_file_name_) , format_version(format_version_) - , common_path(common_path_) - , table_location(table_location_) + , path_resolver(path_resolver_) , inherited_sequence_number(inherited_sequence_number_) , inherited_snapshot_id(inherited_snapshot_id_) , context(context_) @@ -308,7 +300,7 @@ ProcessedManifestFileEntryPtr ManifestFileIterator::processRow(size_t row_index) context, manifest_file_deserializer->getContent(row_index), DB::IcebergMetadataLogLevel::ManifestFileEntry, - common_path, + path_resolver.getTableRoot(), path_to_manifest_file, row_index, std::nullopt); @@ -316,7 +308,6 @@ ProcessedManifestFileEntryPtr ManifestFileIterator::processRow(size_t row_index) } /// Compute inherited/resolved fields - const auto file_path = getProperFilePathFromMetadataInfo(parsed_entry->file_path_key, common_path, table_location); Int64 resolved_snapshot_id; if (parsed_entry->parsed_snapshot_id.has_value()) @@ -328,7 +319,7 @@ ProcessedManifestFileEntryPtr ManifestFileIterator::processRow(size_t row_index) throw Exception( ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, "Cannot read Iceberg table: manifest file '{}' has entry with snapshot_id 'null' for which write file schema is unknown", - manifest_file_name); + path_to_manifest_file); } else { @@ -345,7 +336,7 @@ ProcessedManifestFileEntryPtr ManifestFileIterator::processRow(size_t row_index) throw Exception( ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, "Cannot read Iceberg table: manifest file '{}' has entry with snapshot_id '{}' for which write file schema is unknown", - manifest_file_name, + path_to_manifest_file, resolved_snapshot_id); } catch (const Exception &) @@ -375,7 +366,7 @@ ProcessedManifestFileEntryPtr ManifestFileIterator::processRow(size_t row_index) } auto entry = std::make_shared( - parsed_entry, common_partition_specification, file_path, resolved_sequence_number, resolved_schema_id); + parsed_entry, common_partition_specification, resolved_sequence_number, resolved_schema_id); PruningReturnStatus pruning_status = PruningReturnStatus::NOT_PRUNED; @@ -422,7 +413,7 @@ ProcessedManifestFileEntryPtr ManifestFileIterator::processRow(size_t row_index) context, manifest_file_deserializer->getContent(row_index), DB::IcebergMetadataLogLevel::ManifestFileEntry, - common_path, + path_resolver.getTableRoot(), path_to_manifest_file, row_index, pruning_status); diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileIterator.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileIterator.h index 26b630e81bc7..74a2f1e794b8 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileIterator.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileIterator.h @@ -5,6 +5,7 @@ #if USE_AVRO +#include #include #include #include @@ -76,15 +77,13 @@ class ManifestFileIterator : public boost::noncopyable static std::shared_ptr create( std::shared_ptr manifest_file_deserializer, - const String & manifest_file_name, + const IcebergPathFromMetadata & path_to_manifest_file, Int32 format_version_, - const String & common_path, + const IcebergPathResolver & path_resolver, IcebergSchemaProcessor & schema_processor, Int64 inherited_sequence_number, Int64 inherited_snapshot_id, - const std::string & 
table_location, DB::ContextPtr context, - const String & path_to_manifest_file_, std::shared_ptr filter_dag_, Int32 table_snapshot_schema_id_); @@ -97,8 +96,6 @@ class ManifestFileIterator : public boost::noncopyable std::optional getRowsCountInAllFilesExcludingDeleted(FileContentType content) const; std::optional getBytesCountInAllDataFilesExcludingDeleted() const; - const String & getPathToManifestFile() const { return path_to_manifest_file; } - bool areAllDataFilesSortedBySortOrderID(Int32 sort_order_id) const; /// Returns true if all manifest file entries have been processed @@ -115,11 +112,9 @@ class ManifestFileIterator : public boost::noncopyable private: ManifestFileIterator( std::shared_ptr manifest_file_deserializer, - const String & path_to_manifest_file, - const String & manifest_file_name, + const IcebergPathFromMetadata & path_to_manifest_file, Int32 format_version, - const String & common_path, - const String & table_location, + const IcebergPathResolver & path_resolver, IcebergSchemaProcessor & schema_processor, Int64 inherited_sequence_number, Int64 inherited_snapshot_id, @@ -135,11 +130,9 @@ class ManifestFileIterator : public boost::noncopyable /// Constant properties of this manifest file const std::shared_ptr manifest_file_deserializer; - const String path_to_manifest_file; - const String manifest_file_name; + const IcebergPathFromMetadata path_to_manifest_file; const Int32 format_version; - const String common_path; - const String table_location; + const IcebergPathResolver path_resolver; // always zero in case of format version 1 const Int64 inherited_sequence_number; const Int64 inherited_snapshot_id; diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/MetadataGenerator.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/MetadataGenerator.cpp index bd924af181f1..c4d7cb2222c3 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/MetadataGenerator.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/MetadataGenerator.cpp @@ -105,7 +105,7 @@ Poco::JSON::Object::Ptr MetadataGenerator::getParentSnapshot(Int64 parent_snapsh MetadataGenerator::NextMetadataResult MetadataGenerator::generateNextMetadata( FileNamesGenerator & generator, - const String & metadata_filename, + const Iceberg::IcebergPathFromMetadata & metadata_file_path, Int64 parent_snapshot_id, Int64 added_files, Int64 added_records, @@ -126,7 +126,7 @@ MetadataGenerator::NextMetadataResult MetadataGenerator::generateNextMetadata( } Int64 snapshot_id = user_defined_snapshot_id.value_or(static_cast(dis(gen))); - auto [manifest_list_name, storage_manifest_list_name] = generator.generateManifestListName(snapshot_id, format_version); + auto manifest_list_path = generator.generateManifestListName(snapshot_id, format_version); new_snapshot->set(Iceberg::f_metadata_snapshot_id, snapshot_id); new_snapshot->set(Iceberg::f_parent_snapshot_id, parent_snapshot_id); @@ -171,7 +171,7 @@ MetadataGenerator::NextMetadataResult MetadataGenerator::generateNextMetadata( new_snapshot->set(Iceberg::f_summary, summary); new_snapshot->set(Iceberg::f_schema_id, metadata_object->getValue(Iceberg::f_current_schema_id)); - new_snapshot->set(Iceberg::f_manifest_list, manifest_list_name); + new_snapshot->set(Iceberg::f_manifest_list, manifest_list_path.serialize()); metadata_object->getArray(Iceberg::f_snapshots)->add(new_snapshot); metadata_object->set(Iceberg::f_current_snapshot_id, snapshot_id); @@ -192,7 +192,7 @@ MetadataGenerator::NextMetadataResult MetadataGenerator::generateNextMetadata( { Poco::JSON::Object::Ptr 
new_metadata_item = new Poco::JSON::Object; - new_metadata_item->set(Iceberg::f_metadata_file, metadata_filename); + new_metadata_item->set(Iceberg::f_metadata_file, metadata_file_path.serialize()); new_metadata_item->set(Iceberg::f_timestamp_ms, timestamp); metadata_object->getArray(Iceberg::f_metadata_log)->add(new_metadata_item); } @@ -216,7 +216,7 @@ MetadataGenerator::NextMetadataResult MetadataGenerator::generateNextMetadata( properties->set("write.merge.mode", "merge-on-read"); properties->set("write.update.mode", "merge-on-read"); } - return {new_snapshot, manifest_list_name, storage_manifest_list_name}; + return {new_snapshot, manifest_list_path}; } void MetadataGenerator::generateDropColumnMetadata(const String & column_name) diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/MetadataGenerator.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/MetadataGenerator.h index a5ebdfc593f8..4477430fc49f 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/MetadataGenerator.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/MetadataGenerator.h @@ -1,6 +1,13 @@ #pragma once +#include +#include "config.h" + +#include #include +#include +#include + namespace DB { @@ -15,13 +22,15 @@ class MetadataGenerator struct NextMetadataResult { Poco::JSON::Object::Ptr snapshot = nullptr; - String metadata_path; - String storage_metadata_path; + /// Metadata path for the manifest list file (e.g. "wasb://container@account/table/metadata/snap-xxx.avro"). + /// Use IcebergPathResolver::resolve to get storage path for I/O. + /// Use .serialize() to get the path for writing into Iceberg metadata. + Iceberg::IcebergPathFromMetadata manifest_list_path; }; NextMetadataResult generateNextMetadata( FileNamesGenerator & generator, - const String & metadata_filename, + const Iceberg::IcebergPathFromMetadata & metadata_file_path, Int64 parent_snapshot_id, Int64 added_files, Int64 added_records, diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/MultipleFileWriter.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/MultipleFileWriter.cpp index 5b4e514536d2..c52498b00044 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/MultipleFileWriter.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/MultipleFileWriter.cpp @@ -15,6 +15,7 @@ MultipleFileWriter::MultipleFileWriter( UInt64 max_data_file_num_bytes_, Poco::JSON::Array::Ptr schema, FileNamesGenerator & filename_generator_, + const Iceberg::IcebergPathResolver & path_resolver_, ObjectStoragePtr object_storage_, ContextPtr context_, const std::optional & format_settings_, @@ -26,6 +27,7 @@ MultipleFileWriter::MultipleFileWriter( , aggregate_stats(schema) , current_file_stats(schema) , filename_generator(filename_generator_) + , path_resolver(path_resolver_) , object_storage(object_storage_) , context(context_) , format_settings(format_settings_) @@ -46,14 +48,15 @@ void MultipleFileWriter::startNewFile() current_file_num_rows = 0; current_file_num_bytes = 0; - auto filename = filename_generator.generateDataFileName(); + auto metadata_path = filename_generator.generateDataFileName(); + auto storage_path = path_resolver.resolve(metadata_path); - data_file_names.push_back(filename.path_in_storage); + data_file_names.push_back(metadata_path); if (new_file_path_callback) - new_file_path_callback(filename.path_in_storage); + new_file_path_callback(metadata_path); buffer = object_storage->writeObject( - StoredObject(filename.path_in_storage), WriteMode::Rewrite, std::nullopt, DBMS_DEFAULT_BUFFER_SIZE, context->getWriteSettings()); + 
StoredObject(storage_path), WriteMode::Rewrite, std::nullopt, DBMS_DEFAULT_BUFFER_SIZE, context->getWriteSettings()); if (format_settings) { @@ -84,10 +87,23 @@ void MultipleFileWriter::finalize() output_format->flush(); output_format->finalize(); buffer->finalize(); - const UInt64 file_bytes = buffer->count(); - total_bytes += file_bytes; + + auto buffer_bytes = buffer->count(); + if (buffer_bytes > 0) + { + total_bytes += buffer_bytes; + } + else if (!data_file_names.empty()) + { + /// Some storage backends (e.g. Azure) don't track bytes in the write buffer. + /// Fall back to querying the actual object size. + auto metadata = object_storage->getObjectMetadata(path_resolver.resolve(data_file_names.back()), /*with_tags=*/false); + total_bytes += metadata.size_bytes; + } + per_file_record_counts.push_back(static_cast(*current_file_num_rows)); - per_file_byte_sizes.push_back(static_cast(file_bytes)); + /// todo arthur fix the wrong counter for file bytes, probably by backporting something else + per_file_byte_sizes.push_back(static_cast(buffer_bytes)); per_file_stats_list.push_back(current_file_stats); } @@ -101,7 +117,7 @@ std::vector MultipleFileWriter::getDataFileEntries() const for (size_t i = 0; i < data_file_names.size(); ++i) entries.emplace_back( - data_file_names[i], + path_resolver.resolve(data_file_names[i]), per_file_record_counts[i], per_file_byte_sizes[i], per_file_stats_list[i]); @@ -125,8 +141,8 @@ void MultipleFileWriter::cancel() void MultipleFileWriter::clearAllDataFiles() const { - for (const auto & data_filename : data_file_names) - object_storage->removeObjectIfExists(StoredObject(data_filename)); + for (const auto & metadata_path : data_file_names) + object_storage->removeObjectIfExists(StoredObject(path_resolver.resolve(metadata_path))); } UInt64 MultipleFileWriter::getResultBytes() const diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/MultipleFileWriter.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/MultipleFileWriter.h index eba5511c8d6d..e56400476ad4 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/MultipleFileWriter.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/MultipleFileWriter.h @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -18,6 +19,7 @@ class MultipleFileWriter UInt64 max_data_file_num_bytes_, Poco::JSON::Array::Ptr schema, FileNamesGenerator & filename_generator_, + const Iceberg::IcebergPathResolver & path_resolver_, ObjectStoragePtr object_storage_, ContextPtr context_, const std::optional & format_settings_, @@ -34,7 +36,7 @@ class MultipleFileWriter UInt64 getResultBytes() const; - const std::vector & getDataFiles() const + const std::vector & getDataFiles() const { return data_file_names; } @@ -56,13 +58,14 @@ class MultipleFileWriter DataFileStatistics current_file_stats; /// accumulates for the current file only std::optional current_file_num_rows = std::nullopt; std::optional current_file_num_bytes = std::nullopt; - std::vector data_file_names; + std::vector data_file_names; std::vector per_file_record_counts; std::vector per_file_byte_sizes; std::vector per_file_stats_list; std::unique_ptr buffer; OutputFormatPtr output_format; FileNamesGenerator & filename_generator; + const Iceberg::IcebergPathResolver & path_resolver; ObjectStoragePtr object_storage; ContextPtr context; std::optional format_settings; diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/Mutations.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/Mutations.cpp index 9b4e242087eb..d24be327a040 100644 --- 
a/src/Storages/ObjectStorage/DataLakes/Iceberg/Mutations.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/Mutations.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -27,7 +28,6 @@ #include #include #include -#include #include #include #include @@ -62,13 +62,14 @@ namespace DB::Iceberg #if USE_AVRO -static constexpr const char * block_datafile_path = "_path"; +static constexpr const char * block_datafile_path = "_iceberg_metadata_file_path"; static constexpr const char * block_row_number = "_row_number"; static constexpr auto MAX_TRANSACTION_RETRIES = 100; struct DeleteFileWriteResult { - FileNamesGenerator::Result path; + /// Metadata path (e.g. "wasb://container@account/table/data/uuid-deletes.parquet") + Iceberg::IcebergPathFromMetadata path; Int32 total_rows; Int32 total_bytes; }; @@ -134,10 +135,10 @@ static std::optional writeDataFiles( ObjectStoragePtr object_storage, String write_format, FileNamesGenerator & generator, + const Iceberg::IcebergPathResolver & path_resolver, const std::optional & format_settings, std::optional & chunk_partitioner, - Poco::JSON::Object::Ptr data_schema, - const String& blob_storage_namespace_name) + Poco::JSON::Object::Ptr data_schema) { chassert(commands.size() == 1); @@ -188,11 +189,11 @@ static std::optional writeDataFiles( if (!delete_data_writers.contains(partition_key)) { - auto delete_file_info = generator.generatePositionDeleteFile(); + auto delete_file_path = generator.generatePositionDeleteFile(); - delete_data_result[partition_key].path = delete_file_info; + delete_data_result[partition_key].path = delete_file_path; auto write_buffer = object_storage->writeObject( - StoredObject(delete_file_info.path_in_storage), + StoredObject(path_resolver.resolve(delete_file_path)), WriteMode::Rewrite, std::nullopt, DBMS_DEFAULT_BUFFER_SIZE, @@ -223,21 +224,9 @@ static std::optional writeDataFiles( col_position.column = nullable->getNestedColumnPtr(); } - auto col_data_filename_without_namespaces = ColumnString::create(); - for (size_t i = 0; i < col_data_filename.column->size(); ++i) - { - Field cur_value; - col_data_filename.column->get(i, cur_value); - - String path_without_namespace; - if (cur_value.safeGet().starts_with(blob_storage_namespace_name)) - path_without_namespace = cur_value.safeGet().substr(blob_storage_namespace_name.size()); - - if (!path_without_namespace.starts_with('/')) - path_without_namespace = "/" + path_without_namespace; - col_data_filename_without_namespaces->insert(path_without_namespace); - } - col_data_filename.column = std::move(col_data_filename_without_namespaces); + /// _iceberg_metadata_file_path already contains the correct metadata path format + /// (e.g. wasb://container@host/.../data/xxx.parquet or /iceberg/.../data/xxx.parquet) + /// so no transformation is needed. 
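The delete rows assembled below follow the Iceberg position-delete layout: a string column holding the data file's path in metadata form (taken verbatim from the _iceberg_metadata_file_path virtual column) and an integer column with the row position. A minimal sketch of building such a chunk, with made-up values; the helper name is illustrative and not part of the patch.

#include <Columns/ColumnString.h>
#include <Columns/ColumnsNumber.h>
#include <Processors/Chunk.h>

/// Illustrative only: builds a two-column (file_path, pos) chunk like the one
/// fed to the position-delete writer below.
DB::Chunk makePositionDeleteChunk()
{
    auto file_path_col = DB::ColumnString::create();
    auto pos_col = DB::ColumnInt64::create();

    /// The file_path value is the metadata-form path of the data file whose rows
    /// are being deleted; it must match what the manifests store verbatim.
    const std::string data_file = "wasb://container@account/warehouse/tbl/data/part-0.parquet";
    for (DB::Int64 row : {3, 17, 42})
    {
        file_path_col->insertData(data_file.data(), data_file.size());
        pos_col->insertValue(row);
    }

    DB::Columns columns;
    columns.push_back(std::move(file_path_col));
    columns.push_back(std::move(pos_col));
    return DB::Chunk(std::move(columns), 3);
}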
Columns chunk_pos_delete; chunk_pos_delete.push_back(col_data_filename.column); chunk_pos_delete.push_back(col_position.column); @@ -258,7 +247,13 @@ static std::optional writeDataFiles( delete_data_writers[partition_key]->flush(); delete_data_writers[partition_key]->finalize(); delete_data_write_buffers[partition_key]->finalize(); - delete_data_result[partition_key].total_bytes = static_cast(delete_data_write_buffers[partition_key]->count()); + { + auto delete_bytes = delete_data_write_buffers[partition_key]->count(); + if (delete_bytes == 0) + delete_bytes = object_storage->getObjectMetadata( + path_resolver.resolve(delete_data_result[partition_key].path), /*with_tags=*/ false).size_bytes; + delete_data_result[partition_key].total_bytes = static_cast(delete_bytes); + } } } @@ -289,10 +284,10 @@ static std::optional writeDataFiles( auto it = update_data_writers.find(partition_key); if (it == update_data_writers.end()) { - auto data_file_info = generator.generateDataFileName(); - update_data_result[partition_key].path = data_file_info; + auto data_file_path = generator.generateDataFileName(); + update_data_result[partition_key].path = data_file_path; auto data_write_buffer = object_storage->writeObject( - StoredObject(data_file_info.path_in_storage), + StoredObject(path_resolver.resolve(data_file_path)), WriteMode::Rewrite, std::nullopt, DBMS_DEFAULT_BUFFER_SIZE, @@ -316,7 +311,13 @@ static std::optional writeDataFiles( update_data_writers[partition_key]->flush(); update_data_writers[partition_key]->finalize(); update_data_write_buffers[partition_key]->finalize(); - update_data_result[partition_key].total_bytes = static_cast(update_data_write_buffers[partition_key]->count()); + { + auto update_bytes = update_data_write_buffers[partition_key]->count(); + if (update_bytes == 0) + update_bytes = object_storage->getObjectMetadata( + path_resolver.resolve(update_data_result[partition_key].path), /*with_tags=*/ false).size_bytes; + update_data_result[partition_key].total_bytes = static_cast(update_bytes); + } } } @@ -333,6 +334,7 @@ static bool writeMetadataFiles( ObjectStoragePtr object_storage, ContextPtr context, FileNamesGenerator & filename_generator, + const Iceberg::IcebergPathResolver & path_resolver, const DataLakeStorageSettings & data_lake_settings, String write_format, std::shared_ptr catalog, @@ -343,12 +345,10 @@ static bool writeMetadataFiles( std::optional & chunk_partitioner, Iceberg::FileContentType content_type, SharedHeader sample_block, - CompressionMethod compression_method, - bool write_metadata_json_file, - const String& blob_storage_type_name, - const String& blob_storage_namespace_name) + bool write_metadata_json_file) { - auto [metadata_name, storage_metadata_name] = filename_generator.generateMetadataName(); + auto metadata_info = filename_generator.generateMetadataPathWithInfo(); + auto storage_metadata_name = path_resolver.resolve(metadata_info.path); Int64 parent_snapshot = -1; if (metadata->has(Iceberg::f_current_snapshot_id)) parent_snapshot = metadata->getValue(Iceberg::f_current_snapshot_id); @@ -364,53 +364,47 @@ static bool writeMetadataFiles( } Poco::JSON::Object::Ptr new_snapshot; - String manifest_list_name; String storage_manifest_list_name; if (content_type == Iceberg::FileContentType::POSITION_DELETE) { - auto result_generation_metadata = MetadataGenerator(metadata) - .generateNextMetadata( - filename_generator, - metadata_name, - parent_snapshot, - /* added_files */0, - /* added_records */0, - total_bytes, - /* num_partitions */total_files, - /* 
added_delete_files */total_files, - total_rows); - new_snapshot = result_generation_metadata.snapshot; - manifest_list_name = result_generation_metadata.metadata_path; - storage_manifest_list_name = result_generation_metadata.storage_metadata_path; + auto result = MetadataGenerator(metadata).generateNextMetadata( + filename_generator, + metadata_info.path, + parent_snapshot, + /* added_files */ 0, + /* added_records */ 0, + total_bytes, + /* num_partitions */ total_files, + /* added_delete_files */ total_files, + total_rows); + new_snapshot = result.snapshot; + storage_manifest_list_name = path_resolver.resolve(result.manifest_list_path); } else { - auto result_generation_metadata = MetadataGenerator(metadata) - .generateNextMetadata( - filename_generator, - metadata_name, - parent_snapshot, - /* added_files */total_files, - /* added_records */total_rows, - total_bytes, - /* num_partitions */total_files, - /* added_delete_files */0, - /*num_deleted_rows*/0); - new_snapshot = result_generation_metadata.snapshot; - manifest_list_name = result_generation_metadata.metadata_path; - storage_manifest_list_name = result_generation_metadata.storage_metadata_path; - + auto result = MetadataGenerator(metadata).generateNextMetadata( + filename_generator, + metadata_info.path, + parent_snapshot, + /* added_files */ total_files, + /* added_records */ total_rows, + total_bytes, + /* num_partitions */ total_files, + /* added_delete_files */ 0, + /*num_deleted_rows*/ 0); + new_snapshot = result.snapshot; + storage_manifest_list_name = path_resolver.resolve(result.manifest_list_path); } auto manifest_entries_in_storage = std::make_shared(); - Strings manifest_entries; - Int32 manifest_lengths = 0; + std::vector manifest_entries; + std::vector manifest_entry_sizes; - auto cleanup = [object_storage, delete_filenames, manifest_entries_in_storage, storage_manifest_list_name, storage_metadata_name]() + auto cleanup = [object_storage, &delete_filenames, &path_resolver, manifest_entries_in_storage, storage_manifest_list_name, storage_metadata_name]() { try { for (const auto & [_, data_file] : delete_filenames.delete_file) - object_storage->removeObjectIfExists(StoredObject(data_file.path.path_in_storage)); + object_storage->removeObjectIfExists(StoredObject(path_resolver.resolve(data_file.path))); for (const auto & manifest_filename_in_storage : *manifest_entries_in_storage) object_storage->removeObjectIfExists(StoredObject(manifest_filename_in_storage)); @@ -427,12 +421,12 @@ static bool writeMetadataFiles( { for (const auto & [partition_key, delete_filename] : delete_filenames.delete_file) { - auto [manifest_entry_name, storage_manifest_entry_name] = filename_generator.generateManifestEntryName(); - manifest_entries_in_storage->push_back(storage_manifest_entry_name); - manifest_entries.push_back(manifest_entry_name); + auto manifest_entry_path = filename_generator.generateManifestEntryName(); + manifest_entries_in_storage->push_back(path_resolver.resolve(manifest_entry_path)); + manifest_entries.push_back(manifest_entry_path); auto buffer_manifest_entry = object_storage->writeObject( - StoredObject(storage_manifest_entry_name), + StoredObject(path_resolver.resolve(manifest_entry_path)), WriteMode::Rewrite, std::nullopt, DBMS_DEFAULT_BUFFER_SIZE, @@ -444,7 +438,7 @@ static bool writeMetadataFiles( chunk_partitioner ? chunk_partitioner->getColumns() : std::vector{}, partition_key, chunk_partitioner ? 
chunk_partitioner->getResultTypes() : std::vector{}, - {delete_filename.path.path_in_metadata}, + {delete_filename.path}, delete_filenames.delete_statistic.at(partition_key), sample_block, new_snapshot, @@ -454,7 +448,12 @@ static bool writeMetadataFiles( *buffer_manifest_entry, content_type); buffer_manifest_entry->finalize(); - manifest_lengths += buffer_manifest_entry->count(); + auto size = buffer_manifest_entry->count(); + if (size == 0) + { + size = object_storage->getObjectMetadata(path_resolver.resolve(manifest_entry_path), /*with_tags=*/false).size_bytes; + } + manifest_entry_sizes.push_back(size); } catch (...) { @@ -474,13 +473,13 @@ static bool writeMetadataFiles( try { generateManifestList( - filename_generator, + path_resolver, metadata, object_storage, context, manifest_entries, new_snapshot, - manifest_lengths, + manifest_entry_sizes, *buffer_manifest_list, content_type); buffer_manifest_list->finalize(); @@ -503,15 +502,14 @@ static bool writeMetadataFiles( throw Exception(ErrorCodes::BAD_ARGUMENTS, "Failpoint for cleanup enabled"); }); - auto hint = filename_generator.generateVersionHint(); + auto hint_path = filename_generator.generateVersionHint(); if (!writeMetadataFileAndVersionHint( - storage_metadata_name, + path_resolver, + metadata_info, json_representation, - hint.path_in_storage, - storage_metadata_name, + hint_path, object_storage, context, - compression_method, data_lake_settings[DataLakeStorageSetting::iceberg_use_version_hint])) { cleanup(); @@ -520,10 +518,7 @@ static bool writeMetadataFiles( if (catalog) { - String catalog_filename = metadata_name; - if (!catalog_filename.starts_with(blob_storage_type_name)) - catalog_filename = blob_storage_type_name + "://" + blob_storage_namespace_name + "/" + metadata_name; - + auto catalog_filename = path_resolver.resolveForCatalog(metadata_info.path); const auto & [namespace_name, table_name] = DataLake::parseTableName(table_id.getTableName()); if (!catalog->updateMetadata(namespace_name, table_name, catalog_filename, new_snapshot)) { @@ -551,9 +546,7 @@ void mutate( PersistentTableComponents & persistent_table_components, const String & write_format, const std::optional & format_settings, - std::shared_ptr catalog, - const String & blob_storage_type_name, - const String & blob_storage_namespace_name) + std::shared_ptr catalog) { auto common_path = persistent_table_components.table_path; if (!common_path.starts_with('/')) @@ -562,7 +555,6 @@ void mutate( int max_retries = MAX_TRANSACTION_RETRIES; while (--max_retries > 0) { - FileNamesGenerator filename_generator(common_path, common_path, false, CompressionMethod::None, write_format); auto log = getLogger("IcebergMutations"); auto [last_version, metadata_path, compression_method] = getLatestOrExplicitMetadataFileAndVersion( object_storage, @@ -571,8 +563,10 @@ void mutate( persistent_table_components.metadata_cache, context, log.get(), - persistent_table_components.table_uuid); + persistent_table_components.table_uuid, + persistent_table_components.metadata_compression_method); + FileNamesGenerator filename_generator(persistent_table_components.path_resolver.getTableLocation(), false, CompressionMethod::None, write_format); filename_generator.setVersion(last_version + 1); filename_generator.setCompressionMethod(compression_method); @@ -616,10 +610,10 @@ void mutate( object_storage, write_format, filename_generator, + persistent_table_components.path_resolver, format_settings, chunk_partitioner, - current_schema, - blob_storage_namespace_name); + current_schema); if 
(mutation_files) { @@ -628,6 +622,7 @@ void mutate( object_storage, context, filename_generator, + persistent_table_components.path_resolver, data_lake_settings, write_format, catalog, @@ -638,10 +633,7 @@ void mutate( chunk_partitioner, Iceberg::FileContentType::POSITION_DELETE, std::make_shared(getPositionDeleteFileSampleBlock()), - compression_method, - !mutation_files->data_file, - blob_storage_type_name, - blob_storage_namespace_name); + !mutation_files->data_file); if (!result_delete_files_metadata) continue; @@ -652,6 +644,7 @@ void mutate( object_storage, context, filename_generator, + persistent_table_components.path_resolver, data_lake_settings, write_format, catalog, @@ -662,10 +655,7 @@ void mutate( chunk_partitioner, Iceberg::FileContentType::DATA, sample_block, - compression_method, - true, - blob_storage_type_name, - blob_storage_namespace_name); + true); if (!result_data_files_metadata) { continue; @@ -693,8 +683,6 @@ void alter( size_t i = 0; while (i++ < MAX_TRANSACTION_RETRIES) { - FileNamesGenerator filename_generator( - persistent_table_components.table_path, persistent_table_components.table_path, false, CompressionMethod::None, write_format); auto log = getLogger("IcebergMutations"); auto [last_version, metadata_path, compression_method] = getLatestOrExplicitMetadataFileAndVersion( object_storage, @@ -703,8 +691,10 @@ void alter( persistent_table_components.metadata_cache, context, log.get(), - persistent_table_components.table_uuid); + persistent_table_components.table_uuid, + persistent_table_components.metadata_compression_method); + FileNamesGenerator filename_generator(persistent_table_components.path_resolver.getTableLocation(), false, CompressionMethod::None, write_format); filename_generator.setVersion(last_version + 1); filename_generator.setCompressionMethod(compression_method); @@ -736,17 +726,16 @@ void alter( Poco::JSON::Stringifier::stringify(metadata, oss, 4); std::string json_representation = removeEscapedSlashes(oss.str()); - auto [metadata_name, storage_metadata_name] = filename_generator.generateMetadataName(); + auto metadata_info = filename_generator.generateMetadataPathWithInfo(); - auto hint = filename_generator.generateVersionHint(); + auto hint_path = filename_generator.generateVersionHint(); if (writeMetadataFileAndVersionHint( - storage_metadata_name, + persistent_table_components.path_resolver, + metadata_info, json_representation, - hint.path_in_storage, - storage_metadata_name, + hint_path, object_storage, context, - compression_method, data_lake_settings[DataLakeStorageSetting::iceberg_use_version_hint])) break; } @@ -936,14 +925,14 @@ static std::pair, Strings> applyRetentionPolicy( static void collectAllFilePaths( const Iceberg::ManifestFileIterator::ManifestFileEntriesHandle & entries_handle, - std::set & out) + std::set & out) { for (const auto & entry : entries_handle.getFilesWithoutDeleted(FileContentType::DATA)) - out.insert(entry->file_path); + out.insert(entry->parsed_entry->file_path_key); for (const auto & entry : entries_handle.getFilesWithoutDeleted(FileContentType::POSITION_DELETE)) - out.insert(entry->file_path); + out.insert(entry->parsed_entry->file_path_key); for (const auto & entry : entries_handle.getFilesWithoutDeleted(FileContentType::EQUALITY_DELETE)) - out.insert(entry->file_path); + out.insert(entry->parsed_entry->file_path_key); } /// Collect all file paths (manifest lists, manifests, data/delete files) @@ -966,9 +955,9 @@ static void collectRetainedFiles( ContextPtr context, LoggerPtr log, Int32 
current_schema_id, - std::set & retained_manifest_paths, - std::set & retained_data_file_paths, - std::set & retained_manifest_list_paths) + std::set & retained_manifest_paths, + std::set & retained_data_file_paths, + std::set & retained_manifest_list_paths) { for (UInt32 i = 0; i < retained_snapshots->size(); ++i) { @@ -976,14 +965,10 @@ static void collectRetainedFiles( if (!snapshot->has(Iceberg::f_manifest_list)) continue; - String manifest_list_path = snapshot->getValue(Iceberg::f_manifest_list); + auto manifest_list_path = IcebergPathFromMetadata::deserialize(snapshot->getValue(Iceberg::f_manifest_list)); retained_manifest_list_paths.insert(manifest_list_path); - String storage_manifest_list_path = getProperFilePathFromMetadataInfo( - manifest_list_path, persistent_table_components.table_path, persistent_table_components.table_location); - - auto manifest_keys = getManifestList( - object_storage, persistent_table_components, context, storage_manifest_list_path, log); + auto manifest_keys = getManifestList(object_storage, persistent_table_components, context, manifest_list_path, log); for (const auto & mf_key : manifest_keys) { @@ -998,7 +983,7 @@ static void collectRetainedFiles( struct ExpiredFiles { - Strings all_paths; + std::vector all_paths; Int64 data_files = 0; Int64 position_delete_files = 0; Int64 equality_delete_files = 0; @@ -1008,10 +993,10 @@ struct ExpiredFiles /// Collect files from expired snapshots that are not referenced by any retained snapshot. static ExpiredFiles collectExpiredFiles( - const std::vector & expired_manifest_list_paths, - const std::set & retained_manifest_list_paths, - const std::set & retained_manifest_paths, - const std::set & retained_data_file_paths, + const std::vector & expired_manifest_list_paths, + const std::set & retained_manifest_list_paths, + const std::set & retained_manifest_paths, + const std::set & retained_data_file_paths, ObjectStoragePtr object_storage, PersistentTableComponents & persistent_table_components, ContextPtr context, @@ -1019,28 +1004,24 @@ static ExpiredFiles collectExpiredFiles( Int32 current_schema_id) { ExpiredFiles result; - std::set seen_expired_manifest_list_paths; - std::set seen_expired_manifest_paths; - for (const auto & ml_path : expired_manifest_list_paths) + std::set seen_expired_manifest_list_paths; + std::set seen_expired_manifest_paths; + for (const auto & manifest_list_path : expired_manifest_list_paths) { - if (retained_manifest_list_paths.contains(ml_path)) + if (retained_manifest_list_paths.contains(manifest_list_path)) continue; - if (seen_expired_manifest_list_paths.contains(ml_path)) + if (seen_expired_manifest_list_paths.contains(manifest_list_path)) continue; - String storage_ml_path = getProperFilePathFromMetadataInfo( - ml_path, persistent_table_components.table_path, persistent_table_components.table_location); - ManifestFileCacheKeys manifest_keys; try { - manifest_keys = getManifestList( - object_storage, persistent_table_components, context, storage_ml_path, log); + manifest_keys = getManifestList(object_storage, persistent_table_components, context, manifest_list_path, log); } catch (...) 
{ - LOG_WARNING(log, "Failed to read manifest list {}, skipping", storage_ml_path); + LOG_WARNING(log, "Failed to read manifest list {}, skipping", manifest_list_path); continue; } @@ -1059,21 +1040,21 @@ static ExpiredFiles collectExpiredFiles( mf_key, current_schema_id); for (const auto & entry : entries_handle.getFilesWithoutDeleted(FileContentType::DATA)) - if (!retained_data_file_paths.contains(entry->file_path)) + if (!retained_data_file_paths.contains(entry->parsed_entry->file_path_key)) { - result.all_paths.push_back(entry->file_path); + result.all_paths.push_back(entry->parsed_entry->file_path_key); ++result.data_files; } for (const auto & entry : entries_handle.getFilesWithoutDeleted(FileContentType::POSITION_DELETE)) - if (!retained_data_file_paths.contains(entry->file_path)) + if (!retained_data_file_paths.contains(entry->parsed_entry->file_path_key)) { - result.all_paths.push_back(entry->file_path); + result.all_paths.push_back(entry->parsed_entry->file_path_key); ++result.position_delete_files; } for (const auto & entry : entries_handle.getFilesWithoutDeleted(FileContentType::EQUALITY_DELETE)) - if (!retained_data_file_paths.contains(entry->file_path)) + if (!retained_data_file_paths.contains(entry->parsed_entry->file_path_key)) { - result.all_paths.push_back(entry->file_path); + result.all_paths.push_back(entry->parsed_entry->file_path_key); ++result.equality_delete_files; } } @@ -1088,8 +1069,8 @@ static ExpiredFiles collectExpiredFiles( ++result.manifest_files; } - seen_expired_manifest_list_paths.insert(ml_path); - result.all_paths.push_back(storage_ml_path); + seen_expired_manifest_list_paths.insert(manifest_list_path); + result.all_paths.push_back(manifest_list_path); ++result.manifest_lists; } return result; @@ -1123,7 +1104,7 @@ struct SnapshotPartition { Poco::JSON::Array::Ptr retained_snapshots = new Poco::JSON::Array; std::set expired_snapshot_ids; - std::vector expired_manifest_list_paths; + std::vector expired_manifest_list_paths; }; /// Split snapshots into retained and expired. @@ -1152,7 +1133,8 @@ static SnapshotPartition partitionSnapshots( { result.expired_snapshot_ids.insert(snap_id); if (snapshot->has(Iceberg::f_manifest_list)) - result.expired_manifest_list_paths.push_back(snapshot->getValue(Iceberg::f_manifest_list)); + result.expired_manifest_list_paths.push_back( + Iceberg::IcebergPathFromMetadata::deserialize(snapshot->getValue(Iceberg::f_manifest_list))); } } return result; @@ -1203,7 +1185,7 @@ static SnapshotPartition partitionSnapshotsByIds( { result.expired_snapshot_ids.insert(snap_id); if (snapshot->has(Iceberg::f_manifest_list)) - result.expired_manifest_list_paths.push_back(snapshot->getValue(Iceberg::f_manifest_list)); + result.expired_manifest_list_paths.push_back(Iceberg::IcebergPathFromMetadata::deserialize(snapshot->getValue(Iceberg::f_manifest_list))); } else { @@ -1239,7 +1221,8 @@ static void updateMetadataForExpiration( } static void deleteExpiredFiles( - const Strings & files_to_delete, + const std::vector & files_to_delete, + const Iceberg::IcebergPathResolver & path_resolver, ObjectStoragePtr object_storage, LoggerPtr log) { @@ -1247,7 +1230,7 @@ static void deleteExpiredFiles( { try { - object_storage->removeObjectIfExists(StoredObject(file_path)); + object_storage->removeObjectIfExists(StoredObject(path_resolver.resolve(file_path))); LOG_DEBUG(log, "Deleted expired file {}", file_path); } catch (...) 
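Stripped of the retry and logging machinery, the expiration pass above is a set difference over metadata paths followed by resolution at deletion time. A compact sketch of that core step; the helper and variable names are illustrative, while the path types come from this patch.

#include <set>
#include <string>
#include <vector>
#include <Storages/ObjectStorage/DataLakes/Iceberg/IcebergPath.h>

using DB::Iceberg::IcebergPathFromMetadata;
using DB::Iceberg::IcebergPathResolver;

/// Keep only candidates that no retained snapshot still references, and turn
/// them into storage paths ready for removeObjectIfExists().
std::vector<std::string> storagePathsToDelete(
    const std::vector<IcebergPathFromMetadata> & expired_candidates,
    const std::set<IcebergPathFromMetadata> & retained,
    const IcebergPathResolver & resolver)
{
    std::vector<std::string> result;
    for (const auto & path : expired_candidates)
        if (!retained.contains(path))
            result.push_back(resolver.resolve(path));
    return result;
}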
@@ -1276,8 +1259,6 @@ ExpireSnapshotsResult expireSnapshots( PersistentTableComponents & persistent_table_components, const String & write_format, std::shared_ptr catalog, - const String & blob_storage_type_name, - const String & blob_storage_namespace_name, const String & table_name) { auto common_path = persistent_table_components.table_path; @@ -1287,7 +1268,7 @@ ExpireSnapshotsResult expireSnapshots( int max_retries = MAX_TRANSACTION_RETRIES; while (--max_retries > 0) { - FileNamesGenerator filename_generator(common_path, common_path, false, CompressionMethod::None, write_format); + FileNamesGenerator filename_generator(persistent_table_components.path_resolver.getTableLocation(), false, CompressionMethod::None, write_format); auto log = getLogger("IcebergExpireSnapshots"); auto [last_version, metadata_path, compression_method] = getLatestOrExplicitMetadataFileAndVersion( object_storage, @@ -1296,13 +1277,20 @@ ExpireSnapshotsResult expireSnapshots( persistent_table_components.metadata_cache, context, log.get(), - persistent_table_components.table_uuid); + persistent_table_components.table_uuid, + persistent_table_components.metadata_compression_method); filename_generator.setVersion(last_version + 1); filename_generator.setCompressionMethod(compression_method); auto metadata = getMetadataJSONObject( - metadata_path, object_storage, persistent_table_components.metadata_cache, context, log, compression_method, persistent_table_components.table_uuid); + metadata_path, + object_storage, + persistent_table_components.metadata_cache, + context, + log, + compression_method, + persistent_table_components.table_uuid); if (metadata->getValue(f_format_version) < 2) throw Exception(ErrorCodes::BAD_ARGUMENTS, "expire_snapshots is supported only for the second version of iceberg format"); @@ -1347,9 +1335,9 @@ ExpireSnapshotsResult expireSnapshots( Int32 current_schema_id = metadata->getValue(Iceberg::f_current_schema_id); - std::set retained_manifest_paths; - std::set retained_data_file_paths; - std::set retained_manifest_list_paths; + std::set retained_manifest_paths; + std::set retained_data_file_paths; + std::set retained_manifest_list_paths; collectRetainedFiles( partition.retained_snapshots, object_storage, persistent_table_components, context, log, current_schema_id, retained_manifest_paths, retained_data_file_paths, retained_manifest_list_paths); @@ -1375,16 +1363,15 @@ ExpireSnapshotsResult expireSnapshots( std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM Poco::JSON::Stringifier::stringify(metadata, oss, 4); std::string json_representation = removeEscapedSlashes(oss.str()); - auto [metadata_name, storage_metadata_name] = filename_generator.generateMetadataName(); - auto hint = filename_generator.generateVersionHint(); + auto metadata_info = filename_generator.generateMetadataPathWithInfo(); + auto hint_path = filename_generator.generateVersionHint(); if (!writeMetadataFileAndVersionHint( - storage_metadata_name, + persistent_table_components.path_resolver, + metadata_info, json_representation, - hint.path_in_storage, - storage_metadata_name, + hint_path, object_storage, context, - compression_method, data_lake_settings[DataLakeStorageSetting::iceberg_use_version_hint])) { LOG_WARNING(log, "Metadata commit conflict during expire_snapshots, retrying ({} retries left)", max_retries); @@ -1393,10 +1380,7 @@ ExpireSnapshotsResult expireSnapshots( if (catalog) { - String catalog_filename = metadata_name; - if (!catalog_filename.starts_with(blob_storage_type_name)) - 
catalog_filename = blob_storage_type_name + "://" + blob_storage_namespace_name + "/" + metadata_name; - + auto catalog_filename = persistent_table_components.path_resolver.resolveForCatalog(metadata_info.path); const auto & [namespace_name, parsed_table_name] = DataLake::parseTableName(table_name); if (!catalog->updateMetadata(namespace_name, parsed_table_name, catalog_filename, nullptr)) { @@ -1408,7 +1392,7 @@ ExpireSnapshotsResult expireSnapshots( } LOG_INFO(log, "Deleting {} expired files for {} expired snapshots", expired_files.all_paths.size(), partition.expired_snapshot_ids.size()); - deleteExpiredFiles(expired_files.all_paths, object_storage, log); + deleteExpiredFiles(expired_files.all_paths, persistent_table_components.path_resolver, object_storage, log); LOG_INFO(log, "Expired {} snapshots, deleted {} files", partition.expired_snapshot_ids.size(), expired_files.all_paths.size()); return ExpireSnapshotsResult{ diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/Mutations.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/Mutations.h index f46ec2823509..a182bb9aa1b6 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/Mutations.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/Mutations.h @@ -31,9 +31,7 @@ void mutate( PersistentTableComponents & persistent_table_components, const String & write_format, const std::optional & format_settings, - std::shared_ptr catalog, - const String & blob_storage_type_name, - const String & blob_storage_namespace_name); + std::shared_ptr catalog); void alter( const AlterCommands & params, @@ -51,8 +49,6 @@ ExpireSnapshotsResult expireSnapshots( PersistentTableComponents & persistent_table_components, const String & write_format, std::shared_ptr catalog, - const String & blob_storage_type_name, - const String & blob_storage_namespace_name, const String & table_name); } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/PersistentTableComponents.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/PersistentTableComponents.h index 7a1aa97afc61..b4a77da9a842 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/PersistentTableComponents.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/PersistentTableComponents.h @@ -5,6 +5,7 @@ #include #include +#include #include namespace DB::Iceberg @@ -20,6 +21,7 @@ struct PersistentTableComponents const CompressionMethod metadata_compression_method; const String table_path; const std::optional table_uuid; + const IcebergPathResolver path_resolver; }; } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/PositionDeleteTransform.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/PositionDeleteTransform.cpp index 8d7820f9ea71..d60719dc3b4f 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/PositionDeleteTransform.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/PositionDeleteTransform.cpp @@ -60,7 +60,7 @@ Poco::JSON::Array::Ptr IcebergPositionDeleteTransform::getSchemaFields() void IcebergPositionDeleteTransform::initializeDeleteSources() { /// Create filter on the data object to get interested rows - auto iceberg_data_path = iceberg_object_info->info.data_object_file_path_key; + auto iceberg_data_path = iceberg_object_info->info.data_object_file_path_key.serialize(); ASTPtr where_ast = makeASTFunction( "equals", make_intrusive(IcebergPositionDeleteTransform::data_file_path_column_name), @@ -155,10 +155,10 @@ void IcebergBitmapPositionDeleteTransform::initialize() { // Add filename matching check auto filename_in_delete_record = filename_column->getDataAt(i); - auto 
current_data_file_path = iceberg_object_info->info.data_object_file_path_key; + auto current_data_file_path = iceberg_object_info->info.data_object_file_path_key.serialize(); // Only add to delete bitmap when the filename in delete record matches current data file path - if (filename_in_delete_record == current_data_file_path || filename_in_delete_record == "/" + current_data_file_path) + if (filename_in_delete_record == current_data_file_path) { auto position_to_delete = position_column->get64(i); bitmap.add(position_to_delete); diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h index 47fc360ad13d..7ace7a71b364 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h @@ -36,7 +36,7 @@ struct IcebergHistoryRecord DB::DateTime64 made_current_at; Int64 parent_id; bool is_current_ancestor; - String manifest_list_path; + Iceberg::IcebergPathFromMetadata manifest_list_path; Int32 added_files = 0; Int32 added_records = 0; diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/StatelessMetadataFileGetter.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/StatelessMetadataFileGetter.cpp index 80b61b2fa728..57b8b9076efb 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/StatelessMetadataFileGetter.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/StatelessMetadataFileGetter.cpp @@ -69,7 +69,7 @@ Iceberg::ManifestFileCacheableInfo getManifestFile( const PersistentTableComponents & persistent_table_components, ContextPtr local_context, LoggerPtr log, - const String & filename, + const IcebergPathFromMetadata & filename, size_t bytes_size) { auto log_level = local_context->getSettingsRef()[Setting::iceberg_metadata_log_level].value; @@ -79,7 +79,7 @@ Iceberg::ManifestFileCacheableInfo getManifestFile( auto create_fn = [&, use_iceberg_metadata_cache]() { - RelativePathWithMetadata manifest_object_info(filename); + RelativePathWithMetadata manifest_object_info(persistent_table_components.path_resolver.resolve(filename)); auto read_settings = local_context->getReadSettings(); /// Do not utilize filesystem cache if more precise cache enabled @@ -96,7 +96,7 @@ Iceberg::ManifestFileCacheableInfo getManifestFile( if (use_iceberg_metadata_cache && persistent_table_components.table_uuid.has_value()) { auto manifest_file = persistent_table_components.metadata_cache->getOrSetManifestFile( - IcebergMetadataFilesCache::getKey(persistent_table_components.table_uuid.value(), filename), create_fn); + IcebergMetadataFilesCache::getKey(persistent_table_components.table_uuid.value(), filename.serialize()), create_fn); return manifest_file; } return create_fn(); @@ -122,13 +122,11 @@ Iceberg::ManifestFileIterator::ManifestFileEntriesHandle getManifestFileEntriesH cacheable_info.deserializer, cache_key.manifest_file_path, persistent_table_components.format_version, - persistent_table_components.table_path, + persistent_table_components.path_resolver, *persistent_table_components.schema_processor, cache_key.added_sequence_number, cache_key.added_snapshot_id, - persistent_table_components.table_location, local_context, - cache_key.manifest_file_path, nullptr, table_snapshot_schema_id); @@ -143,7 +141,7 @@ ManifestFileCacheKeys getManifestList( ObjectStoragePtr object_storage, const PersistentTableComponents & persistent_table_components, ContextPtr local_context, - const String & filename, + const IcebergPathFromMetadata & filename, LoggerPtr log) { 
IcebergMetadataLogLevel log_level = local_context->getSettingsRef()[Setting::iceberg_metadata_log_level].value; @@ -153,7 +151,7 @@ ManifestFileCacheKeys getManifestList( auto create_fn = [&, use_iceberg_metadata_cache]() { - RelativePathWithMetadata object_info(filename); + RelativePathWithMetadata object_info(persistent_table_components.path_resolver.resolve(filename)); auto read_settings = local_context->getReadSettings(); /// Do not utilize filesystem cache if more precise cache enabled @@ -169,17 +167,15 @@ ManifestFileCacheKeys getManifestList( local_context, manifest_list_deserializer.getMetadataContent(), DB::IcebergMetadataLogLevel::ManifestListMetadata, - persistent_table_components.table_path, + persistent_table_components.path_resolver.getTableRoot(), filename, std::nullopt, std::nullopt); for (size_t i = 0; i < manifest_list_deserializer.rows(); ++i) { - const std::string file_path - = manifest_list_deserializer.getValueFromRowByName(i, f_manifest_path, TypeIndex::String).safeGet(); - const auto manifest_file_name = getProperFilePathFromMetadataInfo( - file_path, persistent_table_components.table_path, persistent_table_components.table_location); + const IcebergPathFromMetadata manifest_file_name = IcebergPathFromMetadata::deserialize( + manifest_list_deserializer.getValueFromRowByName(i, f_manifest_path, TypeIndex::String).safeGet()); Int64 added_sequence_number = 0; auto added_snapshot_id = manifest_list_deserializer.getValueFromRowByName(i, f_added_snapshot_id); if (added_snapshot_id.isNull()) @@ -214,7 +210,7 @@ ManifestFileCacheKeys getManifestList( local_context, manifest_list_deserializer.getContent(i), DB::IcebergMetadataLogLevel::ManifestListEntry, - persistent_table_components.table_path, + persistent_table_components.path_resolver.getTableRoot(), filename, i, std::nullopt); @@ -228,7 +224,7 @@ ManifestFileCacheKeys getManifestList( ManifestFileCacheKeys manifest_file_cache_keys; if (use_iceberg_metadata_cache && persistent_table_components.table_uuid.has_value()) manifest_file_cache_keys = persistent_table_components.metadata_cache->getOrSetManifestFileCacheKeys( - IcebergMetadataFilesCache::getKey(persistent_table_components.table_uuid.value(), filename), create_fn); + IcebergMetadataFilesCache::getKey(persistent_table_components.table_uuid.value(), filename.serialize()), create_fn); else manifest_file_cache_keys = create_fn(); return manifest_file_cache_keys; diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/StatelessMetadataFileGetter.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/StatelessMetadataFileGetter.h index ac5a7f7bc06e..4949de8d33e1 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/StatelessMetadataFileGetter.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/StatelessMetadataFileGetter.h @@ -28,7 +28,7 @@ Iceberg::ManifestFileCacheableInfo getManifestFile( const PersistentTableComponents & persistent_table_components, ContextPtr local_context, LoggerPtr log, - const String & filename, + const IcebergPathFromMetadata & filename, size_t bytes_size); /// Creates a fully initialized ManifestFileIterator from a cache key. 
@@ -47,7 +47,7 @@ ManifestFileCacheKeys getManifestList( ObjectStoragePtr object_storage, const PersistentTableComponents & persistent_table_components, ContextPtr local_context, - const String & filename, + const IcebergPathFromMetadata & filename, LoggerPtr log); } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp index 758c4a7aede5..5c7f24ebb805 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -78,6 +79,11 @@ namespace DB::DataLakeStorageSetting extern const DataLakeStorageSettingsNonZeroUInt64 iceberg_format_version; } +namespace DB::Setting +{ +extern const SettingsString iceberg_metadata_compression_method; +} + namespace ProfileEvents { extern const Event IcebergVersionHintUsed; @@ -123,7 +129,7 @@ static bool isTemporaryMetadataFile(const String & file_name) return Poco::UUID{}.tryParse(substring); } -Iceberg::MetadataFileWithInfo getMetadataFileAndVersion(const std::string & path) +static MetadataFileWithInfo getMetadataFileAndVersion(const std::string & path) { String file_name = std::filesystem::path(path).filename(); if (isTemporaryMetadataFile(file_name)) @@ -146,9 +152,53 @@ Iceberg::MetadataFileWithInfo getMetadataFileAndVersion(const std::string & path ErrorCodes::BAD_ARGUMENTS, "Bad metadata file name: '{}'. Expected vN.metadata.json where N is a number", file_name); return MetadataFileWithInfo{ - .version = std::stoi(version_str), - .path = path, - .compression_method = getCompressionMethodFromMetadataFile(path)}; + .version = std::stoi(version_str), .path = path, .compression_method = getCompressionMethodFromMetadataFile(path)}; +} + +/// Resolve metadata filename from version hint content. +/// Version hint may contain just a version number (e.g. "1") or a full filename (e.g. "v1.metadata.json"). +/// When only a version number is given, we try to find the actual file, which may have a compression suffix. +static std::optional resolveMetadataFilenameFromVersionHint( + const String & version_hint_content, + const String & table_path, + const ObjectStoragePtr & object_storage, + CompressionMethod known_compression_method, + const ContextPtr & local_context) +{ + String metadata_file = version_hint_content; + if (metadata_file.ends_with(".metadata.json")) + return metadata_file; + + if (!std::all_of(metadata_file.begin(), metadata_file.end(), isdigit)) + return metadata_file + ".metadata.json"; + + /// Version hint is a number. Try to find the actual file. + String version_number = metadata_file; + + /// First try without compression. + String candidate = "v" + version_number + ".metadata.json"; + auto candidate_path = std::filesystem::path(table_path) / "metadata" / candidate; + if (object_storage->exists(StoredObject(candidate_path))) + return candidate; + + /// Try with known compression method. + auto compression_method = known_compression_method; + if (compression_method == CompressionMethod::None) + { + auto compression_method_str = local_context->getSettingsRef()[Setting::iceberg_metadata_compression_method].value; + compression_method = chooseCompressionMethod(compression_method_str, compression_method_str); + } + if (compression_method != CompressionMethod::None) + { + auto suffix = toContentEncodingName(compression_method); + String compressed_candidate = "v" + version_number + "." 
+ suffix + ".metadata.json"; + auto compressed_path = std::filesystem::path(table_path) / "metadata" / compressed_candidate; + if (object_storage->exists(StoredObject(compressed_path))) + return compressed_candidate; + } + + /// Nothing found via direct checks. + return std::nullopt; } @@ -181,21 +231,29 @@ void writeMessageToFile( } bool writeMetadataFileAndVersionHint( - const std::string & metadata_file_path, + const IcebergPathResolver & resolver, + const GeneratedMetadataFileWithInfo & metadata_file_info, const std::string & metadata_file_content, - const std::string & version_hint_path, - std::string version_hint_content, + const IcebergPathFromMetadata & version_hint_path, DB::ObjectStoragePtr object_storage, DB::ContextPtr context, - DB::CompressionMethod compression_method, bool try_write_version_hint) { + auto storage_metadata_path = resolver.resolve(metadata_file_info.path); + auto storage_version_hint_path = resolver.resolve(version_hint_path); try { - if (object_storage->exists(StoredObject(metadata_file_path))) + if (object_storage->exists(StoredObject(storage_metadata_path))) return false; - Iceberg::writeMessageToFile(metadata_file_content, metadata_file_path, object_storage, context, /* write-if-none-match */ "*", "", compression_method); + Iceberg::writeMessageToFile( + metadata_file_content, + storage_metadata_path, + object_storage, + context, + /* write-if-none-match */ "*", + "", + metadata_file_info.compression_method); } catch (...) { @@ -205,13 +263,10 @@ bool writeMetadataFileAndVersionHint( if (try_write_version_hint) { - if (version_hint_content.starts_with('/')) - version_hint_content = version_hint_content.substr(1); - size_t i = 0; while (i < MAX_TRANSACTION_RETRIES) { - StoredObject object_info(version_hint_path); + StoredObject object_info(storage_version_hint_path); std::string version_hint_value; std::string etag; std::string write_if_none_match = "*"; @@ -219,17 +274,35 @@ bool writeMetadataFileAndVersionHint( { auto [object_data, object_metadata] = object_storage->readSmallObjectAndGetObjectMetadata(object_info, context->getReadSettings(), MAX_HINT_FILE_SIZE); version_hint_value = object_data; + boost::algorithm::trim(version_hint_value); etag = object_metadata.etag; write_if_none_match.clear(); } - auto [old_version, _1, _2] = getMetadataFileAndVersion(version_hint_value); - auto [new_version, _3, _4] = getMetadataFileAndVersion(version_hint_content); - if (old_version < new_version) + Int32 old_version = 0; + if (!version_hint_value.empty()) + { + if (std::all_of(version_hint_value.begin(), version_hint_value.end(), isdigit)) + { + old_version = std::stoi(version_hint_value); + } + else + { + old_version = getMetadataFileAndVersion(version_hint_value).version; + } + } + if (old_version < metadata_file_info.version) { try { - Iceberg::writeMessageToFile(version_hint_content, version_hint_path, object_storage, context, write_if_none_match, /* write-if-match */ etag); + /// Write just the version number for Spark/spec compatibility. + Iceberg::writeMessageToFile( + std::to_string(metadata_file_info.version), + storage_version_hint_path, + object_storage, + context, + write_if_none_match, + /* write-if-match */ etag); break; } catch (...) @@ -303,78 +376,6 @@ std::optional parseTransformAndArgument(const String & tra return std::nullopt; } -// This function is used to get the file path inside the directory which corresponds to iceberg table from the full blob path which is written in manifest and metadata files. 
-// For example, if the full blob path is s3://bucket/table_name/data/00000-1-1234567890.avro, the function will return table_name/data/00000-1-1234567890.avro -// Common path should end with "" or "/". -std::string getProperFilePathFromMetadataInfo(std::string_view data_path, std::string_view common_path, std::string_view table_location) -{ - auto trim_backward_slash = [](std::string_view str) -> std::string_view - { - if (str.ends_with('/')) - { - return str.substr(0, str.size() - 1); - } - return str; - }; - auto trim_forward_slash = [](std::string_view str) -> std::string_view - { - if (str.starts_with('/')) - { - return str.substr(1); - } - return str; - }; - common_path = trim_backward_slash(common_path); - table_location = trim_backward_slash(table_location); - - if (data_path.starts_with(table_location) && table_location.ends_with(common_path)) - { - return std::filesystem::path{common_path} / trim_forward_slash(data_path.substr(table_location.size())); - } - - - auto pos = data_path.find(common_path); - /// Valid situation when data and metadata files are stored in different directories. - if (pos == std::string::npos) - { - /// connection://bucket - auto prefix = table_location.substr(0, table_location.size() - common_path.size()); - return std::string{data_path.substr(prefix.size())}; - } - - size_t good_pos = std::string::npos; - while (pos != std::string::npos) - { - auto potential_position = pos + common_path.size(); - if ((std::string_view(data_path.data() + potential_position, 6) == "/data/") - || (std::string_view(data_path.data() + potential_position, 10) == "/metadata/")) - { - good_pos = pos; - break; - } - size_t new_pos = data_path.find(common_path, pos + 1); - if (new_pos == std::string::npos) - { - break; - } - pos = new_pos; - } - - - if (good_pos != std::string::npos) - { - return std::string{data_path.substr(good_pos)}; - } - else if (pos != std::string::npos) - { - return std::string{data_path.substr(pos)}; - } - else - { - throw ::DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Expected to find '{}' in data path: '{}'", common_path, data_path); - } -} - enum class MostRecentMetadataFileSelectionWay { BY_LAST_UPDATED_MS_FIELD, @@ -1141,6 +1142,7 @@ MetadataFileWithInfo getLatestOrExplicitMetadataFileAndVersion( const ContextPtr & local_context, Poco::Logger * log, const std::optional & table_uuid, + CompressionMethod known_compression_method, bool force_fetch_latest_metadata) { if (data_lake_settings[DataLakeStorageSetting::iceberg_metadata_file_path].changed) @@ -1186,19 +1188,17 @@ MetadataFileWithInfo getLatestOrExplicitMetadataFileAndVersion( StoredObject version_hint(version_hint_path); auto buf = object_storage->readObject(version_hint, ReadSettings{}); readString(metadata_file, *buf); - if (!metadata_file.ends_with(".metadata.json")) + auto resolved + = resolveMetadataFilenameFromVersionHint(metadata_file, table_path, object_storage, known_compression_method, local_context); + if (resolved.has_value()) { - if (std::all_of(metadata_file.begin(), metadata_file.end(), isdigit)) - metadata_file = "v" + metadata_file + ".metadata.json"; - else - metadata_file = metadata_file + ".metadata.json"; + LOG_TEST(log, "Version hint file points to {}, will read from this metadata file", *resolved); + ProfileEvents::increment(ProfileEvents::IcebergVersionHintUsed); + return getMetadataFileAndVersion(std::filesystem::path(table_path) / "metadata" / fs::path(*resolved).filename()); } - LOG_TEST(log, "Version hint file points to {}, will read from this metadata file", 
metadata_file); - ProfileEvents::increment(ProfileEvents::IcebergVersionHintUsed); - - return getMetadataFileAndVersion(std::filesystem::path(table_path) / "metadata" / fs::path(metadata_file).filename()); + LOG_WARNING(log, "Version hint content '{}' could not be resolved to a metadata file, falling back to listing", metadata_file); } - else + { return getLatestMetadataFileAndVersion( object_storage, table_path, data_lake_settings, metadata_cache, local_context, table_uuid, false, force_fetch_latest_metadata); diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.h index afb2d31f1be3..0c1c70a7666a 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -42,17 +43,13 @@ void writeMessageToFile( /// Maybe return false if failed to write metadata.json /// Will try to write hint multiple times, but will not report failure to write hint. bool writeMetadataFileAndVersionHint( - const std::string & metadata_file_path, + const IcebergPathResolver & resolver, + const DB::GeneratedMetadataFileWithInfo & metadata_file_info, const std::string & metadata_file_content, - const std::string & version_hint_path, - std::string version_hint_content, + const IcebergPathFromMetadata & version_hint_path, DB::ObjectStoragePtr object_storage, DB::ContextPtr context, - DB::CompressionMethod compression_method, - bool try_write_version_hint -); - -std::string getProperFilePathFromMetadataInfo(std::string_view data_path, std::string_view common_path, std::string_view table_location); + bool try_write_version_hint); struct TransformAndArgument { @@ -100,6 +97,7 @@ MetadataFileWithInfo getLatestOrExplicitMetadataFileAndVersion( const ContextPtr & local_context, Poco::Logger * log, const std::optional & table_uuid, + CompressionMethod known_compression_method, bool force_fetch_latest_metadata = true); std::pair parseTableSchemaV1Method(const Poco::JSON::Object::Ptr & metadata_object); diff --git a/src/Storages/ObjectStorage/DataLakes/Paimon/PaimonClient.cpp b/src/Storages/ObjectStorage/DataLakes/Paimon/PaimonClient.cpp index 78d7fb7f62a1..464ba68e1d64 100644 --- a/src/Storages/ObjectStorage/DataLakes/Paimon/PaimonClient.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Paimon/PaimonClient.cpp @@ -254,7 +254,7 @@ std::vector PaimonTableClient::getManifestMeta(String ma RelativePathWithMetadata relative_path(std::filesystem::path(table_location) / (PAIMON_MANIFEST_DIR) / manifest_list_path); auto manifest_list_buf = createReadBuffer(relative_path, object_storage, context, log); Iceberg::AvroForIcebergDeserializer manifest_list_deserializer( - std::move(manifest_list_buf), manifest_list_path, getFormatSettings(getContext())); + std::move(manifest_list_buf), Iceberg::IcebergPathFromMetadata::deserialize(manifest_list_path), getFormatSettings(getContext())); std::vector paimon_manifest_file_meta_vec; paimon_manifest_file_meta_vec.reserve(manifest_list_deserializer.rows()); @@ -276,7 +276,7 @@ PaimonTableClient::getDataManifest(String manifest_path, const PaimonTableSchema auto context = getContext(); RelativePathWithMetadata object_info(std::filesystem::path(table_location) / (PAIMON_MANIFEST_DIR) / manifest_path); auto manifest_buf = createReadBuffer(object_info, object_storage, context, log); - Iceberg::AvroForIcebergDeserializer manifest_deserializer(std::move(manifest_buf), manifest_path, 
getFormatSettings(getContext())); + Iceberg::AvroForIcebergDeserializer manifest_deserializer(std::move(manifest_buf), Iceberg::IcebergPathFromMetadata::deserialize(manifest_path), getFormatSettings(getContext())); PaimonManifest paimon_manifest; paimon_manifest.entries.reserve(manifest_deserializer.rows()); diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index ab31ec4db751..0bf4e1ec8355 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -41,6 +41,7 @@ #include #include #include +#include #if ENABLE_DISTRIBUTED_CACHE #include #include @@ -424,6 +425,12 @@ Chunk StorageObjectStorageSource::generate() path); } + const String * iceberg_metadata_file_path = nullptr; +#if USE_AVRO + if (const auto * iceberg_info = dynamic_cast(object_info.get())) + iceberg_metadata_file_path = &iceberg_info->info.data_object_file_path_key.serialize(); +#endif + VirtualColumnUtils::addRequestedFileLikeStorageVirtualsToChunk( chunk, read_from_format_info.requested_virtual_columns, @@ -435,6 +442,7 @@ Chunk StorageObjectStorageSource::generate() .etag = &(object_metadata->etag), .tags = &(object_metadata->tags), .data_lake_snapshot_version = file_iterator->getSnapshotVersion(), + .iceberg_metadata_file_path = iceberg_metadata_file_path, }, read_context); diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index cce131a5b959..9ff0d939b3dc 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -147,6 +147,7 @@ static NamesAndTypesList getCommonVirtualsForFileLikeStorage() {"_tags", std::make_shared(std::make_shared(), std::make_shared())}, {"_data_lake_snapshot_version", makeNullable(std::make_shared())}, {"_row_number", makeNullable(std::make_shared())}, + {"_iceberg_metadata_file_path", std::make_shared(std::make_shared())}, }; } @@ -399,6 +400,7 @@ void addRequestedFileLikeStorageVirtualsToChunk( } else if (virtual_column.name == "_row_number") { +#if USE_PARQUET auto chunk_info = chunk.getChunkInfos().get(); if (chunk_info) { @@ -413,8 +415,22 @@ void addRequestedFileLikeStorageVirtualsToChunk( chunk.addColumn(ColumnNullable::create(std::move(column), std::move(null_map))); return; } - /// Row numbers not known, _row_number = NULL. + else + { + /// Row numbers not known, _row_number = NULL. + chunk.addColumn(virtual_column.type->createColumnConstWithDefaultValue(chunk.getNumRows())->convertToFullColumnIfConst()); + } +#else + // If Parquet format is not used, we don't have row numbers info, so _row_number = NULL. 
chunk.addColumn(virtual_column.type->createColumnConstWithDefaultValue(chunk.getNumRows())->convertToFullColumnIfConst()); +#endif + } + else if (virtual_column.name == "_iceberg_metadata_file_path") + { + if (virtual_values.iceberg_metadata_file_path) + chunk.addColumn(virtual_column.type->createColumnConst(chunk.getNumRows(), *virtual_values.iceberg_metadata_file_path)->convertToFullColumnIfConst()); + else + chunk.addColumn(virtual_column.type->createColumnConstWithDefaultValue(chunk.getNumRows())->convertToFullColumnIfConst()); } else if (auto it = hive_map.find(virtual_column.getNameInStorage()); it != hive_map.end()) { diff --git a/src/Storages/VirtualColumnUtils.h b/src/Storages/VirtualColumnUtils.h index bcd3b9974b1d..8b40a4e5fd58 100644 --- a/src/Storages/VirtualColumnUtils.h +++ b/src/Storages/VirtualColumnUtils.h @@ -135,6 +135,9 @@ struct VirtualsForFileLikeStorage const String * etag { nullptr }; const std::map * tags { nullptr }; std::optional data_lake_snapshot_version { std::nullopt }; + /// Original file path as stored in Iceberg metadata (before resolution to storage path). + /// Used by Iceberg position deletes to reference data files in the metadata path format. + const String * iceberg_metadata_file_path { nullptr }; }; void addRequestedFileLikeStorageVirtualsToChunk( diff --git a/tests/integration/test_storage_iceberg_concurrent/configs/config.d/query_log.xml b/tests/integration/test_storage_iceberg_concurrent/configs/config.d/query_log.xml deleted file mode 100644 index a63e91f41fbc..000000000000 --- a/tests/integration/test_storage_iceberg_concurrent/configs/config.d/query_log.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - system - query_log
-
-
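The _iceberg_metadata_file_path virtual column introduced in the VirtualColumnUtils changes above exposes, per row, the data file path exactly as recorded in the Iceberg metadata, before it is resolved to a storage path. Below is a minimal sketch of how it could be inspected from an integration test, in the instance.query style used by the tests later in this patch; the helper name and the pairing with the existing _path virtual column are illustrative assumptions, not part of the patch.

def show_iceberg_paths(instance, table_name):
    # Pair the resolved storage path (_path) with the raw path kept in
    # Iceberg metadata (_iceberg_metadata_file_path) for every data file.
    return instance.query(
        f"SELECT DISTINCT _path, _iceberg_metadata_file_path FROM {table_name}"
    )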
diff --git a/tests/integration/test_storage_iceberg_concurrent/conftest.py b/tests/integration/test_storage_iceberg_concurrent/conftest.py index 2e61a6d4170e..4f2e78eb76ee 100644 --- a/tests/integration/test_storage_iceberg_concurrent/conftest.py +++ b/tests/integration/test_storage_iceberg_concurrent/conftest.py @@ -55,12 +55,6 @@ def get_spark(cluster : ClickHouseCluster): builder = ( pyspark.sql.SparkSession.builder \ .appName("IcebergS3Example") \ - .config("spark.jars.repositories", "https://repo1.maven.org/maven2") \ - .config("spark.jars.packages", - f'org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:{iceberg_version},' - f'org.apache.spark:spark-avro_2.12:{spark_version},' - f'org.apache.hadoop:hadoop-aws:{hadoop_aws_version},' - f'com.amazonaws:aws-java-sdk-bundle:{jdk_bundle}')\ .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") \ .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog") \ .config("spark.sql.catalog.spark_catalog.type", "hadoop") \ @@ -88,8 +82,6 @@ def started_cluster_iceberg(): cluster.add_instance( "node1", main_configs=[ - "configs/config.d/query_log.xml", - "configs/config.d/cluster.xml", "configs/config.d/named_collections.xml", ], user_configs=["configs/users.d/users.xml"], diff --git a/tests/integration/test_storage_iceberg_interoperability_azure/__init__.py b/tests/integration/test_storage_iceberg_interoperability_azure/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/integration/test_storage_iceberg_interoperability_azure/configs/config.d/named_collections.xml b/tests/integration/test_storage_iceberg_interoperability_azure/configs/config.d/named_collections.xml new file mode 100644 index 000000000000..b5bad0ffdc71 --- /dev/null +++ b/tests/integration/test_storage_iceberg_interoperability_azure/configs/config.d/named_collections.xml @@ -0,0 +1,8 @@ + + + + devstoreaccount1 + Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw== + + + diff --git a/tests/integration/test_storage_iceberg_interoperability_azure/configs/users.d/users.xml b/tests/integration/test_storage_iceberg_interoperability_azure/configs/users.d/users.xml new file mode 100644 index 000000000000..4b6ba057ecb1 --- /dev/null +++ b/tests/integration/test_storage_iceberg_interoperability_azure/configs/users.d/users.xml @@ -0,0 +1,9 @@ + + + + + default + 1 + + + diff --git a/tests/integration/test_storage_iceberg_interoperability_azure/conftest.py b/tests/integration/test_storage_iceberg_interoperability_azure/conftest.py new file mode 100644 index 000000000000..11ebea147acd --- /dev/null +++ b/tests/integration/test_storage_iceberg_interoperability_azure/conftest.py @@ -0,0 +1,132 @@ +import pytest +import logging +import pyspark +import os +import os.path as p + +from helpers.cluster import ClickHouseCluster +from helpers.iceberg_utils import get_uuid_str +from helpers.spark_tools import ResilientSparkSession, write_spark_log_config + + +AZURE_ACCOUNT_NAME = "devstoreaccount1" +AZURE_CONTAINER = "testcontainer" + + +def get_spark(log_dir=None): + """ + Configure Spark to write Iceberg tables to Azurite via WASB (HTTP). + + Key insight: hadoop-azure's emulator mode config must use the FQDN + (devstoreaccount1.blob.core.windows.net) because it does an exact match + with the account name extracted from the WASB URI. 
+ """ + hadoop_version = "3.3.4" + + builder = ( + pyspark.sql.SparkSession.builder + .appName("IcebergAzureInteroperability") + .config("spark.jars.repositories", "https://repo1.maven.org/maven2") + .config("spark.sql.extensions", + "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") + .config("spark.sql.catalog.spark_catalog", + "org.apache.iceberg.spark.SparkSessionCatalog") + .config("spark.sql.catalog.spark_catalog.type", "hadoop") + .config("spark.sql.catalog.spark_catalog.warehouse", + f"wasb://{AZURE_CONTAINER}@{AZURE_ACCOUNT_NAME}.blob.core.windows.net" + f"/iceberg_data") + # Enable emulator mode with FQDN — this makes hadoop-azure + # connect to http://127.0.0.1:10000 using dev storage credentials. + .config("spark.hadoop.fs.azure.storage.emulator.account.name", + f"{AZURE_ACCOUNT_NAME}.blob.core.windows.net") + .master("local") + ) + + if log_dir: + props_path = write_spark_log_config(log_dir) + builder = builder.config( + "spark.driver.extraJavaOptions", + f"-Dlog4j2.configurationFile=file:{props_path}", + ) + + return builder.getOrCreate() + + +@pytest.fixture(scope="package") +def started_cluster_iceberg(): + try: + # Force Azurite to port 10000 — the emulator mode hardcodes this port. + cluster = ClickHouseCluster( + __file__, with_spark=True, azurite_default_port=10000 + ) + cluster.add_instance( + "node1", + main_configs=[ + "configs/config.d/named_collections.xml", + ], + user_configs=["configs/users.d/users.xml"], + with_azurite=True, + stay_alive=True, + mem_limit="15g", + ) + + logging.info("Starting cluster...") + cluster.start() + + # Create test container + container_client = cluster.blob_service_client.get_container_client( + AZURE_CONTAINER + ) + if not container_client.exists(): + container_client.create_container() + cluster.azure_container_name = AZURE_CONTAINER + + cluster.spark_session = ResilientSparkSession( + lambda: get_spark(cluster.instances_dir) + ) + + yield cluster + + finally: + cluster.shutdown() + + +def test_spark_write_and_read(started_cluster_iceberg): + """Verify Spark can write to and read from Azurite via WASB emulator mode.""" + spark = started_cluster_iceberg.spark_session + + TABLE_NAME = "test_spark_roundtrip_" + get_uuid_str() + + # Write + spark.sql( + f""" + CREATE TABLE {TABLE_NAME} ( + number INT + ) + USING iceberg + OPTIONS('format-version'='2'); + """ + ) + + spark.sql( + f""" + INSERT INTO {TABLE_NAME} + SELECT id as number FROM range(100) + """ + ) + + # Read back + df = spark.sql(f"SELECT count(*) as cnt FROM {TABLE_NAME}").collect() + count = df[0].cnt + logging.info(f"Spark read back {count} rows") + assert count == 100, f"Expected 100 rows, got {count}" + + # List blobs to see what paths Spark actually wrote + blob_client = started_cluster_iceberg.blob_service_client + container_client = blob_client.get_container_client(AZURE_CONTAINER) + blobs = list(container_client.list_blobs()) + print(f"Blobs in container ({len(blobs)}):") + for blob in blobs[:20]: + print(f" {blob.name}") + + assert len(blobs) > 0, "No blobs written to Azurite!" 
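For reference, a minimal sketch (not part of the patch) of probing the same Azurite endpoint directly from Python. It assumes the devstoreaccount1 development credentials from named_collections.xml above, the default blob port 10000 that the emulator mode relies on, and the testcontainer container created by the fixture.

from azure.storage.blob import BlobServiceClient

AZURITE_CONNECTION_STRING = (
    "DefaultEndpointsProtocol=http;"
    "AccountName=devstoreaccount1;"
    "AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;"
    "BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;"
)

def list_iceberg_blobs(container="testcontainer", prefix="iceberg_data/"):
    # List blobs under the Iceberg warehouse prefix to confirm that writes landed in Azurite.
    client = BlobServiceClient.from_connection_string(AZURITE_CONNECTION_STRING)
    return [b.name for b in client.get_container_client(container).list_blobs(name_starts_with=prefix)]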
diff --git a/tests/integration/test_storage_iceberg_interoperability_azure/test_interoperability.py b/tests/integration/test_storage_iceberg_interoperability_azure/test_interoperability.py new file mode 100644 index 000000000000..06acc86383c6 --- /dev/null +++ b/tests/integration/test_storage_iceberg_interoperability_azure/test_interoperability.py @@ -0,0 +1,270 @@ +import logging + +from helpers.iceberg_utils import get_uuid_str + +AZURE_CONTAINER = "testcontainer" + +def test_spark_write_ch_read_append(started_cluster_iceberg): + """Spark writes, CH reads, Spark appends, CH reads updated data.""" + instance = started_cluster_iceberg.instances["node1"] + spark = started_cluster_iceberg.spark_session + + TABLE_NAME = "test_spark_append_" + get_uuid_str() + azurite_url = started_cluster_iceberg.env_variables["AZURITE_STORAGE_ACCOUNT_URL"] + blob_path = f"iceberg_data/default/{TABLE_NAME}/" + + # Spark creates the table and inserts initial data + spark.sql( + f""" + CREATE TABLE {TABLE_NAME} ( + number INT + ) + USING iceberg + OPTIONS('format-version'='2'); + """ + ) + spark.sql(f"INSERT INTO {TABLE_NAME} SELECT id as number FROM range(100)") + + # Create ClickHouse table pointing to the same Azurite location + instance.query( + f""" + CREATE TABLE {TABLE_NAME} + ENGINE=IcebergAzure(azure, + container = '{AZURE_CONTAINER}', + storage_account_url = '{azurite_url}', + blob_path = '{blob_path}') + """ + ) + + # CH reads Spark's data + rows = int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) + assert rows == 100, f"Expected 100 rows, got {rows}" + + result = instance.query(f"SELECT sum(number) FROM {TABLE_NAME}") + assert int(result) == 4950, f"Expected sum 4950, got {result.strip()}" + + # Spark appends more data + spark.sql(f"INSERT INTO {TABLE_NAME} SELECT id + 100 as number FROM range(50)") + + # CH reads the updated data + rows = int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) + assert rows == 150, f"Expected 150 rows after append, got {rows}" + + +def test_ch_write_spark_read(started_cluster_iceberg): + """ClickHouse writes to an Iceberg table on Azurite that Spark created. + Tests that CH can correctly resolve Spark's wasb:// metadata paths and + append new data while preserving the existing Spark-written data. + Verifies Spark can read back via both SQL catalog and wasb:// path.""" + instance = started_cluster_iceberg.instances["node1"] + spark = started_cluster_iceberg.spark_session + + TABLE_NAME = "test_ch_write_" + get_uuid_str() + azurite_url = started_cluster_iceberg.env_variables["AZURITE_STORAGE_ACCOUNT_URL"] + blob_path = f"iceberg_data/default/{TABLE_NAME}/" + + # Spark creates the table and inserts initial data + spark.sql( + f""" + CREATE TABLE {TABLE_NAME} ( + number INT + ) + USING iceberg + OPTIONS('format-version'='2'); + """ + ) + spark.sql(f"INSERT INTO {TABLE_NAME} SELECT id as number FROM range(10)") + + # Create ClickHouse table pointing to the same Azurite location. + # iceberg_use_version_hint writes version-hint.text so Spark's HadoopCatalog + # can discover the latest metadata version after session restart. + instance.query( + f""" + CREATE TABLE {TABLE_NAME} + ENGINE=IcebergAzure(azure, + container = '{AZURE_CONTAINER}', + storage_account_url = '{azurite_url}', + blob_path = '{blob_path}') + SETTINGS iceberg_use_version_hint = 1 + """ + ) + + # CH reads Spark's data + rows = int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) + assert rows == 10, f"Expected 10 rows from Spark, got {rows}" + + # ClickHouse writes more data. 
+ # write_full_path_in_iceberg_metadata is needed so Spark can resolve the + # data file paths (Spark expects wasb:// URIs, not relative paths). + insert_settings = { + "allow_insert_into_iceberg": 1, + "write_full_path_in_iceberg_metadata": 1, + } + instance.query(f"INSERT INTO {TABLE_NAME} VALUES (42)", settings=insert_settings) + instance.query(f"INSERT INTO {TABLE_NAME} VALUES (123)", settings=insert_settings) + + # ClickHouse can read its own writes (10 from Spark + 2 from CH) + rows = int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) + assert rows == 12, f"Expected 12 rows, got {rows}" + + result = instance.query(f"SELECT sum(number) FROM {TABLE_NAME}") + expected_sum = sum(range(10)) + 42 + 123 # 45 + 42 + 123 = 210 + assert int(result) == expected_sum, f"Expected sum {expected_sum}, got {result.strip()}" + + # Spark should also see the data written by ClickHouse. + started_cluster_iceberg.spark_session._restart() + spark = started_cluster_iceberg.spark_session + + # Read via SQL catalog + df = spark.sql(f"SELECT * FROM {TABLE_NAME}").collect() + assert len(df) == 12, f"Spark SQL expected 12 rows, got {len(df)}" + spark_values = sorted([row.number for row in df]) + assert 42 in spark_values, f"Spark SQL missing CH-written value 42: {spark_values}" + assert 123 in spark_values, f"Spark SQL missing CH-written value 123: {spark_values}" + + +def test_spark_delete_ch_read(started_cluster_iceberg): + """Spark creates a table, inserts data, deletes some rows, and CH sees the deletions.""" + instance = started_cluster_iceberg.instances["node1"] + spark = started_cluster_iceberg.spark_session + + TABLE_NAME = "test_spark_delete_" + get_uuid_str() + azurite_url = started_cluster_iceberg.env_variables["AZURITE_STORAGE_ACCOUNT_URL"] + blob_path = f"iceberg_data/default/{TABLE_NAME}/" + + # Spark creates the table with merge-on-read delete mode (position deletes) + spark.sql( + f""" + CREATE TABLE {TABLE_NAME} ( + number INT + ) + USING iceberg + TBLPROPERTIES ( + 'format-version' = '2', + 'write.update.mode' = 'merge-on-read', + 'write.delete.mode' = 'merge-on-read', + 'write.merge.mode' = 'merge-on-read' + ); + """ + ) + spark.sql(f"INSERT INTO {TABLE_NAME} SELECT id as number FROM range(100)") + + # Create ClickHouse table + instance.query( + f""" + CREATE TABLE {TABLE_NAME} + ENGINE=IcebergAzure(azure, + container = '{AZURE_CONTAINER}', + storage_account_url = '{azurite_url}', + blob_path = '{blob_path}') + """ + ) + + # CH reads all 100 rows + rows = int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) + assert rows == 100, f"Expected 100 rows, got {rows}" + + # Spark deletes rows where number < 20 + spark.sql(f"DELETE FROM {TABLE_NAME} WHERE number < 20") + + # CH should see only 80 rows + rows = int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) + assert rows == 80, f"Expected 80 rows after first delete, got {rows}" + + result = int(instance.query(f"SELECT min(number) FROM {TABLE_NAME}")) + assert result == 20, f"Expected min 20 after delete, got {result}" + + # Spark deletes more rows + spark.sql(f"DELETE FROM {TABLE_NAME} WHERE number >= 90") + + # CH should see only 70 rows (20..89) + rows = int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) + assert rows == 70, f"Expected 70 rows after second delete, got {rows}" + + result = int(instance.query(f"SELECT max(number) FROM {TABLE_NAME}")) + assert result == 89, f"Expected max 89 after delete, got {result}" + + result = int(instance.query(f"SELECT sum(number) FROM {TABLE_NAME}")) + expected_sum = sum(range(20, 90)) 
+ assert result == expected_sum, f"Expected sum {expected_sum}, got {result}" + + +def test_ch_delete_spark_read(started_cluster_iceberg): + """Spark creates a table, CH deletes some rows, and Spark sees the deletions.""" + instance = started_cluster_iceberg.instances["node1"] + spark = started_cluster_iceberg.spark_session + + TABLE_NAME = "test_ch_delete_" + get_uuid_str() + azurite_url = started_cluster_iceberg.env_variables["AZURITE_STORAGE_ACCOUNT_URL"] + blob_path = f"iceberg_data/default/{TABLE_NAME}/" + + # Spark creates the table and inserts data + spark.sql( + f""" + CREATE TABLE {TABLE_NAME} ( + number INT + ) + USING iceberg + OPTIONS('format-version'='2'); + """ + ) + spark.sql(f"INSERT INTO {TABLE_NAME} SELECT id as number FROM range(50)") + + # Create ClickHouse table with version hint so Spark can discover CH's metadata updates + instance.query( + f""" + CREATE TABLE {TABLE_NAME} + ENGINE=IcebergAzure(azure, + container = '{AZURE_CONTAINER}', + storage_account_url = '{azurite_url}', + blob_path = '{blob_path}') + SETTINGS iceberg_use_version_hint = 1 + """ + ) + + # CH reads all 50 rows + rows = int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) + assert rows == 50, f"Expected 50 rows, got {rows}" + + # CH deletes some rows + delete_settings = { + "allow_insert_into_iceberg": 1, + "write_full_path_in_iceberg_metadata": 1, + } + instance.query( + f"ALTER TABLE {TABLE_NAME} DELETE WHERE number < 10", + settings=delete_settings, + ) + + # CH sees the deletion + rows = int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) + assert rows == 40, f"Expected 40 rows after CH delete, got {rows}" + + # Spark should also see the deletion + started_cluster_iceberg.spark_session._restart() + spark = started_cluster_iceberg.spark_session + + df = spark.sql(f"SELECT * FROM {TABLE_NAME}").collect() + assert len(df) == 40, f"Spark expected 40 rows after CH delete, got {len(df)}" + spark_values = sorted([row.number for row in df]) + assert min(spark_values) == 10, f"Spark expected min 10, got {min(spark_values)}" + + # CH deletes more rows + instance.query( + f"ALTER TABLE {TABLE_NAME} DELETE WHERE number >= 40", + settings=delete_settings, + ) + + # CH sees the deletion + rows = int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) + assert rows == 30, f"Expected 30 rows after second CH delete, got {rows}" + + # Spark should also see the second deletion + started_cluster_iceberg.spark_session._restart() + spark = started_cluster_iceberg.spark_session + + df = spark.sql(f"SELECT * FROM {TABLE_NAME}").collect() + assert len(df) == 30, f"Spark expected 30 rows after second CH delete, got {len(df)}" + spark_values = sorted([row.number for row in df]) + assert spark_values == list(range(10, 40)), \ + f"Spark expected values 10..39, got {spark_values}" \ No newline at end of file diff --git a/tests/integration/test_storage_iceberg_interoperability_local/__init__.py b/tests/integration/test_storage_iceberg_interoperability_local/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/integration/test_storage_iceberg_concurrent/configs/config.d/cluster.xml b/tests/integration/test_storage_iceberg_interoperability_local/configs/config.d/cluster.xml similarity index 68% rename from tests/integration/test_storage_iceberg_concurrent/configs/config.d/cluster.xml rename to tests/integration/test_storage_iceberg_interoperability_local/configs/config.d/cluster.xml index c66b0e1a1057..7927d4613a28 100644 --- 
a/tests/integration/test_storage_iceberg_concurrent/configs/config.d/cluster.xml +++ b/tests/integration/test_storage_iceberg_interoperability_local/configs/config.d/cluster.xml @@ -6,14 +6,6 @@ node1 9000 - - node2 - 9000 - - - node3 - 9000 - diff --git a/tests/integration/test_storage_iceberg_interoperability_local/configs/config.d/named_collections.xml b/tests/integration/test_storage_iceberg_interoperability_local/configs/config.d/named_collections.xml new file mode 100644 index 000000000000..ca41a070a178 --- /dev/null +++ b/tests/integration/test_storage_iceberg_interoperability_local/configs/config.d/named_collections.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/tests/integration/test_storage_iceberg_interoperability_local/configs/users.d/users.xml b/tests/integration/test_storage_iceberg_interoperability_local/configs/users.d/users.xml new file mode 100644 index 000000000000..4b6ba057ecb1 --- /dev/null +++ b/tests/integration/test_storage_iceberg_interoperability_local/configs/users.d/users.xml @@ -0,0 +1,9 @@ + + + + + default + 1 + + + diff --git a/tests/integration/test_storage_iceberg_interoperability_local/conftest.py b/tests/integration/test_storage_iceberg_interoperability_local/conftest.py new file mode 100644 index 000000000000..b38a67513246 --- /dev/null +++ b/tests/integration/test_storage_iceberg_interoperability_local/conftest.py @@ -0,0 +1,160 @@ +import pytest +import logging +import pyspark +import os + +from helpers.cluster import ClickHouseCluster +from helpers.iceberg_utils import get_uuid_str +from helpers.spark_tools import ResilientSparkSession, write_spark_log_config + + +# Each node gets its own iceberg data directory so they don't see each other's writes. +# external_dirs mounts / (host) → (container). +# We then create symlinks on the host so that the container path also resolves +# on the host — this way Iceberg metadata with absolute paths works for both +# Spark (host) and ClickHouse (container). +ICEBERG_DIR_NODE1 = "/var/lib/clickhouse/user_files/iceberg_node1" +ICEBERG_DIR_NODE2 = "/var/lib/clickhouse/user_files/iceberg_node2" + + +def create_host_symlink(container_path, host_path): + """ + Create a symlink on the host from container_path → host_path. + After this, both Spark (on host) and ClickHouse (in container) + can use the same absolute path to access the data. 
+ """ + os.makedirs(os.path.dirname(container_path), exist_ok=True) + if os.path.exists(container_path): + if os.path.islink(container_path): + os.remove(container_path) + else: + return # Real directory exists (e.g., actual ClickHouse installation), don't touch it + os.symlink(host_path, container_path) + logging.info(f"Created symlink: {container_path} → {host_path}") + + +def cleanup_host_symlink(container_path): + """Remove symlink created by create_host_symlink.""" + if os.path.islink(container_path): + os.remove(container_path) + logging.info(f"Removed symlink: {container_path}") + + +def get_spark(warehouse_node1, warehouse_node2, log_dir=None): + builder = ( + pyspark.sql.SparkSession.builder + .appName("IcebergLocalTwoNodes") + .config("spark.sql.extensions", + "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") + # Catalog for node1 + .config("spark.sql.catalog.node1_catalog", + "org.apache.iceberg.spark.SparkCatalog") + .config("spark.sql.catalog.node1_catalog.type", "hadoop") + .config("spark.sql.catalog.node1_catalog.warehouse", warehouse_node1) + # Catalog for node2 + .config("spark.sql.catalog.node2_catalog", + "org.apache.iceberg.spark.SparkCatalog") + .config("spark.sql.catalog.node2_catalog.type", "hadoop") + .config("spark.sql.catalog.node2_catalog.warehouse", warehouse_node2) + .master("local") + ) + + if log_dir: + props_path = write_spark_log_config(log_dir) + builder = builder.config( + "spark.driver.extraJavaOptions", + f"-Dlog4j2.configurationFile=file:{props_path}", + ) + + return builder.getOrCreate() + + +@pytest.fixture(scope="package") +def started_cluster_iceberg(): + symlinks = [] + try: + cluster = ClickHouseCluster(__file__, with_spark=True) + cluster.add_instance( + "node1", + main_configs=[ + "configs/config.d/cluster.xml", + "configs/config.d/named_collections.xml", + ], + user_configs=["configs/users.d/users.xml"], + stay_alive=True, + mem_limit="15g", + external_dirs=[ICEBERG_DIR_NODE1], + ) + cluster.add_instance( + "node2", + main_configs=[ + "configs/config.d/cluster.xml", + "configs/config.d/named_collections.xml", + ], + user_configs=["configs/users.d/users.xml"], + stay_alive=True, + mem_limit="15g", + external_dirs=[ICEBERG_DIR_NODE2], + ) + + logging.info("Starting cluster...") + cluster.start() + + # external_dirs creates: + # host: /var/lib/clickhouse/user_files/iceberg_node1 + # container: /var/lib/clickhouse/user_files/iceberg_node1 + # + # Create symlinks on the host so the container path resolves to the + # host path. Now both Spark and ClickHouse use the same absolute paths. + for iceberg_dir in [ICEBERG_DIR_NODE1, ICEBERG_DIR_NODE2]: + host_path = os.path.join( + cluster.instances_dir, iceberg_dir.lstrip("/") + ) + create_host_symlink(iceberg_dir, host_path) + symlinks.append(iceberg_dir) + + # Both Spark and ClickHouse use the container paths. + # On the host, these resolve via symlinks to the actual data. 
+ cluster.spark_session = ResilientSparkSession( + lambda: get_spark(ICEBERG_DIR_NODE1, ICEBERG_DIR_NODE2, cluster.instances_dir) + ) + + yield cluster + + finally: + for link in symlinks: + cleanup_host_symlink(link) + cluster.shutdown() + + +def test_spark_write_and_read(started_cluster_iceberg): + """Verify Spark can write to and read from local filesystem via Iceberg.""" + spark = started_cluster_iceberg.spark_session + + TABLE_NAME = "test_spark_roundtrip_" + get_uuid_str() + + # Write + spark.sql( + f""" + CREATE TABLE node1_catalog.default.{TABLE_NAME} ( + number INT + ) + USING iceberg + OPTIONS('format-version'='2'); + """ + ) + + spark.sql( + f""" + INSERT INTO node1_catalog.default.{TABLE_NAME} + SELECT id as number FROM range(100) + """ + ) + + # Read back + df = spark.sql( + f"SELECT count(*) as cnt FROM node1_catalog.default.{TABLE_NAME}" + ).collect() + count = df[0].cnt + logging.info(f"Spark read back {count} rows") + assert count == 100, f"Expected 100 rows, got {count}" diff --git a/tests/integration/test_storage_iceberg_interoperability_local/test_interoperability.py b/tests/integration/test_storage_iceberg_interoperability_local/test_interoperability.py new file mode 100644 index 000000000000..0d11b0f1a1f7 --- /dev/null +++ b/tests/integration/test_storage_iceberg_interoperability_local/test_interoperability.py @@ -0,0 +1,326 @@ +from helpers.iceberg_utils import get_uuid_str + +ICEBERG_DIR_NODE1 = "/var/lib/clickhouse/user_files/iceberg_node1" + + +def test_nodes_dont_see_each_other(started_cluster_iceberg): + """ + Spark writes different data to each node's local directory. + Each node only sees its own data. + """ + node1 = started_cluster_iceberg.instances["node1"] + node2 = started_cluster_iceberg.instances["node2"] + spark = started_cluster_iceberg.spark_session + + TABLE_NAME = "test_isolation_" + get_uuid_str() + + # Create Iceberg tables via Spark — one per node catalog + spark.sql( + f""" + CREATE TABLE node1_catalog.default.{TABLE_NAME} ( + number INT + ) + USING iceberg + OPTIONS('format-version'='2'); + """ + ) + + spark.sql( + f""" + CREATE TABLE node2_catalog.default.{TABLE_NAME} ( + number INT + ) + USING iceberg + OPTIONS('format-version'='2'); + """ + ) + + # Write 100 rows to node1, 200 rows to node2 + spark.sql( + f""" + INSERT INTO node1_catalog.default.{TABLE_NAME} + SELECT id as number FROM range(100) + """ + ) + + spark.sql( + f""" + INSERT INTO node2_catalog.default.{TABLE_NAME} + SELECT id as number FROM range(200) + """ + ) + + # Create ClickHouse tables — each node reads from its own iceberg directory + node1.query( + f""" + CREATE TABLE {TABLE_NAME} + ENGINE=IcebergLocal(local, + path = '/var/lib/clickhouse/user_files/iceberg_node1/default/{TABLE_NAME}') + """ + ) + node2.query( + f""" + CREATE TABLE {TABLE_NAME} + ENGINE=IcebergLocal(local, + path = '/var/lib/clickhouse/user_files/iceberg_node2/default/{TABLE_NAME}') + """ + ) + + # Each node should only see its own data + rows_node1 = int(node1.query(f"SELECT count() FROM {TABLE_NAME}")) + rows_node2 = int(node2.query(f"SELECT count() FROM {TABLE_NAME}")) + + assert rows_node1 == 100, f"node1: expected 100 rows, got {rows_node1}" + assert rows_node2 == 200, f"node2: expected 200 rows, got {rows_node2}" + + # Append more data to node1 only + spark.sql( + f""" + INSERT INTO node1_catalog.default.{TABLE_NAME} + SELECT id + 100 as number FROM range(50) + """ + ) + + rows_node1 = int(node1.query(f"SELECT count() FROM {TABLE_NAME}")) + rows_node2 = int(node2.query(f"SELECT count() FROM 
{TABLE_NAME}")) + + assert rows_node1 == 150, f"node1: expected 150 rows after append, got {rows_node1}" + assert rows_node2 == 200, f"node2: should still have 200 rows, got {rows_node2}" + + +def test_ch_write_spark_read(started_cluster_iceberg): + """ + Spark creates a table, ClickHouse writes to it, Spark reads back. + Validates that the external_dirs mount works bidirectionally. + """ + node1 = started_cluster_iceberg.instances["node1"] + spark = started_cluster_iceberg.spark_session + + TABLE_NAME = "test_ch_write_spark_read_" + get_uuid_str() + + # Spark creates the table structure + spark.sql( + f""" + CREATE TABLE node1_catalog.default.{TABLE_NAME} ( + number INT + ) + USING iceberg + OPTIONS('format-version'='2'); + """ + ) + + # Create ClickHouse table pointing to the same location + node1.query( + f""" + CREATE TABLE {TABLE_NAME} + ENGINE=IcebergLocal(local, + path = '/var/lib/clickhouse/user_files/iceberg_node1/default/{TABLE_NAME}') + """ + ) + + # ClickHouse writes data + node1.query( + f"INSERT INTO {TABLE_NAME} VALUES (42)", + settings={"allow_insert_into_iceberg": 1}, + ) + node1.query( + f"INSERT INTO {TABLE_NAME} VALUES (123)", + settings={"allow_insert_into_iceberg": 1}, + ) + + # ClickHouse can read its own writes + assert int(node1.query(f"SELECT count() FROM {TABLE_NAME}")) == 2 + + # Spark should also see the data written by ClickHouse. + # Spark's catalog caches metadata, so we need to refresh it first. + spark.sql(f"REFRESH TABLE node1_catalog.default.{TABLE_NAME}") + + df = spark.sql( + f"SELECT * FROM node1_catalog.default.{TABLE_NAME}" + ).collect() + assert len(df) == 2, f"Spark expected 2 rows, got {len(df)}" + + spark_values = sorted([row.number for row in df]) + assert spark_values == [42, 123], f"Spark got unexpected values: {spark_values}" + + +def test_spark_write_ch_read_append(started_cluster_iceberg): + """Spark writes, CH reads, Spark appends, CH reads updated data.""" + node1 = started_cluster_iceberg.instances["node1"] + spark = started_cluster_iceberg.spark_session + + TABLE_NAME = "test_spark_append_" + get_uuid_str() + + # Spark creates the table and inserts initial data + spark.sql( + f""" + CREATE TABLE node1_catalog.default.{TABLE_NAME} ( + number INT + ) + USING iceberg + OPTIONS('format-version'='2'); + """ + ) + spark.sql( + f"INSERT INTO node1_catalog.default.{TABLE_NAME} SELECT id as number FROM range(100)" + ) + + # Create ClickHouse table pointing to the same location + node1.query( + f""" + CREATE TABLE {TABLE_NAME} + ENGINE=IcebergLocal(local, + path = '{ICEBERG_DIR_NODE1}/default/{TABLE_NAME}') + """ + ) + + # CH reads Spark's data + rows = int(node1.query(f"SELECT count() FROM {TABLE_NAME}")) + assert rows == 100, f"Expected 100 rows, got {rows}" + + result = node1.query(f"SELECT sum(number) FROM {TABLE_NAME}") + assert int(result) == 4950, f"Expected sum 4950, got {result.strip()}" + + # Spark appends more data + spark.sql( + f"INSERT INTO node1_catalog.default.{TABLE_NAME} SELECT id + 100 as number FROM range(50)" + ) + + # CH reads the updated data + rows = int(node1.query(f"SELECT count() FROM {TABLE_NAME}")) + assert rows == 150, f"Expected 150 rows after append, got {rows}" + + +def test_spark_delete_ch_read(started_cluster_iceberg): + """Spark creates a table, inserts data, deletes some rows, and CH sees the deletions.""" + node1 = started_cluster_iceberg.instances["node1"] + spark = started_cluster_iceberg.spark_session + + TABLE_NAME = "test_spark_delete_" + get_uuid_str() + + # Spark creates the table with merge-on-read 
delete mode (position deletes) + spark.sql( + f""" + CREATE TABLE node1_catalog.default.{TABLE_NAME} ( + number INT + ) + USING iceberg + TBLPROPERTIES ( + 'format-version' = '2', + 'write.update.mode' = 'merge-on-read', + 'write.delete.mode' = 'merge-on-read', + 'write.merge.mode' = 'merge-on-read' + ); + """ + ) + spark.sql( + f"INSERT INTO node1_catalog.default.{TABLE_NAME} SELECT id as number FROM range(100)" + ) + + # Create ClickHouse table + node1.query( + f""" + CREATE TABLE {TABLE_NAME} + ENGINE=IcebergLocal(local, + path = '{ICEBERG_DIR_NODE1}/default/{TABLE_NAME}') + """ + ) + + # CH reads all 100 rows + rows = int(node1.query(f"SELECT count() FROM {TABLE_NAME}")) + assert rows == 100, f"Expected 100 rows, got {rows}" + + # Spark deletes rows where number < 20 + spark.sql(f"DELETE FROM node1_catalog.default.{TABLE_NAME} WHERE number < 20") + + # CH should see only 80 rows + rows = int(node1.query(f"SELECT count() FROM {TABLE_NAME}")) + assert rows == 80, f"Expected 80 rows after first delete, got {rows}" + + result = int(node1.query(f"SELECT min(number) FROM {TABLE_NAME}")) + assert result == 20, f"Expected min 20 after delete, got {result}" + + # Spark deletes more rows + spark.sql(f"DELETE FROM node1_catalog.default.{TABLE_NAME} WHERE number >= 90") + + # CH should see only 70 rows (20..89) + rows = int(node1.query(f"SELECT count() FROM {TABLE_NAME}")) + assert rows == 70, f"Expected 70 rows after second delete, got {rows}" + + result = int(node1.query(f"SELECT max(number) FROM {TABLE_NAME}")) + assert result == 89, f"Expected max 89 after delete, got {result}" + + result = int(node1.query(f"SELECT sum(number) FROM {TABLE_NAME}")) + expected_sum = sum(range(20, 90)) + assert result == expected_sum, f"Expected sum {expected_sum}, got {result}" + + +def test_ch_delete_spark_read(started_cluster_iceberg): + """Spark creates a table, CH deletes some rows, and Spark sees the deletions.""" + node1 = started_cluster_iceberg.instances["node1"] + spark = started_cluster_iceberg.spark_session + + TABLE_NAME = "test_ch_delete_" + get_uuid_str() + + # Spark creates the table and inserts data + spark.sql( + f""" + CREATE TABLE node1_catalog.default.{TABLE_NAME} ( + number INT + ) + USING iceberg + OPTIONS('format-version'='2'); + """ + ) + spark.sql( + f"INSERT INTO node1_catalog.default.{TABLE_NAME} SELECT id as number FROM range(50)" + ) + + # Create ClickHouse table + node1.query( + f""" + CREATE TABLE {TABLE_NAME} + ENGINE=IcebergLocal(local, + path = '{ICEBERG_DIR_NODE1}/default/{TABLE_NAME}') + """ + ) + + # CH reads all 50 rows + rows = int(node1.query(f"SELECT count() FROM {TABLE_NAME}")) + assert rows == 50, f"Expected 50 rows, got {rows}" + + # CH deletes some rows + delete_settings = {"allow_insert_into_iceberg": 1} + node1.query( + f"ALTER TABLE {TABLE_NAME} DELETE WHERE number < 10", + settings=delete_settings, + ) + + # CH sees the deletion + rows = int(node1.query(f"SELECT count() FROM {TABLE_NAME}")) + assert rows == 40, f"Expected 40 rows after CH delete, got {rows}" + + # Spark should also see the deletion + spark.sql(f"REFRESH TABLE node1_catalog.default.{TABLE_NAME}") + df = spark.sql(f"SELECT * FROM node1_catalog.default.{TABLE_NAME}").collect() + assert len(df) == 40, f"Spark expected 40 rows after CH delete, got {len(df)}" + spark_values = sorted([row.number for row in df]) + assert min(spark_values) == 10, f"Spark expected min 10, got {min(spark_values)}" + + # CH deletes more rows + node1.query( + f"ALTER TABLE {TABLE_NAME} DELETE WHERE number >= 40", + 
settings=delete_settings, + ) + + # CH sees the deletion + rows = int(node1.query(f"SELECT count() FROM {TABLE_NAME}")) + assert rows == 30, f"Expected 30 rows after second CH delete, got {rows}" + + # Spark should also see the second deletion + spark.sql(f"REFRESH TABLE node1_catalog.default.{TABLE_NAME}") + df = spark.sql(f"SELECT * FROM node1_catalog.default.{TABLE_NAME}").collect() + assert len(df) == 30, f"Spark expected 30 rows after second CH delete, got {len(df)}" + spark_values = sorted([row.number for row in df]) + assert spark_values == list(range(10, 40)), \ + f"Spark expected values 10..39, got {spark_values}" diff --git a/tests/integration/test_storage_iceberg_with_spark/test_writes_create_version_hint.py b/tests/integration/test_storage_iceberg_with_spark/test_writes_create_version_hint.py index bc785ba08280..cd8cce7eda29 100644 --- a/tests/integration/test_storage_iceberg_with_spark/test_writes_create_version_hint.py +++ b/tests/integration/test_storage_iceberg_with_spark/test_writes_create_version_hint.py @@ -24,9 +24,8 @@ def test_writes_create_version_hint(started_cluster_iceberg_with_spark, format_v f"/var/lib/clickhouse/user_files/iceberg_data/default/{TABLE_NAME}/", ) - target_suffix = b'v1.metadata.json' with open(f"/var/lib/clickhouse/user_files/iceberg_data/default/{TABLE_NAME}/metadata/version-hint.text", "rb") as f: - assert f.read()[-len(target_suffix):] == target_suffix + assert f.read().strip() == b'1' instance.query(f"INSERT INTO {TABLE_NAME} VALUES ('123', 1);", settings={"allow_insert_into_iceberg": 1}) assert instance.query(f"SELECT * FROM {TABLE_NAME} ORDER BY ALL", ) == '123\t1\n' @@ -38,9 +37,8 @@ def test_writes_create_version_hint(started_cluster_iceberg_with_spark, format_v f"/var/lib/clickhouse/user_files/iceberg_data/default/{TABLE_NAME}/", ) - target_suffix = b'v2.metadata.json' with open(f"/var/lib/clickhouse/user_files/iceberg_data/default/{TABLE_NAME}/metadata/version-hint.text", "rb") as f: - assert f.read()[-len(target_suffix):] == target_suffix + assert f.read().strip() == b'2' df = spark.read.format("iceberg").load(f"/var/lib/clickhouse/user_files/iceberg_data/default/{TABLE_NAME}").collect() assert len(df) == 1 From 97624046745cf189dbb377f131fab304e04a5ec4 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 12 May 2026 12:37:06 -0300 Subject: [PATCH 2/4] attempt to fix some conflicts --- .../DataLakes/IDataLakeMetadata.h | 1 - .../DataLakes/Iceberg/IcebergMetadata.cpp | 101 ++++++++---------- .../DataLakes/Iceberg/IcebergMetadata.h | 5 +- .../DataLakes/Iceberg/IcebergWrites.cpp | 28 ++--- .../DataLakes/Iceberg/MultipleFileWriter.cpp | 2 +- .../ObjectStorage/DataLakes/Iceberg/Utils.cpp | 2 +- .../ObjectStorage/StorageObjectStorage.cpp | 1 - 7 files changed, 58 insertions(+), 82 deletions(-) diff --git a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h index d7423dd115d6..106a812cf7da 100644 --- a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h @@ -226,7 +226,6 @@ class IDataLakeMetadata : boost::noncopyable const std::vector & /* partition_values */, SharedHeader /* sample_block */, const std::vector & /* data_file_paths */, - StorageObjectStorageConfigurationPtr /* configuration */, ContextPtr /* context */) { throwNotImplemented("commitExportPartitionTransaction"); diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp 
b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp index 7eb6d839a1fa..30a2f7fbd6e1 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp @@ -1448,15 +1448,13 @@ bool IcebergMetadata::commitImportPartitionTransactionImpl( const std::vector & partition_columns, const std::vector & partition_types, SharedHeader sample_block, - const std::vector & data_file_paths, + const std::vector & data_file_paths_in_metadata, const std::vector & per_file_stats, Int64 total_data_files, Int64 total_rows, Int64 total_chunks_size, std::shared_ptr catalog, const StorageID & table_id, - const String & blob_storage_type_name, - const String & blob_storage_namespace_name, ContextPtr context) { /// this check also exists here because the metadata might have been updated upon retry attempts. @@ -1470,14 +1468,24 @@ bool IcebergMetadata::commitImportPartitionTransactionImpl( CompressionMethod metadata_compression_method = persistent_components.metadata_compression_method; - auto [metadata_name, storage_metadata_name] = filename_generator.generateMetadataName(); + const auto & resolver = persistent_components.path_resolver; + auto metadata_info = filename_generator.generateMetadataPathWithInfo(); Int64 parent_snapshot = -1; if (metadata->has(Iceberg::f_current_snapshot_id)) parent_snapshot = metadata->getValue(Iceberg::f_current_snapshot_id); - auto [new_snapshot, manifest_list_name, storage_manifest_list_name] = MetadataGenerator(metadata).generateNextMetadata( - filename_generator, metadata_name, parent_snapshot, total_data_files, total_rows, total_chunks_size, total_data_files, /* added_delete_files */0, /* num_deleted_rows */0); + auto [new_snapshot, manifest_list_path] = MetadataGenerator(metadata).generateNextMetadata( + filename_generator, + metadata_info.path, + parent_snapshot, + total_data_files, + total_rows, + total_chunks_size, + total_data_files, + /* added_delete_files */ 0, + /* num_deleted_rows */ 0); + auto storage_manifest_list_name = resolver.resolve(manifest_list_path); /// Embed the stable transaction identifier in the snapshot summary so that a retry after crash /// can detect the commit already happened by scanning the live snapshots array, without extra S3 @@ -1485,8 +1493,8 @@ bool IcebergMetadata::commitImportPartitionTransactionImpl( new_snapshot->getObject(Iceberg::f_summary)->set( Iceberg::f_clickhouse_export_partition_transaction_id, transaction_id); - String manifest_entry_name; - String storage_manifest_entry_name; + Iceberg::IcebergPathFromMetadata manifest_entry_path; + String storage_manifest_entry_path; Int64 manifest_lengths = 0; /// Tracks whether the snapshot has become visible to readers. 
@@ -1503,7 +1511,7 @@ bool IcebergMetadata::commitImportPartitionTransactionImpl( /// because this replica or some other replica might attempt to commit the same transaction later /// todo arthur: in the future, we should consider failing the entire task if retry_because_of_metadata_conflict = true - object_storage->removeObjectIfExists(StoredObject(storage_manifest_entry_name)); + object_storage->removeObjectIfExists(StoredObject(storage_manifest_entry_path)); object_storage->removeObjectIfExists(StoredObject(storage_manifest_list_name)); if (retry_because_of_metadata_conflict) @@ -1534,6 +1542,7 @@ bool IcebergMetadata::commitImportPartitionTransactionImpl( context, getLogger("IcebergWrites").get(), persistent_components.table_uuid, + metadata_compression_method, true); } @@ -1578,13 +1587,12 @@ bool IcebergMetadata::commitImportPartitionTransactionImpl( try { { - auto result = filename_generator.generateManifestEntryName(); - manifest_entry_name = result.path_in_metadata; - storage_manifest_entry_name = result.path_in_storage; + manifest_entry_path = filename_generator.generateManifestEntryName(); + storage_manifest_entry_path = resolver.resolve(manifest_entry_path); } auto buffer_manifest_entry = object_storage->writeObject( - StoredObject(storage_manifest_entry_name), WriteMode::Rewrite, std::nullopt, DBMS_DEFAULT_BUFFER_SIZE, context->getWriteSettings()); + StoredObject(storage_manifest_entry_path), WriteMode::Rewrite, std::nullopt, DBMS_DEFAULT_BUFFER_SIZE, context->getWriteSettings()); try { @@ -1598,7 +1606,7 @@ bool IcebergMetadata::commitImportPartitionTransactionImpl( partition_columns, partition_values, partition_types, - data_file_paths, + data_file_paths_in_metadata, std::nullopt, /// per_file_stats is filled, no need for the generic aggregate sample_block, new_snapshot, @@ -1624,7 +1632,7 @@ bool IcebergMetadata::commitImportPartitionTransactionImpl( try { generateManifestList( - filename_generator, metadata, object_storage, context, {manifest_entry_name}, new_snapshot, manifest_lengths, *buffer_manifest_list, Iceberg::FileContentType::DATA, true); + persistent_components.path_resolver, metadata, object_storage, context, {manifest_entry_path}, new_snapshot, {manifest_lengths}, *buffer_manifest_list, Iceberg::FileContentType::DATA, true); buffer_manifest_list->finalize(); } catch (...) 
@@ -1639,30 +1647,27 @@ bool IcebergMetadata::commitImportPartitionTransactionImpl( Poco::JSON::Stringifier::stringify(metadata, oss, 4); std::string json_representation = removeEscapedSlashes(oss.str()); - LOG_DEBUG(log, "Writing new metadata file {}", storage_metadata_name); - auto hint = filename_generator.generateVersionHint(); + LOG_DEBUG(log, "Writing new metadata file {}", metadata_info.path); + auto hint_path = filename_generator.generateVersionHint(); if (!writeMetadataFileAndVersionHint( - storage_metadata_name, + persistent_components.path_resolver, + metadata_info, json_representation, - hint.path_in_storage, - storage_metadata_name, + hint_path, object_storage, context, - metadata_compression_method, data_lake_settings[DataLakeStorageSetting::iceberg_use_version_hint])) { - LOG_DEBUG(log, "Failed to write metadata {}, retrying", storage_metadata_name); + LOG_DEBUG(log, "Failed to write metadata {}, retrying", metadata_info.path); cleanup(true); return false; } - LOG_DEBUG(log, "Metadata file {} written", storage_metadata_name); + LOG_DEBUG(log, "Metadata file {} written", metadata_info.path); if (catalog) { - String catalog_filename = metadata_name; - if (!catalog_filename.starts_with(blob_storage_type_name)) - catalog_filename = blob_storage_type_name + "://" + blob_storage_namespace_name + "/" + metadata_name; + auto catalog_filename = resolver.resolveForCatalog(metadata_info.path); const auto & [namespace_name, table_name] = DataLake::parseTableName(table_id.getTableName()); if (!catalog->updateMetadata(namespace_name, table_name, catalog_filename, new_snapshot)) @@ -1735,9 +1740,13 @@ void IcebergMetadata::commitExportPartitionTransaction( const std::vector & partition_values, SharedHeader sample_block, const std::vector & data_file_paths, - StorageObjectStorageConfigurationPtr configuration, ContextPtr context) { + std::vector data_file_paths_in_metadata; + for (const auto & path : data_file_paths) + { + data_file_paths_in_metadata.push_back(Iceberg::IcebergPathFromMetadata::deserialize(path)); + } MetadataFileWithInfo updated_metadata_file_info = getLatestOrExplicitMetadataFileAndVersion( object_storage, @@ -1747,6 +1756,7 @@ void IcebergMetadata::commitExportPartitionTransaction( context, getLogger("IcebergMetadata").get(), persistent_components.table_uuid, + persistent_components.metadata_compression_method, true); /// Latest metadata is ALWAYS necessary to commit - but we abort in case schema or partition spec changed @@ -1797,37 +1807,22 @@ void IcebergMetadata::commitExportPartitionTransaction( const auto partition_types = partitioner.getResultTypes(); const auto metadata_compression_method = persistent_components.metadata_compression_method; - auto config_path = persistent_components.table_path; - if (config_path.empty() || config_path.back() != '/') - config_path += "/"; - if (!config_path.starts_with('/')) - config_path = '/' + config_path; - - FileNamesGenerator filename_generator; - if (!context->getSettingsRef()[Setting::write_full_path_in_iceberg_metadata]) - { - filename_generator = FileNamesGenerator( - config_path, config_path, (catalog != nullptr && catalog->isTransactional()), metadata_compression_method, write_format); - } - else - { - auto bucket = metadata->getValue(Iceberg::f_location); - if (bucket.empty() || bucket.back() != '/') - bucket += "/"; - filename_generator = FileNamesGenerator( - bucket, config_path, (catalog != nullptr && catalog->isTransactional()), metadata_compression_method, write_format); - } + + FileNamesGenerator 
filename_generator = FileNamesGenerator( + persistent_components.path_resolver.getTableLocation(), + (catalog != nullptr && catalog->isTransactional()), metadata_compression_method, write_format); + filename_generator.setVersion(updated_metadata_file_info.version + 1); /// Load per-file sidecar stats, necessary to populate the manifest file stats. std::vector per_file_stats; - const Int64 total_data_files = static_cast(data_file_paths.size()); + const Int64 total_data_files = static_cast(data_file_paths_in_metadata.size()); Int64 total_rows = 0; Int64 total_chunks_size = 0; - per_file_stats.reserve(data_file_paths.size()); - for (const auto & path : data_file_paths) + per_file_stats.reserve(data_file_paths_in_metadata.size()); + for (const auto & path : data_file_paths_in_metadata) { - const auto sidecar_path = getIcebergExportPartSidecarStoragePath(path); + const auto sidecar_path = getIcebergExportPartSidecarStoragePath(persistent_components.path_resolver.resolve(path)); auto stats = readDataFileSidecar(sidecar_path, object_storage, context); total_rows += stats.record_count; total_chunks_size += stats.file_size_in_bytes; @@ -1849,15 +1844,13 @@ void IcebergMetadata::commitExportPartitionTransaction( partition_columns, partition_types, sample_block, - data_file_paths, + data_file_paths_in_metadata, per_file_stats, total_data_files, total_rows, total_chunks_size, catalog, table_id, - configuration->getTypeName(), - configuration->getNamespace(), context)) { return; diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h index 664473fef031..d3a62eb9cd8b 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h @@ -138,7 +138,6 @@ class IcebergMetadata : public IDataLakeMetadata const std::vector & partition_values, SharedHeader sample_block, const std::vector & data_file_paths, - StorageObjectStorageConfigurationPtr configuration, ContextPtr context) override; CompressionMethod getCompressionMethod() const { return persistent_components.metadata_compression_method; } @@ -214,15 +213,13 @@ class IcebergMetadata : public IDataLakeMetadata const std::vector & partition_columns, const std::vector & partition_types, SharedHeader sample_block, - const std::vector & data_file_paths, + const std::vector & data_file_paths_in_metadata, const std::vector & per_file_stats, Int64 total_data_files, Int64 total_rows, Int64 total_chunks_size, std::shared_ptr catalog, const StorageID & table_id, - const String & blob_storage_type_name, - const String & blob_storage_namespace_name, ContextPtr context); LoggerPtr log; diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergWrites.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergWrites.cpp index 160f8f6e5349..1af1659f9808 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergWrites.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergWrites.cpp @@ -1326,25 +1326,9 @@ IcebergImportSink::IcebergImportSink( const auto metadata_compression_method = persistent_table_components.metadata_compression_method; - auto config_path = persistent_table_components.table_path; - if (config_path.empty() || config_path.back() != '/') - config_path += "/"; - if (!config_path.starts_with('/')) - config_path = '/' + config_path; - - if (!context_->getSettingsRef()[Setting::write_full_path_in_iceberg_metadata]) - { - filename_generator = FileNamesGenerator( - config_path, 
config_path, (catalog != nullptr && catalog->isTransactional()), metadata_compression_method, write_format); - } - else - { - auto bucket = metadata_json->getValue(Iceberg::f_location); - if (bucket.empty() || bucket.back() != '/') - bucket += "/"; - filename_generator = FileNamesGenerator( - bucket, config_path, (catalog != nullptr && catalog->isTransactional()), metadata_compression_method, write_format); - } + filename_generator = FileNamesGenerator( + persistent_table_components.path_resolver.getTableLocation(), + (catalog != nullptr && catalog->isTransactional()), metadata_compression_method, write_format); const auto [last_version, unused_meta_path, unused_compression] = getLatestOrExplicitMetadataFileAndVersion( object_storage, @@ -1353,7 +1337,10 @@ IcebergImportSink::IcebergImportSink( persistent_table_components.metadata_cache, context_, getLogger("IcebergWrites").get(), - persistent_table_components.table_uuid); + persistent_table_components.table_uuid, + metadata_compression_method, + true); + (void)unused_meta_path; (void)unused_compression; @@ -1364,6 +1351,7 @@ IcebergImportSink::IcebergImportSink( context->getSettingsRef()[Setting::iceberg_insert_max_bytes_in_data_file], current_schema->getArray(Iceberg::f_fields), filename_generator, + persistent_table_components.path_resolver, object_storage, context, format_settings, diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/MultipleFileWriter.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/MultipleFileWriter.cpp index c52498b00044..be5477f7b608 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/MultipleFileWriter.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/MultipleFileWriter.cpp @@ -53,7 +53,7 @@ void MultipleFileWriter::startNewFile() data_file_names.push_back(metadata_path); if (new_file_path_callback) - new_file_path_callback(metadata_path); + new_file_path_callback(metadata_path.serialize()); buffer = object_storage->writeObject( StoredObject(storage_path), WriteMode::Rewrite, std::nullopt, DBMS_DEFAULT_BUFFER_SIZE, context->getWriteSettings()); diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp index 5c7f24ebb805..aebf9e99a508 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp @@ -129,7 +129,7 @@ static bool isTemporaryMetadataFile(const String & file_name) return Poco::UUID{}.tryParse(substring); } -static MetadataFileWithInfo getMetadataFileAndVersion(const std::string & path) +MetadataFileWithInfo getMetadataFileAndVersion(const std::string & path) { String file_name = std::filesystem::path(path).filename(); if (isTemporaryMetadataFile(file_name)) diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 226c64039c32..50e53f477141 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -702,7 +702,6 @@ void StorageObjectStorage::commitExportPartitionTransaction( iceberg_commit_export_partition_arguments.partition_values, std::make_shared(getInMemoryMetadataPtr()->getSampleBlock()), exported_paths, - configuration, local_context); return; } From e5ef89fa4b86e39dcc961dea4c2eb8802bdeff72 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 21 Mar 2026 20:04:36 +0100 Subject: [PATCH 3/4] Fix `return` to `continue` in `addRequestedFileLikeStorageVirtualsToChunk` In 
`addRequestedFileLikeStorageVirtualsToChunk`, the `_row_number` handling block uses `return` instead of `continue` after adding the column to the chunk. This causes the function to exit the loop early, skipping any remaining virtual columns (e.g. `_data_lake_snapshot_version`). When a query requests both `_row_number` and another virtual column after it, the chunk has fewer columns than expected, resulting in: "Invalid number of columns in chunk pushed to OutputPort." The fix was originally in #100116 but was lost during merge because #100208 (revert of #99163) had reintroduced the `return` on master after the fix branch had already resolved it via a different code structure. The regression test `04050_iceberg_virtual_columns_return_bug` is already on master from #100116. https://s3.amazonaws.com/clickhouse-test-reports/json.html?PR=100177&sha=f10711f066fd101124e088ce33061de51ebae0e9&name_0=PR&name_1=Stateless%20tests%20%28amd_debug%2C%20parallel%29 https://github.com/ClickHouse/ClickHouse/issues/87890 Co-Authored-By: Claude Opus 4.6 (1M context) --- src/Storages/VirtualColumnUtils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index 9ff0d939b3dc..73e07184859f 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -413,7 +413,7 @@ void addRequestedFileLikeStorageVirtualsToChunk( column->insertValue(i + row_num_offset); auto null_map = ColumnUInt8::create(chunk.getNumRows(), static_cast(0)); chunk.addColumn(ColumnNullable::create(std::move(column), std::move(null_map))); - return; + continue; } else { From 0d8b7204c95d84aee320363ea35323ec10a50610 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 15 May 2026 09:15:50 -0300 Subject: [PATCH 4/4] fix total file bytes --- .../ObjectStorage/DataLakes/Iceberg/MultipleFileWriter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/MultipleFileWriter.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/MultipleFileWriter.cpp index be5477f7b608..4b3afa826089 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/MultipleFileWriter.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/MultipleFileWriter.cpp @@ -103,7 +103,7 @@ void MultipleFileWriter::finalize() per_file_record_counts.push_back(static_cast(*current_file_num_rows)); /// todo arthur fix the wrong counter for file bytes, probably by backporting something else - per_file_byte_sizes.push_back(static_cast(buffer_bytes)); + per_file_byte_sizes.push_back(static_cast(total_bytes)); per_file_stats_list.push_back(current_file_stats); }
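
For readers skimming the series, the bug described in the third commit reduces to a loop invariant: `addRequestedFileLikeStorageVirtualsToChunk` must append exactly one column per requested virtual column, so an early `return` after the `_row_number` branch leaves the chunk short when another virtual (e.g. `_data_lake_snapshot_version`) is requested after it. The sketch below is a minimal, standalone illustration of that loop shape only; `FakeChunk`, `addVirtuals`, and the column names are simplified stand-ins, not the actual ClickHouse `Chunk` or virtual-column APIs.

```cpp
#include <cassert>
#include <string>
#include <vector>

// Simplified stand-in for a chunk: we only track how many columns were appended.
struct FakeChunk
{
    size_t columns = 0;
    void addColumn() { ++columns; }
};

// Sketch of the loop shape from addRequestedFileLikeStorageVirtualsToChunk.
// Every requested virtual column must contribute exactly one column to the chunk.
void addVirtuals(FakeChunk & chunk, const std::vector<std::string> & requested, bool use_continue)
{
    for (const auto & name : requested)
    {
        if (name == "_row_number")
        {
            chunk.addColumn();
            if (use_continue)
                continue;   // correct: proceed to the remaining requested virtuals
            return;         // buggy: exits the loop, dropping e.g. _data_lake_snapshot_version
        }
        chunk.addColumn();  // all other virtual columns
    }
}

int main()
{
    std::vector<std::string> requested = {"_path", "_row_number", "_data_lake_snapshot_version"};

    FakeChunk buggy;
    addVirtuals(buggy, requested, /*use_continue=*/false);
    assert(buggy.columns == 2);   // one column short of what the output port expects

    FakeChunk fixed;
    addVirtuals(fixed, requested, /*use_continue=*/true);
    assert(fixed.columns == 3);   // matches the number of requested virtuals
}
```

The downstream check that raises "Invalid number of columns in chunk pushed to OutputPort" is exactly this invariant: the number of columns appended must equal the number of requested virtuals, which only the `continue` form guarantees.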