diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp index 6f79801514e6..eb8cd68d8c66 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp @@ -141,14 +141,30 @@ Iceberg::MetadataFileWithInfo getMetadataFileAndVersion(const std::string & path /// v.metadata.json /// v-.metadata.json - generated by FileNamesGenerator::generateMetadataName with use_uuid_in_metadata flag if (file_name.starts_with('v')) - version_str = String(file_name.begin() + 1, file_name.begin() + file_name.find_first_of(".-")); + { + auto delim_pos = file_name.find_first_of(".-"); + if (delim_pos == String::npos || delim_pos <= 1) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Bad metadata file name: '{}'. Expected `vN.metadata.json`, `vN-.metadata.json` or `N-.metadata.json` where N is a version number", + file_name); + version_str = String(file_name.begin() + 1, file_name.begin() + delim_pos); + } /// -.metadata.json else - version_str = String(file_name.begin(), file_name.begin() + file_name.find_first_of('-')); + { + auto dash_pos = file_name.find_first_of('-'); + if (dash_pos == String::npos || dash_pos == 0) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Bad metadata file name: '{}'. Expected `vN.metadata.json`, `vN-.metadata.json` or `N-.metadata.json` where N is a version number", + file_name); + version_str = String(file_name.begin(), file_name.begin() + dash_pos); + } if (!std::all_of(version_str.begin(), version_str.end(), isdigit)) throw Exception( - ErrorCodes::BAD_ARGUMENTS, "Bad metadata file name: '{}'. Expected vN.metadata.json where N is a number", file_name); + ErrorCodes::BAD_ARGUMENTS, "Bad metadata file name: '{}'. Expected vN.metadata.json, vN-.metadata.json or N-.metadata.json where N is a version number", file_name); return MetadataFileWithInfo{ .version = std::stoi(version_str), @@ -336,7 +352,8 @@ std::string getProperFilePathFromMetadataInfo(std::string_view data_path, std::s common_path = trim_backward_slash(common_path); table_location = trim_backward_slash(table_location); - if (data_path.starts_with(table_location) && table_location.ends_with(common_path)) + if (data_path.starts_with(table_location) + && (data_path.size() == table_location.size() || data_path[table_location.size()] == '/')) { return std::filesystem::path{common_path} / trim_forward_slash(data_path.substr(table_location.size())); } @@ -348,6 +365,15 @@ std::string getProperFilePathFromMetadataInfo(std::string_view data_path, std::s { /// connection://bucket auto prefix = table_location.substr(0, table_location.size() - common_path.size()); + if (data_path.size() < prefix.size()) + { + throw ::DB::Exception( + DB::ErrorCodes::BAD_ARGUMENTS, + "Iceberg path resolution failed in the second branch. raw_path='{}', table_location='{}', table_root='{}'", + data_path, + table_location, + common_path); + } return std::string{data_path.substr(prefix.size())}; } diff --git a/tests/queries/0_stateless/04033_iceberg_mismatched_location.reference b/tests/queries/0_stateless/04033_iceberg_mismatched_location.reference new file mode 100644 index 000000000000..2481fe970f6a --- /dev/null +++ b/tests/queries/0_stateless/04033_iceberg_mismatched_location.reference @@ -0,0 +1 @@ +BAD_ARGUMENTS diff --git a/tests/queries/0_stateless/04033_iceberg_mismatched_location.sh b/tests/queries/0_stateless/04033_iceberg_mismatched_location.sh new file mode 100755 index 000000000000..333f80e85622 --- /dev/null +++ b/tests/queries/0_stateless/04033_iceberg_mismatched_location.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash +# Tags: no-fasttest +# Reproduces https://github.com/ClickHouse/ClickHouse/issues/92348 +# Iceberg std::out_of_range exception when metadata `location` field differs from +# ClickHouse table path (e.g. table created by Spark with a different URI scheme/path). + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +TABLE_PATH="04033_iceberg_mismatched/${CLICKHOUSE_TEST_UNIQUE_NAME}" + +${CLICKHOUSE_CLIENT} --allow_experimental_insert_into_iceberg 1 -q " + DROP TABLE IF EXISTS t_iceberg_mismatch; + CREATE TABLE t_iceberg_mismatch (c0 Int) ENGINE = IcebergS3(s3_conn, filename = '${TABLE_PATH}'); + INSERT INTO t_iceberg_mismatch VALUES (1); + DROP TABLE IF EXISTS t_iceberg_mismatch; +" + +# Modify metadata: set location to a very long path, manifest-list to just the filename. +# This simulates a Spark-created table where ClickHouse rewrites the location field +# but Spark's manifest-list entries use short relative paths. +# Before the fix this caused std::out_of_range exception in IcebergPathResolver::resolve +# because table_location is longer than data_path (the manifest filename), +# causing unsigned underflow and out-of-range substr. +${CLICKHOUSE_CLIENT} -q " + SELECT * FROM s3(s3_conn, filename='${TABLE_PATH}/metadata/v2.metadata.json', structure='line String', format='LineAsString') +" | python3 -c " +import json, sys +m = json.load(sys.stdin) +m['location'] = 's3://some-bucket/warehouse/very/deep/nested/path/to/a/spark/created/table/location/with/extra/long/segments/to/ensure/prefix/exceeds/data/path/length' +for s in m.get('snapshots', []): + s['manifest-list'] = s['manifest-list'].rsplit('/', 1)[-1] +print(json.dumps(m)) +" | ${CLICKHOUSE_CLIENT} -q " + INSERT INTO FUNCTION s3(s3_conn, filename='${TABLE_PATH}/metadata/v2.metadata.json', structure='line String', format='LineAsString') + SETTINGS s3_truncate_on_insert=1 + SELECT * FROM input('line String') FORMAT LineAsString +" + +# Disable iceberg metadata cache as a client-level setting so the modified metadata +# file is actually re-read (not served from cache populated during CREATE TABLE above). +# Before the fix this would crash with std::out_of_range in IcebergPathResolver::resolve. +# After the fix it returns a proper BAD_ARGUMENTS error about unresolvable path. +${CLICKHOUSE_CLIENT} --use_iceberg_metadata_files_cache 0 -q " + SELECT * FROM icebergS3(s3_conn, filename='${TABLE_PATH}'); +" 2>&1 | grep -o "BAD_ARGUMENTS" | head -1 diff --git a/tests/queries/0_stateless/04034_iceberg_spark_style_location.reference b/tests/queries/0_stateless/04034_iceberg_spark_style_location.reference new file mode 100644 index 000000000000..d81cc0710eb6 --- /dev/null +++ b/tests/queries/0_stateless/04034_iceberg_spark_style_location.reference @@ -0,0 +1 @@ +42 diff --git a/tests/queries/0_stateless/04034_iceberg_spark_style_location.sh b/tests/queries/0_stateless/04034_iceberg_spark_style_location.sh new file mode 100755 index 000000000000..6267a5ebf573 --- /dev/null +++ b/tests/queries/0_stateless/04034_iceberg_spark_style_location.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# Tags: no-fasttest +# Positive test: Spark-created Iceberg table where metadata location field uses +# a completely different scheme/bucket (s3a://spark-bucket/...) than ClickHouse's path. +# The manifest-list entries use full absolute paths with Spark's URI. +# IcebergPathResolver::resolve should strip table_location prefix and prepend table_root. + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +TABLE_PATH="04034_iceberg_spark/${CLICKHOUSE_TEST_UNIQUE_NAME}" + +${CLICKHOUSE_CLIENT} --allow_experimental_insert_into_iceberg 1 -q " + DROP TABLE IF EXISTS t_iceberg_spark; + CREATE TABLE t_iceberg_spark (c0 Int) ENGINE = IcebergS3(s3_conn, filename = '${TABLE_PATH}'); + INSERT INTO t_iceberg_spark VALUES (42); + DROP TABLE IF EXISTS t_iceberg_spark; +" + +# Simulate Spark: change location to s3a://spark-bucket/warehouse/spark_table +# and rewrite manifest-list paths to use the same Spark prefix. +SPARK_LOCATION='s3a://spark-bucket/warehouse/db/spark_table' +${CLICKHOUSE_CLIENT} --input_format_parallel_parsing 0 --output_format_parallel_formatting 0 -q " + SELECT * FROM s3(s3_conn, filename='${TABLE_PATH}/metadata/v2.metadata.json', structure='line String', format='LineAsString') +" | python3 -c " +import json, sys +m = json.load(sys.stdin) +old_location = m['location'].rstrip('/') +new_location = '${SPARK_LOCATION}'.rstrip('/') +m['location'] = new_location +for s in m.get('snapshots', []): + ml = s['manifest-list'] + # Strip old_location prefix (with or without trailing slash) and rejoin with new_location + for prefix in [old_location + '/', old_location]: + if ml.startswith(prefix): + s['manifest-list'] = new_location + '/' + ml[len(prefix):].lstrip('/') + break +print(json.dumps(m)) +" | ${CLICKHOUSE_CLIENT} -q " + INSERT INTO FUNCTION s3(s3_conn, filename='${TABLE_PATH}/metadata/v2.metadata.json', structure='line String', format='LineAsString') + SETTINGS s3_truncate_on_insert=1 + SELECT * FROM input('line String') FORMAT LineAsString +" + +# The query should succeed: IcebergPathResolver::resolve strips the Spark +# table_location prefix from the manifest-list path and prepends table_root. +${CLICKHOUSE_CLIENT} --use_iceberg_metadata_files_cache 0 -q " + SELECT * FROM icebergS3(s3_conn, filename='${TABLE_PATH}'); +" diff --git a/tests/queries/0_stateless/04061_iceberg_bad_metadata_file_name.reference b/tests/queries/0_stateless/04061_iceberg_bad_metadata_file_name.reference new file mode 100644 index 000000000000..2481fe970f6a --- /dev/null +++ b/tests/queries/0_stateless/04061_iceberg_bad_metadata_file_name.reference @@ -0,0 +1 @@ +BAD_ARGUMENTS diff --git a/tests/queries/0_stateless/04061_iceberg_bad_metadata_file_name.sh b/tests/queries/0_stateless/04061_iceberg_bad_metadata_file_name.sh new file mode 100755 index 000000000000..b701964ad41f --- /dev/null +++ b/tests/queries/0_stateless/04061_iceberg_bad_metadata_file_name.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash +# Tags: no-fasttest +# Regression test: passing a garbage iceberg_metadata_file_path (e.g. '.*') +# used to cause std::length_error inside getMetadataFileAndVersion because +# find_first_of returned npos. After the fix it must return BAD_ARGUMENTS. + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +# The exact query from the fuzzer report. +${CLICKHOUSE_CLIENT} -q " + SELECT * FROM icebergS3('http://localhost:11111/test/est', 'clickhouse', 'clickhouse', SETTINGS iceberg_metadata_file_path = '.*') +" 2>&1 | grep -o "BAD_ARGUMENTS" | head -1