Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 30 additions & 4 deletions src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,14 +141,30 @@ Iceberg::MetadataFileWithInfo getMetadataFileAndVersion(const std::string & path
/// v<V>.metadata.json
/// v<V>-<random-uuid>.metadata.json - generated by FileNamesGenerator::generateMetadataName with use_uuid_in_metadata flag
if (file_name.starts_with('v'))
version_str = String(file_name.begin() + 1, file_name.begin() + file_name.find_first_of(".-"));
{
auto delim_pos = file_name.find_first_of(".-");
if (delim_pos == String::npos || delim_pos <= 1)
throw Exception(
ErrorCodes::BAD_ARGUMENTS,
"Bad metadata file name: '{}'. Expected `vN.metadata.json`, `vN-<uuid>.metadata.json` or `N-<uuid>.metadata.json` where N is a version number",
file_name);
version_str = String(file_name.begin() + 1, file_name.begin() + delim_pos);
}
/// <V>-<random-uuid>.metadata.json
else
version_str = String(file_name.begin(), file_name.begin() + file_name.find_first_of('-'));
{
auto dash_pos = file_name.find_first_of('-');
if (dash_pos == String::npos || dash_pos == 0)
throw Exception(
ErrorCodes::BAD_ARGUMENTS,
"Bad metadata file name: '{}'. Expected `vN.metadata.json`, `vN-<uuid>.metadata.json` or `N-<uuid>.metadata.json` where N is a version number",
file_name);
version_str = String(file_name.begin(), file_name.begin() + dash_pos);
}

if (!std::all_of(version_str.begin(), version_str.end(), isdigit))
throw Exception(
ErrorCodes::BAD_ARGUMENTS, "Bad metadata file name: '{}'. Expected vN.metadata.json where N is a number", file_name);
ErrorCodes::BAD_ARGUMENTS, "Bad metadata file name: '{}'. Expected vN.metadata.json, vN-<uuid>.metadata.json or N-<uuid>.metadata.json where N is a version number", file_name);

return MetadataFileWithInfo{
.version = std::stoi(version_str),
Expand Down Expand Up @@ -336,7 +352,8 @@ std::string getProperFilePathFromMetadataInfo(std::string_view data_path, std::s
common_path = trim_backward_slash(common_path);
table_location = trim_backward_slash(table_location);

if (data_path.starts_with(table_location) && table_location.ends_with(common_path))
if (data_path.starts_with(table_location)
&& (data_path.size() == table_location.size() || data_path[table_location.size()] == '/'))
{
return std::filesystem::path{common_path} / trim_forward_slash(data_path.substr(table_location.size()));
}
Expand All @@ -348,6 +365,15 @@ std::string getProperFilePathFromMetadataInfo(std::string_view data_path, std::s
{
/// connection://bucket
auto prefix = table_location.substr(0, table_location.size() - common_path.size());
if (data_path.size() < prefix.size())
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As I see this change is from ClickHouse#100420.
Do we want to backport this huge PR too?

{
throw ::DB::Exception(
DB::ErrorCodes::BAD_ARGUMENTS,
"Iceberg path resolution failed in the second branch. raw_path='{}', table_location='{}', table_root='{}'",
data_path,
table_location,
common_path);
}
return std::string{data_path.substr(prefix.size())};
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
BAD_ARGUMENTS
47 changes: 47 additions & 0 deletions tests/queries/0_stateless/04033_iceberg_mismatched_location.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/usr/bin/env bash
# Tags: no-fasttest
# Reproduces https://github.com/ClickHouse/ClickHouse/issues/92348
# Iceberg std::out_of_range exception when metadata `location` field differs from
# ClickHouse table path (e.g. table created by Spark with a different URI scheme/path).

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

TABLE_PATH="04033_iceberg_mismatched/${CLICKHOUSE_TEST_UNIQUE_NAME}"

${CLICKHOUSE_CLIENT} --allow_experimental_insert_into_iceberg 1 -q "
DROP TABLE IF EXISTS t_iceberg_mismatch;
CREATE TABLE t_iceberg_mismatch (c0 Int) ENGINE = IcebergS3(s3_conn, filename = '${TABLE_PATH}');
INSERT INTO t_iceberg_mismatch VALUES (1);
DROP TABLE IF EXISTS t_iceberg_mismatch;
"

# Modify metadata: set location to a very long path, manifest-list to just the filename.
# This simulates a Spark-created table where ClickHouse rewrites the location field
# but Spark's manifest-list entries use short relative paths.
# Before the fix this caused std::out_of_range exception in IcebergPathResolver::resolve
# because table_location is longer than data_path (the manifest filename),
# causing unsigned underflow and out-of-range substr.
${CLICKHOUSE_CLIENT} -q "
SELECT * FROM s3(s3_conn, filename='${TABLE_PATH}/metadata/v2.metadata.json', structure='line String', format='LineAsString')
" | python3 -c "
import json, sys
m = json.load(sys.stdin)
m['location'] = 's3://some-bucket/warehouse/very/deep/nested/path/to/a/spark/created/table/location/with/extra/long/segments/to/ensure/prefix/exceeds/data/path/length'
for s in m.get('snapshots', []):
s['manifest-list'] = s['manifest-list'].rsplit('/', 1)[-1]
print(json.dumps(m))
" | ${CLICKHOUSE_CLIENT} -q "
INSERT INTO FUNCTION s3(s3_conn, filename='${TABLE_PATH}/metadata/v2.metadata.json', structure='line String', format='LineAsString')
SETTINGS s3_truncate_on_insert=1
SELECT * FROM input('line String') FORMAT LineAsString
"

# Disable iceberg metadata cache as a client-level setting so the modified metadata
# file is actually re-read (not served from cache populated during CREATE TABLE above).
# Before the fix this would crash with std::out_of_range in IcebergPathResolver::resolve.
# After the fix it returns a proper BAD_ARGUMENTS error about unresolvable path.
${CLICKHOUSE_CLIENT} --use_iceberg_metadata_files_cache 0 -q "
SELECT * FROM icebergS3(s3_conn, filename='${TABLE_PATH}');
" 2>&1 | grep -o "BAD_ARGUMENTS" | head -1
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
42
50 changes: 50 additions & 0 deletions tests/queries/0_stateless/04034_iceberg_spark_style_location.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#!/usr/bin/env bash
# Tags: no-fasttest
# Positive test: Spark-created Iceberg table where metadata location field uses
# a completely different scheme/bucket (s3a://spark-bucket/...) than ClickHouse's path.
# The manifest-list entries use full absolute paths with Spark's URI.
# IcebergPathResolver::resolve should strip table_location prefix and prepend table_root.

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

TABLE_PATH="04034_iceberg_spark/${CLICKHOUSE_TEST_UNIQUE_NAME}"

${CLICKHOUSE_CLIENT} --allow_experimental_insert_into_iceberg 1 -q "
DROP TABLE IF EXISTS t_iceberg_spark;
CREATE TABLE t_iceberg_spark (c0 Int) ENGINE = IcebergS3(s3_conn, filename = '${TABLE_PATH}');
INSERT INTO t_iceberg_spark VALUES (42);
DROP TABLE IF EXISTS t_iceberg_spark;
"

# Simulate Spark: change location to s3a://spark-bucket/warehouse/spark_table
# and rewrite manifest-list paths to use the same Spark prefix.
SPARK_LOCATION='s3a://spark-bucket/warehouse/db/spark_table'
${CLICKHOUSE_CLIENT} --input_format_parallel_parsing 0 --output_format_parallel_formatting 0 -q "
SELECT * FROM s3(s3_conn, filename='${TABLE_PATH}/metadata/v2.metadata.json', structure='line String', format='LineAsString')
" | python3 -c "
import json, sys
m = json.load(sys.stdin)
old_location = m['location'].rstrip('/')
new_location = '${SPARK_LOCATION}'.rstrip('/')
m['location'] = new_location
for s in m.get('snapshots', []):
ml = s['manifest-list']
# Strip old_location prefix (with or without trailing slash) and rejoin with new_location
for prefix in [old_location + '/', old_location]:
if ml.startswith(prefix):
s['manifest-list'] = new_location + '/' + ml[len(prefix):].lstrip('/')
break
print(json.dumps(m))
" | ${CLICKHOUSE_CLIENT} -q "
INSERT INTO FUNCTION s3(s3_conn, filename='${TABLE_PATH}/metadata/v2.metadata.json', structure='line String', format='LineAsString')
SETTINGS s3_truncate_on_insert=1
SELECT * FROM input('line String') FORMAT LineAsString
"

# The query should succeed: IcebergPathResolver::resolve strips the Spark
# table_location prefix from the manifest-list path and prepends table_root.
${CLICKHOUSE_CLIENT} --use_iceberg_metadata_files_cache 0 -q "
SELECT * FROM icebergS3(s3_conn, filename='${TABLE_PATH}');
"
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
BAD_ARGUMENTS
14 changes: 14 additions & 0 deletions tests/queries/0_stateless/04061_iceberg_bad_metadata_file_name.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/usr/bin/env bash
# Tags: no-fasttest
# Regression test: passing a garbage iceberg_metadata_file_path (e.g. '.*')
# used to cause std::length_error inside getMetadataFileAndVersion because
# find_first_of returned npos. After the fix it must return BAD_ARGUMENTS.

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

# The exact query from the fuzzer report.
${CLICKHOUSE_CLIENT} -q "
SELECT * FROM icebergS3('http://localhost:11111/test/est', 'clickhouse', 'clickhouse', SETTINGS iceberg_metadata_file_path = '.*')
" 2>&1 | grep -o "BAD_ARGUMENTS" | head -1
Loading