diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp index dac72f844453..62c4ac5482f1 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -39,10 +40,12 @@ #include #include #include +#include #include #include #include #include +#include /// UINT16 and UINT32 are processed separately, see comments in readColumnFromArrowColumn. @@ -86,6 +89,25 @@ static bool emptyTimezoneAsUTC(const std::string & format_name, const FormatSett return format_name == "Parquet" && format_settings.parquet.local_time_as_utc; } +static bool isUUIDField(const arrow::Field & field) +{ + // Check for our ClickHouse/Arrow extension name + if (field.HasMetadata()) + { + auto metadata = field.metadata(); + auto ext_name = metadata->Get("ARROW:extension:name"); + if (ext_name.ok() && *ext_name == "arrow.uuid") + return true; + + // Also check the Parquet logical type hint we added to the writer + auto pq_type = metadata->Get("PARQUET:logical_type"); + if (pq_type.ok() && *pq_type == "UUID") + return true; + } + return field.type()->id() == arrow::Type::EXTENSION && + std::static_pointer_cast(field.type())->extension_name() == "arrow.uuid"; +} + /// Inserts numeric data right into internal column data to reduce an overhead template > static ColumnWithTypeAndName readColumnWithNumericData(const std::shared_ptr & arrow_column, const String & column_name) @@ -502,6 +524,61 @@ static ColumnWithTypeAndName readColumnWithDecimalData(const std::shared_ptr(arrow_column, column_name, internal_type); } +static ColumnWithTypeAndName readColumnWithUUIDFromFixedBinaryData( + const std::shared_ptr & arrow_column, + const std::string & column_name, + DataTypePtr type_hint) +{ + auto column = type_hint->createColumn(); + auto & column_data = assert_cast &>(*column).getData(); + 
column_data.reserve(arrow_column->length()); + + for (int chunk_i = 0, num_chunks = arrow_column->num_chunks(); chunk_i < num_chunks; ++chunk_i) + { + const auto & arrow_chunk = *(arrow_column->chunk(chunk_i)); + const auto & fixed_binary_array = assert_cast(arrow_chunk); + + // Security check: Ensure we actually got 16 bytes per row + if (fixed_binary_array.byte_width() != sizeof(UUID)) + throw Exception(ErrorCodes::INCORRECT_DATA, + "Cannot read UUID from Arrow FixedSizeBinary array with byte_width != {}", sizeof(UUID)); + + for (int64_t i = 0; i < fixed_binary_array.length(); ++i) + { + if (fixed_binary_array.IsNull(i)) + { + // The Nullable wrapper handles the actual null map later; just insert a dummy value + column_data.emplace_back(UUID{}); + } + else + { + UUID res; + std::memcpy(&res, fixed_binary_array.GetValue(i), 16); + + auto * bytes = reinterpret_cast(&res); + + // Only swap the 64-bit halves back if the host CPU is Little-Endian + if constexpr (std::endian::native == std::endian::little) + { + std::reverse(bytes, bytes + 8); + std::reverse(bytes + 8, bytes + 16); + } + else + { + // Big-Endian: The bytes are already in network order, but the + // 64-bit halves are in the wrong order for Arrow (low||high). + // Swap the first 8 bytes with the second 8 bytes. 
+ std::swap_ranges(bytes, bytes + 8, bytes + 8); + } + + column_data.emplace_back(res); + } + } + } + + return {std::move(column), type_hint, column_name}; +} + /// Creates a null bytemap from arrow's null bitmap static ColumnPtr readByteMapFromArrowColumn(const std::shared_ptr & arrow_column) { @@ -945,11 +1022,49 @@ static ColumnWithTypeAndName readNonNullableColumnFromArrowColumn( } return readColumnWithStringData(arrow_column, column_name); } + case arrow::Type::EXTENSION: + { + // Unwrap the Extension array into raw physical chunks + auto ext_type = std::static_pointer_cast(arrow_column->type()); + std::vector> storage_chunks; + + for (int i = 0; i < arrow_column->num_chunks(); ++i) + { + auto ext_array = std::static_pointer_cast(arrow_column->chunk(i)); + storage_chunks.push_back(ext_array->storage()); + } + + auto storage_column = std::make_shared(storage_chunks, ext_type->storage_type()); + + std::shared_ptr storage_field = nullptr; + + if (arrow_field) + storage_field = std::make_shared(arrow_field->name(), + ext_type->storage_type(), + arrow_field->nullable(), + arrow_field->metadata()); + + return readNonNullableColumnFromArrowColumn( + storage_column, + column_name, + full_column_name, + dictionary_infos, + type_hint, + is_map_nested_column, + make_nullable_if_low_cardinality, + geo_metadata, + settings, + storage_field, + parquet_columns_to_clickhouse, + clickhouse_columns_to_parquet); + } case arrow::Type::FIXED_SIZE_BINARY: { - if (type_hint) + DataTypePtr hint_to_check = type_hint ? 
removeNullable(type_hint) : nullptr; + + if (hint_to_check) { - switch (type_hint->getTypeId()) + switch (hint_to_check->getTypeId()) { case TypeIndex::Int128: return readColumnWithBigIntegerFromFixedBinaryData(arrow_column, column_name, type_hint); @@ -959,11 +1074,20 @@ static ColumnWithTypeAndName readNonNullableColumnFromArrowColumn( return readColumnWithBigIntegerFromFixedBinaryData(arrow_column, column_name, type_hint); case TypeIndex::UInt256: return readColumnWithBigIntegerFromFixedBinaryData(arrow_column, column_name, type_hint); + case TypeIndex::UUID: + return readColumnWithUUIDFromFixedBinaryData(arrow_column, column_name, type_hint); default: break; } } + /// Correctly triggers the UUID reader for metadata-flagged columns. + if (arrow_field && isUUIDField(*arrow_field)) + { + return readColumnWithUUIDFromFixedBinaryData(arrow_column, column_name, std::make_shared()); + } + + // Default fallback return readColumnWithFixedStringData(arrow_column, column_name); } case arrow::Type::LARGE_STRING: @@ -1461,12 +1585,73 @@ static void checkStatus(const arrow::Status & status, const String & column_name throw Exception{ErrorCodes::UNKNOWN_EXCEPTION, "Error with a {} column '{}': {}.", format_name, column_name, status.ToString()}; } +static std::shared_ptr unwrapArrowExtensionTypesRecursively(const std::shared_ptr & type) +{ + if (!type) return type; + + if (type->id() == arrow::Type::EXTENSION) + return std::static_pointer_cast(type)->storage_type(); + + if (type->id() == arrow::Type::LIST) + { + auto list_type = std::static_pointer_cast(type); + auto value_field = list_type->value_field(); + return arrow::list(value_field->WithType(unwrapArrowExtensionTypesRecursively(value_field->type()))); + } + + if (type->id() == arrow::Type::LARGE_LIST) + { + auto large_list_type = std::static_pointer_cast(type); + auto value_field = large_list_type->value_field(); + return 
arrow::large_list(value_field->WithType(unwrapArrowExtensionTypesRecursively(value_field->type()))); + } + + if (type->id() == arrow::Type::FIXED_SIZE_LIST) + { + auto fixed_list = std::static_pointer_cast(type); + auto value_field = fixed_list->value_field(); + return arrow::fixed_size_list(value_field->WithType(unwrapArrowExtensionTypesRecursively(value_field->type())), fixed_list->list_size()); + } + + if (type->id() == arrow::Type::MAP) + { + auto map_type = std::static_pointer_cast(type); + auto item_field = map_type->item_field(); + + // arrow::map expects a DataType for the key (since keys cannot be nullable), + // but accepts a Field for the item to preserve custom nullability. + return arrow::map(unwrapArrowExtensionTypesRecursively(map_type->key_type()), + item_field->WithType(unwrapArrowExtensionTypesRecursively(item_field->type())), + map_type->keys_sorted() + ); + } + + if (type->id() == arrow::Type::STRUCT) + { + auto struct_type = std::static_pointer_cast(type); + std::vector> new_fields; + for (const auto & struct_field : struct_type->fields()) + { + // WithType preserves the field name and nullable status, only changing the underlying type + new_fields.push_back(struct_field->WithType(unwrapArrowExtensionTypesRecursively(struct_field->type()))); + } + return arrow::struct_(new_fields); + } + + return type; +} + /// Create empty arrow column using specified field static std::shared_ptr createArrowColumn(const std::shared_ptr & field, const String & format_name) { + // We unwrap the type ONLY for the `build_type` + // passed to MakeBuilder. We DO NOT mutate the `field` itself. 
+ // This provides Arrow with the primitive storage type it needs for RAM allocation + // without destroying the logical metadata required by ClickHouse for type inference + std::shared_ptr build_type = unwrapArrowExtensionTypesRecursively(field->type()); + std::unique_ptr array_builder; - /// default_memory_pool() uses posix_memalign which is intercepted and counted in MemoryTracker. - arrow::Status status = MakeBuilder(arrow::default_memory_pool(), field->type(), &array_builder); + arrow::Status status = MakeBuilder(arrow::default_memory_pool(), build_type, &array_builder); checkStatus(status, field->name(), format_name); std::shared_ptr arrow_array; diff --git a/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp b/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp index e69b69d1cfad..35973062bdb3 100644 --- a/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp +++ b/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -28,6 +29,7 @@ #include #include #include +#include #define FOR_INTERNAL_NUMERIC_TYPES(M) \ M(Int8, arrow::Int8Builder) \ @@ -68,6 +70,33 @@ namespace DB extern const int ILLEGAL_COLUMN; } + class ArrowUUIDExtensionType : public arrow::ExtensionType + { + public: + ArrowUUIDExtensionType() : arrow::ExtensionType(arrow::fixed_size_binary(16)) {} + + std::string extension_name() const override { return "arrow.uuid"; } + + bool ExtensionEquals(const arrow::ExtensionType& other) const override + { + return other.extension_name() == this->extension_name(); + } + + std::shared_ptr MakeArray(std::shared_ptr data) const override + { + return std::make_shared(data); + } + + arrow::Result> Deserialize( + std::shared_ptr /* storage_type */, + const std::string& /* serialized_data */) const override + { + return std::make_shared(); + } + + std::string Serialize() const override { return ""; } + }; + static const std::initializer_list>> internal_type_to_arrow_type = { 
{"UInt8", arrow::uint8()}, @@ -231,6 +260,50 @@ namespace DB } } + static void fillArrowArrayWithUUIDColumnData( + const ColumnPtr & column, + const PaddedPODArray * null_bytemap, + const String & format_name, + arrow::ArrayBuilder * array_builder, + size_t start, + size_t end) + { + const auto * col_uuid = assert_cast *>(column.get()); + + if (array_builder->type()->id() != arrow::Type::FIXED_SIZE_BINARY) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Cannot fill arrow array with {} data for format {}", column->getName(), format_name); + + auto * fixed_builder = assert_cast(array_builder); + const auto & uuid_data = col_uuid->getData(); + + for (size_t i = start; i < end; ++i) + { + if (null_bytemap && (*null_bytemap)[i]) + { + arrow::Status status = fixed_builder->AppendNull(); + checkStatus(status, column->getName(), format_name); + continue; + } + + UUID res = uuid_data[i]; + auto * bytes = reinterpret_cast(&res); + + if constexpr (std::endian::native == std::endian::little) + { + std::reverse(bytes, bytes + 8); + std::reverse(bytes + 8, bytes + 16); + } + else + { + std::swap_ranges(bytes, bytes + 8, bytes + 8); + } + + arrow::Status status = fixed_builder->Append(reinterpret_cast(&res)); + checkStatus(status, column->getName(), format_name); + } + } + static void fillArrowArray( const String & column_name, ColumnPtr & column, @@ -912,6 +985,9 @@ namespace DB case TypeIndex::UInt256: fillArrowArrayWithBigIntegerColumnData(column, null_bytemap, format_name, array_builder, start, end); break; + case TypeIndex::UUID: + fillArrowArrayWithUUIDColumnData(column, null_bytemap, format_name, array_builder, start, end); + break; #define DISPATCH(CPP_NUMERIC_TYPE, ARROW_BUILDER_TYPE) \ case TypeIndex::CPP_NUMERIC_TYPE: \ fillArrowArrayWithNumericColumnData(column, null_bytemap, format_name, array_builder, start, end); \ @@ -967,13 +1043,13 @@ namespace DB } static std::shared_ptr getArrowType( - DataTypePtr column_type, ColumnPtr column, const std::string & column_name, 
const std::string & format_name, const CHColumnToArrowColumn::Settings & settings, bool * out_is_column_nullable) + DataTypePtr column_type, ColumnPtr column, const std::string & column_name, const std::string & format_name, const CHColumnToArrowColumn::Settings & settings, bool * out_is_column_nullable, bool for_builder = false) { if (column_type->isNullable()) { DataTypePtr nested_type = assert_cast(column_type.get())->getNestedType(); ColumnPtr nested_column = assert_cast(column.get())->getNestedColumnPtr(); - auto arrow_type = getArrowType(nested_type, nested_column, column_name, format_name, settings, out_is_column_nullable); + auto arrow_type = getArrowType(nested_type, nested_column, column_name, format_name, settings, out_is_column_nullable, for_builder); *out_is_column_nullable = true; return arrow_type; } @@ -1008,7 +1084,7 @@ namespace DB auto nested_type = assert_cast(column_type.get())->getNestedType(); auto nested_column = assert_cast(column.get())->getDataPtr(); bool is_item_nullable = false; - auto nested_arrow_type = getArrowType(nested_type, nested_column, column_name, format_name, settings, &is_item_nullable); + auto nested_arrow_type = getArrowType(nested_type, nested_column, column_name, format_name, settings, &is_item_nullable, for_builder); return arrow::list(std::make_shared("item", nested_arrow_type, is_item_nullable)); } @@ -1022,7 +1098,7 @@ namespace DB for (size_t i = 0; i != nested_types.size(); ++i) { bool is_field_nullable = false; - auto nested_arrow_type = getArrowType(nested_types[i], tuple_column->getColumnPtr(i), nested_names[i], format_name, settings, &is_field_nullable); + auto nested_arrow_type = getArrowType(nested_types[i], tuple_column->getColumnPtr(i), nested_names[i], format_name, settings, &is_field_nullable, for_builder); nested_fields.push_back(std::make_shared(nested_names[i], nested_arrow_type, is_field_nullable)); } return arrow::struct_(nested_fields); @@ -1036,7 +1112,7 @@ namespace DB const auto & indexes_column 
= lc_column->getIndexesPtr(); return arrow::dictionary( getArrowTypeForLowCardinalityIndexes(indexes_column, settings), - getArrowType(nested_type, nested_column, column_name, format_name, settings, out_is_column_nullable)); + getArrowType(nested_type, nested_column, column_name, format_name, settings, out_is_column_nullable, for_builder)); } if (isMap(column_type)) @@ -1047,9 +1123,9 @@ namespace DB const auto & columns = assert_cast(column.get())->getNestedData().getColumns(); bool _is_key_nullable = false; - auto key_arrow_type = getArrowType(key_type, columns[0], column_name, format_name, settings, &_is_key_nullable); + auto key_arrow_type = getArrowType(key_type, columns[0], column_name, format_name, settings, &_is_key_nullable, for_builder); bool is_val_nullable = false; - auto val_arrow_type = getArrowType(val_type, columns[1], column_name, format_name, settings, &is_val_nullable); + auto val_arrow_type = getArrowType(val_type, columns[1], column_name, format_name, settings, &is_val_nullable, for_builder); return arrow::map( key_arrow_type, @@ -1080,6 +1156,9 @@ namespace DB if (isIPv4(column_type)) return arrow::uint32(); + if (isUUID(column_type)) + return for_builder ? 
arrow::fixed_size_binary(sizeof(UUID)) : std::make_shared(); + if (isDate(column_type) && settings.output_date_as_uint16) return arrow::uint16(); @@ -1158,13 +1237,27 @@ namespace DB format_name, settings, &is_column_nullable); + + std::shared_ptr field_metadata = nullptr; + if (column_to_field_id && column_to_field_id->contains(header_column.name)) { Int64 field_id = column_to_field_id->at(header_column.name); - auto key_value_metadata = arrow::key_value_metadata({"PARQUET:field_id"}, - {std::to_string(field_id)}); - arrow_fields.emplace_back(std::make_shared(header_column.name, arrow_type, is_column_nullable, key_value_metadata)); + field_metadata = arrow::key_value_metadata({"PARQUET:field_id"}, {std::to_string(field_id)}); } + + // Inject our UUID metadata if it's a root UUID column + if (isUUID(removeNullable(header_column.type))) + { + auto ext_metadata = arrow::key_value_metadata( + {"ARROW:extension:name", "ARROW:extension:metadata", "PARQUET:logical_type"}, + {"arrow.uuid", "", "UUID"} + ); + field_metadata = field_metadata ? 
field_metadata->Merge(*ext_metadata) : ext_metadata; + } + + if (field_metadata) + arrow_fields.emplace_back(std::make_shared(header_column.name, arrow_type, is_column_nullable, field_metadata)); else arrow_fields.emplace_back(std::make_shared(header_column.name, arrow_type, is_column_nullable)); } @@ -1202,8 +1295,13 @@ namespace DB if (!settings.low_cardinality_as_dictionary) column = recursiveRemoveLowCardinality(column); + // Generate the unwrapped builder schema (safe for MakeBuilder) + bool is_column_nullable = false; + auto builder_type = getArrowType( + header_column.type, column, header_column.name, format_name, settings, &is_column_nullable, true /* for_builder */); + std::unique_ptr array_builder; - arrow::Status status = MakeBuilder(arrow::default_memory_pool(), arrow_schema->field(static_cast(column_i))->type(), &array_builder); + arrow::Status status = MakeBuilder(arrow::default_memory_pool(), builder_type, &array_builder); checkStatus(status, column->getName(), format_name); fillArrowArray( @@ -1222,6 +1320,15 @@ namespace DB status = array_builder->Finish(&arrow_array); checkStatus(status, column->getName(), format_name); + // Zero-copy cast to the extension-rich schema (handles infinite nesting) + auto target_type = arrow_schema->field(static_cast(column_i))->type(); + if (!arrow_array->type()->Equals(*target_type)) + { + auto view_result = arrow_array->View(target_type); + checkStatus(view_result.status(), column->getName(), format_name); + arrow_array = view_result.ValueOrDie(); + } + table_data.at(column_i).emplace_back(std::move(arrow_array)); } } diff --git a/src/Processors/Formats/Impl/Parquet/Decoding.cpp b/src/Processors/Formats/Impl/Parquet/Decoding.cpp index 1df543f9f075..b7de7c3fd477 100644 --- a/src/Processors/Formats/Impl/Parquet/Decoding.cpp +++ b/src/Processors/Formats/Impl/Parquet/Decoding.cpp @@ -1473,6 +1473,43 @@ void Float16Converter::convertColumn(std::span data, size_t num_valu } } +static inline UUID decodeParquetUUID(const 
char * data) +{ + UUID res; + std::memcpy(&res, data, 16); + auto * bytes = reinterpret_cast(&res); + + // Parquet demands Big-Endian (network byte order) for UUIDs + if constexpr (std::endian::native == std::endian::little) + { + std::reverse(bytes, bytes + 8); + std::reverse(bytes + 8, bytes + 16); + } + else + { + std::swap_ranges(bytes, bytes + 8, bytes + 8); + } + + return res; +} + +void UUIDConverter::convertColumn(std::span data, size_t num_values, IColumn & col) const +{ + auto & col_data = assert_cast &>(col).getData(); + size_t old_size = col_data.size(); + col_data.resize(old_size + num_values); + + for (size_t i = 0; i < num_values; ++i) + { + col_data[old_size + i] = decodeParquetUUID(data.data() + i * 16); + } +} + +void UUIDConverter::convertField(std::span data, bool /*is_max*/, Field & out) const +{ + out = decodeParquetUUID(data.data()); +} + void FixedStringConverter::convertField(std::span data, bool /*is_max*/, Field & out) const { if (data.size() != input_size) diff --git a/src/Processors/Formats/Impl/Parquet/Decoding.h b/src/Processors/Formats/Impl/Parquet/Decoding.h index 05f6ef6a2e02..e952df66cb34 100644 --- a/src/Processors/Formats/Impl/Parquet/Decoding.h +++ b/src/Processors/Formats/Impl/Parquet/Decoding.h @@ -217,6 +217,14 @@ struct FixedStringConverter : public FixedSizeConverter void convertField(std::span data, bool /*is_max*/, Field & out) const override; }; +struct UUIDConverter : public FixedSizeConverter +{ + UUIDConverter() { input_size = 16; } + + void convertColumn(std::span data, size_t num_values, IColumn & col) const override; + void convertField(std::span data, bool is_max, Field & out) const override; +}; + struct TrivialStringConverter : public StringConverter { bool isTrivial() const override { return true; } diff --git a/src/Processors/Formats/Impl/Parquet/SchemaConverter.cpp b/src/Processors/Formats/Impl/Parquet/SchemaConverter.cpp index f6ea1a4bd6ce..910e3b06c8db 100644 --- 
a/src/Processors/Formats/Impl/Parquet/SchemaConverter.cpp +++ b/src/Processors/Formats/Impl/Parquet/SchemaConverter.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -1154,8 +1155,10 @@ void SchemaConverter::processPrimitiveColumn( if (type != parq::Type::FIXED_LEN_BYTE_ARRAY || element.type_length != 16) throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected physical type for UUID column: {}", thriftToString(element)); - /// TODO [parquet]: Support UUIDs. Make sure to get the byte order right, it seems tricky. - /// For now, fall through to reading as FixedString(16). + out_inferred_type = std::make_shared(); + out_decoder.allow_stats = true; // UUIDs support min/max stats + out_decoder.fixed_size_converter = std::make_shared(); + return; } else if (logical.__isset.FLOAT16) { @@ -1252,12 +1255,19 @@ void SchemaConverter::processPrimitiveColumn( { if (type_hint) { - /// If parquet type is FIXED_LEN_BYTE_ARRAY(16), and type hint is [U]Int128, assume - /// it's binary little-endian [U]Int128. That's how clickhouse parquet writer writes - /// [U]Int128 (btw, we should probably change that to Decimal). - /// Same for FIXED_LEN_BYTE_ARRAY(32) and [U]Int256. - /// We can't leave this conversion to castColumn because it would parse as text. WhichDataType which(type_hint->getTypeId()); + + /// Handle explicit UUID type hint (e.g. SELECT x::UUID) + if (which.isUUID() && element.type_length == 16) + { + out_inferred_type = type_hint; + out_decoder.fixed_size_converter = std::make_shared(); + out_decoder.allow_stats = true; + return; + } + + /// Legacy ClickHouse binary formats for [U]Int128 and [U]Int256. + /// These are written as FIXED_LEN_BYTE_ARRAY(16/32) but without logical types. 
if (which.isInteger() && !which.isNativeInteger() && type_hint->getSizeOfValueInMemory() == size_t(element.type_length)) { @@ -1265,14 +1275,26 @@ void SchemaConverter::processPrimitiveColumn( } } + /// Automatic Inference: If no hint is provided, but the Parquet + /// file metadata explicitly flags this column as a UUID. + if (logical.__isset.UUID && element.type_length == 16) + { + out_inferred_type = std::make_shared(); + out_decoder.fixed_size_converter = std::make_shared(); + out_decoder.allow_stats = true; + return; + } + + /// Default Fallback: If it's not a UUID or a BigInt hint, treat it as FixedString. if (!out_inferred_type) out_inferred_type = std::make_shared(size_t(element.type_length)); + auto converter = std::make_shared(); converter->input_size = size_t(element.type_length); out_decoder.fixed_size_converter = std::move(converter); - /// (The case where type_hint is FixedString is handled above, no need to check for it here.) - out_decoder.allow_stats = !logical.__isset.UUID && WhichDataType(get_output_type_index()).isString(); + /// Stats are only allowed for FixedString if the output is actually a string. 
+ out_decoder.allow_stats = WhichDataType(get_output_type_index()).isString(); return; } } diff --git a/src/Processors/Formats/Impl/Parquet/Write.cpp b/src/Processors/Formats/Impl/Parquet/Write.cpp index 99737d7ff3bd..338ac3c82344 100644 --- a/src/Processors/Formats/Impl/Parquet/Write.cpp +++ b/src/Processors/Formats/Impl/Parquet/Write.cpp @@ -456,6 +456,45 @@ struct ConverterEnumAsString } }; +struct ConverterUUID +{ + using Statistics = StatisticsFixedStringRef; + + const ColumnVector & column; + PODArray buf; + PODArray swapped_buf; + + explicit ConverterUUID(const ColumnPtr & c) : column(assert_cast &>(*c)) {} + + const parquet::FixedLenByteArray * getBatch(size_t offset, size_t count) + { + buf.resize(count); + swapped_buf.resize(count); + + for (size_t i = 0; i < count; ++i) + { + UUID res = column.getData()[offset + i]; + auto * bytes = reinterpret_cast(&res); + + if constexpr (std::endian::native == std::endian::little) + { + std::reverse(bytes, bytes + 8); + std::reverse(bytes + 8, bytes + 16); + } + else + { + std::swap_ranges(bytes, bytes + 8, bytes + 8); + } + + swapped_buf[i] = res; + buf[i].ptr = reinterpret_cast(&swapped_buf[i]); + } + return buf.data(); + } + + size_t fixedStringSize() { return 16; } +}; + struct ConverterFixedString { using Statistics = StatisticsFixedStringRef; @@ -1241,9 +1280,15 @@ void writeColumnChunkBody( case TypeIndex::Int128: F(Int128); break; case TypeIndex::Int256: F(Int256); break; case TypeIndex::IPv6: F(IPv6); break; - case TypeIndex::UUID: F(UUID); break; #undef F + case TypeIndex::UUID: + writeColumnImpl(s, + options, + out, + ConverterUUID(s.primitive_column)); + break; + #define D(source_type) \ writeColumnImpl( \ s, options, out, ConverterDecimal(s.primitive_column)) diff --git a/tests/queries/0_stateless/00900_long_parquet_load_2.reference b/tests/queries/0_stateless/00900_long_parquet_load_2.reference index dd5b76e84788..c4527962ac67 100644 --- a/tests/queries/0_stateless/00900_long_parquet_load_2.reference +++ 
b/tests/queries/0_stateless/00900_long_parquet_load_2.reference @@ -547,28 +547,28 @@ \N [] [] 3 15782817770995647175 === Try load data from multi_column_bf.gz.parquet -540 -62 60904 524 4294967391 13439599920079768466 false VIRE {"key":400,"value":"BNHSB"} GMKT MưFI6eĂ 1 9024461037616764855 -923 -58 46518 345 4294967327 18309803705868924873 true LJHD {"key":97,"value":"EIUNN"} WVRQ |wGZ_P 1 13244549421489669952 -988 -89 47241 756 4294967373 15522063381546272260 true CLDK {"key":604,"value":"JSQFX"} USNQ >8O;i, 1 11820769459832951331 -73 116 32428 513 4294967384 10876946806756585544 false SGPI {"key":711,"value":"XMEED"} IOIX d\rOFЂn\b 1 129876983795436829 -639 -69 45537 315 4294967315 7263280072686713102 false CHFC {"key":48,"value":"MFGFA"} ZLBS ]Ʇ̠Bʹ (LuբW 1 12009403283062034866 -959 -51 56925 86 4294967348 2903746547904555720 true XHJC {"key":355,"value":"MQETE"} QIQA /<3@C x* 1 14720674036350698175 -286 7 32408 89 4294967349 11768163425287102515 true JBWQ {"key":63,"value":"LBRXP"} IHTI 2B=f*_ 1 18315169877216931174 -815 -16 37808 207 4294967396 18058293579070949154 true AEZA {"key":656,"value":"GOJYT"} ZODP eWE> 1 1240809798326395103 -212 -74 50116 784 4294967316 4639886571322169794 true JZKC {"key":844,"value":"WYFIG"} KELP w+ʰO] 1 13188467212433715822 -510 -44 14672 697 4294967323 3203265463862679482 true QPXM {"key":408,"value":"BJCSQ"} NMGV x|A2^qm 1 1158043015045771815 -827 38 52135 612 4294967349 2177159037635926390 false KWOV {"key":384,"value":"PLUUJ"} VACB >$=L$Q\t 1 12363779168489547769 -705 -96 7506 479 4294967337 1919118169111152342 true EGEU {"key":522,"value":"AOPEL"} TZFW sIޣr^3 1 10667084832660097342 -174 -38 10608 469 4294967369 8139895279544416045 false SUED {"key":348,"value":"KDIJQ"} VKSM v3A, 1 7867030734613018460 -89 79 31144 138 4294967379 3327168023216233174 true MRTX {"key":492,"value":"NGCXD"} GVCG y>LO(xج;& 1 17782050321393165287 -802 92 55937 83 4294967368 13260877643210989136 true FNXX {"key":566,"value":"KVFDZ"} CMMV m:uL{Tzf 1 
7067095719366985926 -599 -107 36579 116 4294967311 4635153073869187643 true BURQ {"key":259,"value":"VNMQQ"} TMOJ YznM<~ $DATA_FILE + +python3 -c " +import pyarrow.feather as feather + +table = feather.read_table('$DATA_FILE') +raw_bytes = table['u'][0].as_py() + +print(raw_bytes) +" + +rm -f $DATA_FILE diff --git a/tests/queries/0_stateless/04042_arrow_uuid_import.reference b/tests/queries/0_stateless/04042_arrow_uuid_import.reference new file mode 100644 index 000000000000..4bbd907c1352 --- /dev/null +++ b/tests/queries/0_stateless/04042_arrow_uuid_import.reference @@ -0,0 +1 @@ +4d8f1ec8-bc41-4148-8069-d31c399b507b diff --git a/tests/queries/0_stateless/04042_arrow_uuid_import.sh b/tests/queries/0_stateless/04042_arrow_uuid_import.sh new file mode 100755 index 000000000000..05720ae36286 --- /dev/null +++ b/tests/queries/0_stateless/04042_arrow_uuid_import.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +DATA_FILE=$CLICKHOUSE_TEST_UNIQUE_NAME.arrow + +# Generate a strictly compliant RFC 4122 Arrow file +python3 -c " +import pyarrow as pa +import pyarrow.feather as feather +import uuid + +# python's uuid module automatically outputs Big-Endian network bytes +target_uuid = uuid.UUID('4d8f1ec8-bc41-4148-8069-d31c399b507b') + +# Create a FixedSizeBinary(16) Arrow array +arr = pa.array([target_uuid.bytes], type=pa.binary(16)) +table = pa.Table.from_arrays([arr], names=['u']) + +feather.write_feather(table, '$DATA_FILE', compression='uncompressed') +" + +$CLICKHOUSE_LOCAL -q " + DROP TABLE IF EXISTS test_arrow_import; + CREATE TABLE test_arrow_import (u UUID) ENGINE = Memory; + + -- When reading Arrow, ClickHouse will trigger our ArrowColumnToCHColumn C++ code + INSERT INTO test_arrow_import FROM INFILE '$DATA_FILE' FORMAT Arrow; + + SELECT u FROM test_arrow_import; +" + +rm -f $DATA_FILE \ No newline at end of file diff --git a/tests/queries/0_stateless/04043_parquet_uuid_inference.reference b/tests/queries/0_stateless/04043_parquet_uuid_inference.reference new file mode 100644 index 000000000000..97d2f9faafa3 --- /dev/null +++ b/tests/queries/0_stateless/04043_parquet_uuid_inference.reference @@ -0,0 +1 @@ +UUID 57033795-afc7-42d2-ae07-3943d0395dc4 diff --git a/tests/queries/0_stateless/04043_parquet_uuid_inference.sh b/tests/queries/0_stateless/04043_parquet_uuid_inference.sh new file mode 100755 index 000000000000..b4a121f91c18 --- /dev/null +++ b/tests/queries/0_stateless/04043_parquet_uuid_inference.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +DATA_FILE=$CLICKHOUSE_TEST_UNIQUE_NAME.parquet + +$CLICKHOUSE_LOCAL -q " + CREATE TABLE export_table (id UUID) ENGINE = Memory; + INSERT INTO export_table VALUES ('57033795-afc7-42d2-ae07-3943d0395dc4'); + SELECT * FROM export_table INTO OUTFILE '$DATA_FILE' FORMAT Parquet; +" + +# Must infer the schema strictly from the Parquet metadata footer +$CLICKHOUSE_LOCAL -q "SELECT toTypeName(id), id FROM file('$DATA_FILE', 'Parquet');" + +rm -f $DATA_FILE \ No newline at end of file diff --git a/tests/queries/0_stateless/04044_arrow_uuid_inference.reference b/tests/queries/0_stateless/04044_arrow_uuid_inference.reference new file mode 100644 index 000000000000..d87e08fe10eb --- /dev/null +++ b/tests/queries/0_stateless/04044_arrow_uuid_inference.reference @@ -0,0 +1 @@ +UUID ce7adebf-4abe-4d4d-836e-ec8b0ab78738 diff --git a/tests/queries/0_stateless/04044_arrow_uuid_inference.sh b/tests/queries/0_stateless/04044_arrow_uuid_inference.sh new file mode 100755 index 000000000000..5da87d729773 --- /dev/null +++ b/tests/queries/0_stateless/04044_arrow_uuid_inference.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +DATA_FILE=$CLICKHOUSE_TEST_UNIQUE_NAME.arrow + +$CLICKHOUSE_LOCAL -q " + CREATE TABLE export_table (id UUID) ENGINE = Memory; + INSERT INTO export_table VALUES ('ce7adebf-4abe-4d4d-836e-ec8b0ab78738'); + SELECT * FROM export_table INTO OUTFILE '$DATA_FILE' FORMAT Arrow; +" + +# Read without schema +$CLICKHOUSE_LOCAL -q "SELECT toTypeName(id), id FROM file('$DATA_FILE', 'Arrow');" + +rm -f $DATA_FILE \ No newline at end of file diff --git a/tests/queries/0_stateless/04045_parquet_uuid_nested_v2.reference b/tests/queries/0_stateless/04045_parquet_uuid_nested_v2.reference new file mode 100644 index 000000000000..1925896a8369 --- /dev/null +++ b/tests/queries/0_stateless/04045_parquet_uuid_nested_v2.reference @@ -0,0 +1,6 @@ +--- Read without hint --- +Nullable(UUID) 6c1799c6-1ceb-45d3-a697-61956cd5a47a Array(UUID) 0 +Nullable(UUID) \N Array(UUID) 2 +--- Read with schema hint --- +Nullable(UUID) 6c1799c6-1ceb-45d3-a697-61956cd5a47a Array(UUID) [] +Nullable(UUID) \N Array(UUID) ['b7f4341e-2cbc-489a-acd6-fae97bdc54a1','4867f717-c20e-4efc-b04b-6bf04793473f'] diff --git a/tests/queries/0_stateless/04045_parquet_uuid_nested_v2.sh b/tests/queries/0_stateless/04045_parquet_uuid_nested_v2.sh new file mode 100755 index 000000000000..2124b41a5322 --- /dev/null +++ b/tests/queries/0_stateless/04045_parquet_uuid_nested_v2.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +DATA_FILE=$CLICKHOUSE_TEST_UNIQUE_NAME.parquet + +$CLICKHOUSE_LOCAL -q " + CREATE TABLE export_table (opt_id Nullable(UUID), arr_id Array(UUID)) ENGINE = Memory; + INSERT INTO export_table VALUES (NULL, ['b7f4341e-2cbc-489a-acd6-fae97bdc54a1', '4867f717-c20e-4efc-b04b-6bf04793473f']); + INSERT INTO export_table VALUES ('6c1799c6-1ceb-45d3-a697-61956cd5a47a', []); + SELECT * FROM export_table INTO OUTFILE '$DATA_FILE' FORMAT Parquet; +" + +echo "--- Read without hint ---" +# Legacy reader (native reader v3 disabled), no structure hint: print the inferred types and length(arr_id) instead of the array values. NOTE(review): this step was described as the Array losing its UUID tag, yet the .reference expects Array(UUID) here - confirm which is right. +$CLICKHOUSE_LOCAL --input_format_parquet_use_native_reader_v3=0 -q " + SELECT toTypeName(opt_id), opt_id, toTypeName(arr_id), length(arr_id) + FROM file('$DATA_FILE', 'Parquet') ORDER BY opt_id ASC; +" + +echo "--- Read with schema hint ---" +# With an explicit structure, the legacy reader must return the original UUID values for both the scalar and the array column. +$CLICKHOUSE_LOCAL --input_format_parquet_use_native_reader_v3=0 -q " + SELECT toTypeName(opt_id), opt_id, toTypeName(arr_id), arr_id + FROM file('$DATA_FILE', 'Parquet', 'opt_id Nullable(UUID), arr_id Array(UUID)') ORDER BY opt_id ASC; +" + +rm -f "$DATA_FILE" diff --git a/tests/queries/0_stateless/04045_parquet_uuid_nested_v3.reference b/tests/queries/0_stateless/04045_parquet_uuid_nested_v3.reference new file mode 100644 index 000000000000..1f278fcfb6cd --- /dev/null +++ b/tests/queries/0_stateless/04045_parquet_uuid_nested_v3.reference @@ -0,0 +1,2 @@ +Nullable(UUID) 6c1799c6-1ceb-45d3-a697-61956cd5a47a Array(UUID) [] +Nullable(UUID) \N Array(UUID) ['b7f4341e-2cbc-489a-acd6-fae97bdc54a1','4867f717-c20e-4efc-b04b-6bf04793473f'] diff --git a/tests/queries/0_stateless/04045_parquet_uuid_nested_v3.sh b/tests/queries/0_stateless/04045_parquet_uuid_nested_v3.sh new file mode 100755 index 000000000000..0994b93e6c1b --- /dev/null +++ b/tests/queries/0_stateless/04045_parquet_uuid_nested_v3.sh @@ -0,0 +1,23 @@ 
+#!/usr/bin/env bash +# Tags: no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +DATA_FILE=$CLICKHOUSE_TEST_UNIQUE_NAME.parquet + +$CLICKHOUSE_LOCAL -q " + CREATE TABLE export_table (opt_id Nullable(UUID), arr_id Array(UUID)) ENGINE = Memory; + INSERT INTO export_table VALUES (NULL, ['b7f4341e-2cbc-489a-acd6-fae97bdc54a1', '4867f717-c20e-4efc-b04b-6bf04793473f']); + INSERT INTO export_table VALUES ('6c1799c6-1ceb-45d3-a697-61956cd5a47a', []); + SELECT * FROM export_table INTO OUTFILE '$DATA_FILE' FORMAT Parquet; +" + +# Native reader v3, no structure hint: both the Nullable(UUID) and the Array(UUID) column types must be inferred from the file +$CLICKHOUSE_LOCAL --input_format_parquet_use_native_reader_v3=1 -q " + SELECT toTypeName(opt_id), opt_id, toTypeName(arr_id), arr_id + FROM file('$DATA_FILE', 'Parquet') ORDER BY opt_id ASC; +" + +rm -f "$DATA_FILE" diff --git a/tests/queries/0_stateless/04046_arrow_uuid_nested.reference b/tests/queries/0_stateless/04046_arrow_uuid_nested.reference new file mode 100644 index 000000000000..5ae7ca59fd26 --- /dev/null +++ b/tests/queries/0_stateless/04046_arrow_uuid_nested.reference @@ -0,0 +1,2 @@ +Nullable(UUID) f803bc34-2b43-41d4-bc3d-989bb3b369e2 Array(UUID) [] +Nullable(UUID) \N Array(UUID) ['4b84c7c7-20da-4984-8208-b7f0d49897d8','56063faf-2bcc-4c9f-a273-c18d25bfc7a3'] diff --git a/tests/queries/0_stateless/04046_arrow_uuid_nested.sh b/tests/queries/0_stateless/04046_arrow_uuid_nested.sh new file mode 100755 index 000000000000..eebb3e7704bd --- /dev/null +++ b/tests/queries/0_stateless/04046_arrow_uuid_nested.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +DATA_FILE=$CLICKHOUSE_TEST_UNIQUE_NAME.arrow + +$CLICKHOUSE_LOCAL -q " + CREATE TABLE export_table (opt_id Nullable(UUID), arr_id Array(UUID)) ENGINE = Memory; + INSERT INTO export_table VALUES (NULL, ['4b84c7c7-20da-4984-8208-b7f0d49897d8', '56063faf-2bcc-4c9f-a273-c18d25bfc7a3']); + INSERT INTO export_table VALUES ('f803bc34-2b43-41d4-bc3d-989bb3b369e2', []); + SELECT * FROM export_table INTO OUTFILE '$DATA_FILE' FORMAT Arrow; +" + +# Read with explicit schema hint to bypass Arrow's nested inference limitations +$CLICKHOUSE_LOCAL -q " + SELECT toTypeName(opt_id), opt_id, toTypeName(arr_id), arr_id + FROM file('$DATA_FILE', 'Arrow', 'opt_id Nullable(UUID), arr_id Array(UUID)') ORDER BY opt_id ASC; +" + +rm -f "$DATA_FILE" diff --git a/tests/queries/0_stateless/04047_arrow_uuid_nested_reader.reference b/tests/queries/0_stateless/04047_arrow_uuid_nested_reader.reference new file mode 100644 index 000000000000..8219b3904e18 --- /dev/null +++ b/tests/queries/0_stateless/04047_arrow_uuid_nested_reader.reference @@ -0,0 +1,2 @@ +Nullable(UUID) f803bc34-2b43-41d4-bc3d-989bb3b369e2 Array(Nullable(UUID)) [] +Nullable(UUID) \N Array(Nullable(UUID)) ['4b84c7c7-20da-4984-8208-b7f0d49897d8','56063faf-2bcc-4c9f-a273-c18d25bfc7a3'] diff --git a/tests/queries/0_stateless/04047_arrow_uuid_nested_reader.sql b/tests/queries/0_stateless/04047_arrow_uuid_nested_reader.sql new file mode 100644 index 000000000000..7c5dc0b641cd --- /dev/null +++ b/tests/queries/0_stateless/04047_arrow_uuid_nested_reader.sql @@ -0,0 +1,25 @@ +-- Tags: no-fasttest + +-- HOW THIS BASE64 STRING WAS GENERATED: +-- This is an Arrow IPC file containing a top-level Nullable(UUID) +-- and a nested Array(Nullable(UUID)) using the official arrow.uuid extension type. +-- It checks that the ClickHouse reader can recursively unwrap arrow.uuid extension columns (the query passes an explicit structure, so type inference itself is not exercised). 
+-- Python recipe (needs a pyarrow release that provides pa.uuid()): +-- import pyarrow as pa, uuid, base64, io +-- uuid_type = pa.uuid() +-- schema = pa.schema([pa.field('opt_id', uuid_type, nullable=True), pa.field('arr_id', pa.list_(uuid_type), nullable=True)]) +-- uuid_1 = uuid.UUID('4b84c7c7-20da-4984-8208-b7f0d49897d8') +-- uuid_2 = uuid.UUID('56063faf-2bcc-4c9f-a273-c18d25bfc7a3') +-- uuid_3 = uuid.UUID('f803bc34-2b43-41d4-bc3d-989bb3b369e2') +-- opt_id_array = pa.ExtensionArray.from_storage(uuid_type, pa.array([None, uuid_3.bytes], type=pa.binary(16))) +-- arr_id_storage = pa.array([[uuid_1.bytes, uuid_2.bytes], []], type=pa.list_(pa.binary(16))) +-- arr_id_array = arr_id_storage.cast(pa.list_(uuid_type)) +-- table = pa.Table.from_arrays([opt_id_array, arr_id_array], schema=schema) +-- buf = io.BytesIO() +-- with pa.RecordBatchFileWriter(buf, schema) as writer: +-- writer.write_table(table) +-- print(base64.b64encode(buf.getvalue()).decode('utf-8')) + +SELECT toTypeName(opt_id), opt_id, toTypeName(arr_id), arr_id +FROM format(Arrow, 'opt_id Nullable(UUID), arr_id Array(Nullable(UUID))', 
base64Decode('QVJST1cxAAD/////8AEAABAAAAAAAAoADAAGAAUACAAKAAAAAAEEAAwAAAAIAAgAAAAEAAgAAAAEAAAAAgAAAAgBAAAUAAAAEAAUAAgABgAHAAwAAAAQABAAAAAAAAEMFAAAACAAAAAEAAAAAQAAABgAAAAGAAAAYXJyX2lkAAAEAAQABAAAAFL///8AAAEPFAAAAJgAAAAIAAAAFAAAAAAAAAAEAAAAaXRlbQAAAAACAAAAQAAAAAQAAAAI////GAAAAAQAAAAKAAAAYXJyb3cudXVpZAAAFAAAAEFSUk9XOmV4dGVuc2lvbjpuYW1lAAAAAED///8QAAAABAAAAAAAAAAAAAAAGAAAAEFSUk9XOmV4dGVuc2lvbjptZXRhZGF0YQAABgAKAAQABgAAABAAAAAAABIAGAAIAAYABwAMAAAAEAAUABIAAAAAAAEPFAAAAKAAAAAIAAAAFAAAAAAAAAAGAAAAb3B0X2lkAAACAAAASAAAAAQAAADI////GAAAAAQAAAAKAAAAYXJyb3cudXVpZAAAFAAAAEFSUk9XOmV4dGVuc2lvbjpuYW1lAAAAAAgADAAEAAgACAAAABAAAAAEAAAAAAAAAAAAAAAYAAAAQVJST1c6ZXh0ZW5zaW9uOm1ldGFkYXRhAAAGAAgABAAGAAAAEAAAAAAAAAD/////6AAAABQAAAAAAAAADAAWAAYABQAIAAwADAAAAAADBAAYAAAAWAAAAAAAAAAAAAoAGAAMAAQACAAKAAAAfAAAABAAAAACAAAAAAAAAAAAAAAGAAAAAAAAAAAAAAABAAAAAAAAAAgAAAAAAAAAIAAAAAAAAAAoAAAAAAAAAAAAAAAAAAAAKAAAAAAAAAAMAAAAAAAAADgAAAAAAAAAAAAAAAAAAAA4AAAAAAAAACAAAAAAAAAAAAAAAAMAAAACAAAAAAAAAAEAAAAAAAAAAgAAAAAAAAAAAAAAAAAAAAIAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAD4A7w0K0NB1Lw9mJuzs2niAAAAAAIAAAACAAAAAAAAAEuEx8cg2kmEggi38NSYl9hWBj+vK8xMn6JzwY0lv8ej/////wAAAAAQAAAADAAUAAYACAAMABAADAAAAAAABAA4AAAAKAAAAAQAAAABAAAAAAIAAAAAAADwAAAAAAAAAFgAAAAAAAAAAAAAAAAAAAAIAAgAAAAEAAgAAAAEAAAAAgAAAAgBAAAUAAAAEAAUAAgABgAHAAwAAAAQABAAAAAAAAEMFAAAACAAAAAEAAAAAQAAABgAAAAGAAAAYXJyX2lkAAAEAAQABAAAAFL///8AAAEPFAAAAJgAAAAIAAAAFAAAAAAAAAAEAAAAaXRlbQAAAAACAAAAQAAAAAQAAAAI////GAAAAAQAAAAKAAAAYXJyb3cudXVpZAAAFAAAAEFSUk9XOmV4dGVuc2lvbjpuYW1lAAAAAED///8QAAAABAAAAAAAAAAAAAAAGAAAAEFSUk9XOmV4dGVuc2lvbjptZXRhZGF0YQAABgAKAAQABgAAABAAAAAAABIAGAAIAAYABwAMAAAAEAAUABIAAAAAAAEPFAAAAKAAAAAIAAAAFAAAAAAAAAAGAAAAb3B0X2lkAAACAAAASAAAAAQAAADI////GAAAAAQAAAAKAAAAYXJyb3cudXVpZAAAFAAAAEFSUk9XOmV4dGVuc2lvbjpuYW1lAAAAAAgADAAEAAgACAAAABAAAAAEAAAAAAAAAAAAAAAYAAAAQVJST1c6ZXh0ZW5zaW9uOm1ldGFkYXRhAAAGAAgABAAGAAAAEAAAABgCAABBUlJPVzE=')) +ORDER BY opt_id ASC; diff --git a/tests/queries/0_stateless/04049_arrow_parquet_writer_uuid.reference 
b/tests/queries/0_stateless/04049_arrow_parquet_writer_uuid.reference new file mode 100644 index 000000000000..c0475122d427 --- /dev/null +++ b/tests/queries/0_stateless/04049_arrow_parquet_writer_uuid.reference @@ -0,0 +1 @@ +UUID 4dfdc6b0-e3f2-4649-ad36-6598aff9c482 diff --git a/tests/queries/0_stateless/04049_arrow_parquet_writer_uuid.sh b/tests/queries/0_stateless/04049_arrow_parquet_writer_uuid.sh new file mode 100755 index 000000000000..2c1bbcc679ed --- /dev/null +++ b/tests/queries/0_stateless/04049_arrow_parquet_writer_uuid.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +DATA_FILE="${CLICKHOUSE_TEST_UNIQUE_NAME}.parquet" + +# Write a single UUID with output_format_parquet_use_custom_encoder=0 (presumably the Arrow-library writer path, going by the test name - confirm). +$CLICKHOUSE_LOCAL --output_format_parquet_use_custom_encoder=0 -q " + SELECT toUUID('4dfdc6b0-e3f2-4649-ad36-6598aff9c482') AS id + INTO OUTFILE '${DATA_FILE}' FORMAT Parquet; +" + +# Read back without a structure argument: the reference expects the column to come back typed as UUID with the same value. +$CLICKHOUSE_LOCAL -q " + SELECT toTypeName(id), id + FROM file('${DATA_FILE}', Parquet); +" + +rm -f "${DATA_FILE}"