From e9e6bdeeaebd459b146296249e648bdff7a0d939 Mon Sep 17 00:00:00 2001 From: ziyangeng Date: Thu, 23 Apr 2026 20:17:34 +0800 Subject: [PATCH 1/5] feat(tools): support Parquet and Arrow import with schema/auto modes --- java/tools/README-zh.md | 195 ++++-- java/tools/README.md | 176 ++++-- java/tools/pom.xml | 78 +++ .../assembly/resources/tools/arrow2tsfile.bat | 54 ++ .../assembly/resources/tools/arrow2tsfile.sh | 59 ++ .../resources/tools/parquet2tsfile.bat | 49 ++ .../resources/tools/parquet2tsfile.sh | 53 ++ java/tools/src/assembly/tools.xml | 18 + .../tsfile/tools/ArrowSourceReader.java | 298 +++++++++ .../tsfile/tools/AutoSchemaInferer.java | 211 +++++++ .../apache/tsfile/tools/CsvSourceReader.java | 290 +++++++++ .../apache/tsfile/tools/ImportExecutor.java | 134 ++++ .../org/apache/tsfile/tools/ImportSchema.java | 247 ++++++++ .../tsfile/tools/ImportSchemaParser.java | 214 +++++++ .../tsfile/tools/ParquetSourceReader.java | 315 ++++++++++ .../org/apache/tsfile/tools/SourceBatch.java | 82 +++ .../org/apache/tsfile/tools/SourceReader.java | 47 ++ .../apache/tsfile/tools/TabletBuilder.java | 193 ++++++ .../apache/tsfile/tools/TimeConverter.java | 149 +++++ .../org/apache/tsfile/tools/TsFileTool.java | 587 ++++++----------- .../apache/tsfile/tools/ValueConverter.java | 126 ++++ .../tsfile/tools/ArrowSourceReaderTest.java | 499 +++++++++++++++ .../tsfile/tools/AutoSchemaInfererTest.java | 342 ++++++++++ .../tsfile/tools/CsvSourceReaderTest.java | 435 +++++++++++++ .../tsfile/tools/ImportExecutorTest.java | 197 ++++++ .../tsfile/tools/ImportSchemaParserTest.java | 514 +++++++++++++++ .../tsfile/tools/ParquetSourceReaderTest.java | 467 ++++++++++++++ .../tsfile/tools/TabletBuilderTest.java | 239 +++++++ .../tsfile/tools/TimeConverterTest.java | 188 ++++++ .../tsfile/tools/TsFileToolCliTest.java | 591 ++++++++++++++++++ .../apache/tsfile/tools/TsfiletoolsTest.java | 578 +++++++++++++++++ .../tsfile/tools/ValueConverterTest.java | 187 ++++++ 32 files changed, 7322 insertions(+), 490 deletions(-) create mode 100644 java/tools/src/assembly/resources/tools/arrow2tsfile.bat create mode 100644 java/tools/src/assembly/resources/tools/arrow2tsfile.sh create mode 100644 java/tools/src/assembly/resources/tools/parquet2tsfile.bat create mode 100644 java/tools/src/assembly/resources/tools/parquet2tsfile.sh create mode 100644 java/tools/src/main/java/org/apache/tsfile/tools/ArrowSourceReader.java create mode 100644 java/tools/src/main/java/org/apache/tsfile/tools/AutoSchemaInferer.java create mode 100644 java/tools/src/main/java/org/apache/tsfile/tools/CsvSourceReader.java create mode 100644 java/tools/src/main/java/org/apache/tsfile/tools/ImportExecutor.java create mode 100644 java/tools/src/main/java/org/apache/tsfile/tools/ImportSchema.java create mode 100644 java/tools/src/main/java/org/apache/tsfile/tools/ImportSchemaParser.java create mode 100644 java/tools/src/main/java/org/apache/tsfile/tools/ParquetSourceReader.java create mode 100644 java/tools/src/main/java/org/apache/tsfile/tools/SourceBatch.java create mode 100644 java/tools/src/main/java/org/apache/tsfile/tools/SourceReader.java create mode 100644 java/tools/src/main/java/org/apache/tsfile/tools/TabletBuilder.java create mode 100644 java/tools/src/main/java/org/apache/tsfile/tools/TimeConverter.java create mode 100644 java/tools/src/main/java/org/apache/tsfile/tools/ValueConverter.java create mode 100644 java/tools/src/test/java/org/apache/tsfile/tools/ArrowSourceReaderTest.java create mode 100644 
java/tools/src/test/java/org/apache/tsfile/tools/AutoSchemaInfererTest.java create mode 100644 java/tools/src/test/java/org/apache/tsfile/tools/CsvSourceReaderTest.java create mode 100644 java/tools/src/test/java/org/apache/tsfile/tools/ImportExecutorTest.java create mode 100644 java/tools/src/test/java/org/apache/tsfile/tools/ImportSchemaParserTest.java create mode 100644 java/tools/src/test/java/org/apache/tsfile/tools/ParquetSourceReaderTest.java create mode 100644 java/tools/src/test/java/org/apache/tsfile/tools/TabletBuilderTest.java create mode 100644 java/tools/src/test/java/org/apache/tsfile/tools/TimeConverterTest.java create mode 100644 java/tools/src/test/java/org/apache/tsfile/tools/TsFileToolCliTest.java create mode 100644 java/tools/src/test/java/org/apache/tsfile/tools/ValueConverterTest.java diff --git a/java/tools/README-zh.md b/java/tools/README-zh.md index e0b8d7d3a..e25e5ca0a 100644 --- a/java/tools/README-zh.md +++ b/java/tools/README-zh.md @@ -47,49 +47,39 @@ mvn install -P with-java -DskipTests ## schema 定义 -| 参数 | 说明 | 是否必填 | 默认值 | -|------------|--------------------------|------|------| -| table_name | 表名 | 是 | | -| time_precision | 时间精度(可选值有:ms/us/ns) | 否 | ms | -| has_header | 是否包含表头 (可选值有:true/false) | 否 | true | -| separator | 行内分隔符(可选值有:, /tab/ ;) | 否 | , | -| null_format | 空值 | 否 | | -| id_columns | 主键列,支持cvs中不存在的列做为层级 | 否 | | -| time_column | 时间列 | 是 | | -| csv_columns | 按照顺序与csv列一一对应 | 是 | | +### 参数 -说明: +| 参数 | 说明 | 是否必填 | 默认值 | +|------|------|---------|--------| +| table_name | 表名 | 是 | | +| time_precision | 时间精度(ms / us / ns / s) | 否 | ms | +| has_header | CSV 是否包含表头(true / false),Parquet / Arrow 忽略此项 | 否 | true | +| separator | CSV 行内分隔符(, / tab / ;),Parquet / Arrow 忽略此项 | 否 | , | +| null_format | CSV 中视为 null 的字符串,Parquet / Arrow 忽略此项(使用原生 null) | 否 | | +| tag_columns | 标签列(设备标识 / 联合主键),支持 DEFAULT 虚拟列 | 否 | | +| time_column | 时间列名称 | 是 | | +| source_columns | 源文件列定义,映射源文件中的每一列 | 是 | | -id_columns 按照顺序进行设置值,支持csv 文件中不存在的列作为层级 -例如csv 只有a,b,c,d,time五列则 -id_columns -a1 default aa -a -其中a1 不在csv列,为虚拟列,默认值为aa - -csv_columns 之后的内容为值列的定义,每一行的第一个字段为在tsfile中的测点名,第二个字段为类型 -当csv中某一列不需要写入 tsfile时,可以设置为 SKIP -例: -csv_columns -地区 TEXT, -厂号 TEXT, -设备号 TEXT, -SKIP, -SKIP, -时间 INT64, -温度 FLOAT, -排量 DOUBLE, +> **向后兼容**:`id_columns` 和 `csv_columns` 仍然可用,分别作为 `tag_columns` 和 `source_columns` 的别名。 + +### 列概念 + +- **time_column**:每个表有且仅有一个时间列,写入 TsFile 后列名固定为 `time`,类型为 `TIMESTAMP`。 +- **tag_columns**:设备标识列(联合主键),可以为 0 到多个。支持通过 `DEFAULT` 关键字定义不在源文件中的虚拟列。 +- **source_columns**:映射源文件中的所有列,CSV 按位置对应,Parquet / Arrow 按列名匹配。使用 `SKIP` 跳过不需要的列。 +- **FIELD**(推导结果,非配置项):`source_columns` 中去掉 `time_column`、`tag_columns`、`SKIP` 后的剩余列,即为测点列,其值随时间变化。 -### 数据示例 -csv 文件内容 +### Schema 示例 + +CSV 文件内容: ``` -地区, 厂号, 设备号, 型号, 维修周期, 时间, 温度, 排量 -河北, 1001, 1, 10, 1, 1, 80.0, 1000.0 -河北, 1001, 1, 10, 1, 4, 80.0, 1000.0 -河北, 1002, 7, 5, 2, 1, 90.0, 1200.0 +Region,FactoryNumber,DeviceNumber,Model,MaintenanceCycle,Time,Temperature,Emission +hebei,1001,1,10,1,1,80.0,1000.0 +hebei,1001,1,10,1,4,80.0,1000.0 +hebei,1002,7,5,2,1,90.0,1200.0 ``` -schema 定义 +Schema 文件(`import.schema`): ``` table_name=root.db1 time_precision=ms @@ -97,30 +87,127 @@ has_header=true separator=, null_format=\N +tag_columns +Group DEFAULT Datang +Region +FactoryNumber +DeviceNumber -id_columns -集团 DEFAULT 大唐 -地区 -厂号 -设备号 - -time_column=时间 +time_column=Time -csv_columns -地区 TEXT, -厂号 TEXT, -设备号 TEXT, +source_columns +Region TEXT, +FactoryNumber TEXT, +DeviceNumber TEXT, SKIP, SKIP, -时间 INT64, -温度 FLOAT, -排量 DOUBLE, +Time INT64, 
+Temperature FLOAT, +Emission DOUBLE, ``` -## 命令 +说明: +- `Group` 是虚拟标签列(不在 CSV 中),默认值为 `Datang` +- `Region`、`FactoryNumber`、`DeviceNumber` 是从 CSV 中读取的标签列 +- `Model` 和 `MaintenanceCycle` 通过 `SKIP` 跳过 +- `Temperature` 和 `Emission` 自动推导为 FIELD 列 + +Parquet / Arrow 在 schema 模式下,`source_columns` 按列**名称**匹配而非位置。也支持命名 SKIP: ``` -csv2tsfile.sh --source ./xxx/xxx --target /xxx/xxx --fail_dir /xxx/xxx -csv2tsfile.bat --source ./xxx/xxx --target /xxx/xxx --fail_dir /xxx/xxx +source_columns +Time INT64, +unused_col SKIP, +Temperature FLOAT, +Emission DOUBLE, +``` + +## 命令行参数 + +| 参数 | 说明 | 是否必填 | 默认值 | +|------|------|---------|--------| +| -s, --source | 输入文件或目录 | 是 | | +| -t, --target | 输出目录 | 是 | | +| --schema | Schema 文件路径,不传则进入 auto 模式 | 否 | | +| --fail_dir | 失败文件存放目录 | 否 | failed | +| --format | 源格式:csv / parquet / arrow,不传则按文件扩展名自动识别 | 否 | 自动识别 | +| --table_name | 表名覆盖(auto 模式) | 否 | 从文件名推导 | +| --time_precision | 时间精度覆盖(auto 模式):ms / us / ns / s | 否 | ms | +| --separator | CSV 分隔符(auto 模式):, / tab / ; | 否 | , | +| -b, --block_size | CSV 分块大小(如 256M、1G) | 否 | 256M | +| -tn, --thread_num | 并行处理线程数 | 否 | 8 | + +## 模式 + +### Schema 模式 + +传入 `--schema` 文件,显式定义列映射、类型、标签列和时间列。 + +```sh +# CSV +csv2tsfile.sh --source ./data/csv --target ./output --fail_dir ./failed --schema ./schema/import.schema +csv2tsfile.bat --source .\data\csv --target .\output --fail_dir .\failed --schema .\schema\import.schema + +# Parquet +parquet2tsfile.sh --source ./data/parquet --target ./output --fail_dir ./failed --schema ./schema/import.schema +parquet2tsfile.bat --source .\data\parquet --target .\output --fail_dir .\failed --schema .\schema\import.schema + +# Arrow +arrow2tsfile.sh --source ./data/arrow --target ./output --fail_dir ./failed --schema ./schema/import.schema +arrow2tsfile.bat --source .\data\arrow --target .\output --fail_dir .\failed --schema .\schema\import.schema ``` +### Auto 模式 + +不传 `--schema`,自动推断列类型并识别时间列。 + +**Auto 模式规则:** +- 时间列:必须严格命名为 `time` 或 `TIME`(区分大小写) +- 其余所有列自动成为 FIELD(不自动推断标签列) +- CSV 类型推断基于前 100 行采样,提升链为:`BOOLEAN → INT64 → DOUBLE → STRING` +- Parquet / Arrow 直接使用原生 schema 类型 +- 默认表名:从源文件名推导(如 `sensor.csv` → 表名 `sensor`) +- 默认 null 识别(仅 CSV):空单元格和 `\N` + +**Auto 模式示例:** + +CSV 文件(`sensor.csv`): +``` +time,temperature,humidity,status +1000,25.5,60.0,true +2000,26.1,55.3,false +3000,27.0,58.1,true +``` + +Auto 模式推断结果: +``` +表名: sensor (从文件名推导) +时间列: time +FIELD 列: temperature DOUBLE, humidity DOUBLE, status BOOLEAN +标签列: (无) +``` + +**命令:** +```sh +# CSV +csv2tsfile.sh --source ./data/csv --target ./output --fail_dir ./failed +csv2tsfile.bat --source .\data\csv --target .\output --fail_dir .\failed + +# CSV 带可选参数 +csv2tsfile.sh --source ./data/csv --target ./output --table_name my_table --separator tab --time_precision us + +# Parquet +parquet2tsfile.sh --source ./data/parquet --target ./output --fail_dir ./failed +parquet2tsfile.bat --source .\data\parquet --target .\output --fail_dir .\failed + +# Arrow(.arrow / .ipc / .feather) +arrow2tsfile.sh --source ./data/arrow --target ./output --fail_dir ./failed +arrow2tsfile.bat --source .\data\arrow --target .\output --fail_dir .\failed +``` + +### 输出文件命名 + +- 单批次:`{源文件名}.tsfile` +- 多批次:`{源文件名}_1.tsfile`、`{源文件名}_2.tsfile`、... 
+- 表名与输出文件名相互独立——表名来自 schema 或 `--table_name`,文件名来自源文件。 + diff --git a/java/tools/README.md b/java/tools/README.md index 97858fde5..3de5332bd 100644 --- a/java/tools/README.md +++ b/java/tools/README.md @@ -44,55 +44,41 @@ mvn clean package -P with-java -DskipTests mvn install -P with-java -DskipTests ``` -## schema 定义 -| Parameter | Description | Required | Default | -|----------------|--------------------------|----------|------| -| table_name | Table name | Yes | | -| time_precision | Time precision (options: ms/us/ns) | No | ms | -| has_header | Whether it contains a header (options: true/false) | No | true | -| separator | Delimiter (options: , /tab/ ;) | No | , | -| null_format | Null value | No | | -| id_columns | Primary key columns, supports columns not in the CSV as hierarchy | No | | -| time_column | Time column | Yes | | -| csv_columns | Corresponding columns in the CSV in order | Yes | | - -Explanation: - -The "id_columns" sets values in order and supports using columns that do not exist in the CSV file as levels. -For example, if the CSV file has only five columns: "a", "b", "c", "d", and "time", -id_columns -a1 default aa -a -Among them, a1 is not in the CSV column and is a virtual column with a default value of aa - -The content after csv_columns is the definition of the value column, with the first field in each row being the measurement point name in tsfile and the second field being the type -When a column in CSV does not need to be written to tsfile, it can be set to SKIP. - -Example: -csv_columns -Region TEXT, -Factory Number TEXT, -Device Number TEXT, -SKIP, -SKIP, -Time INT64, -Temperature FLOAT, -Emission DOUBLE, +## Schema Definition -Data Example -CSV file content: +### Parameters + +| Parameter | Description | Required | Default | +|-----------|------------|----------|---------| +| table_name | Table name | Yes | | +| time_precision | Time precision (ms / us / ns / s) | No | ms | +| has_header | Whether CSV contains a header (true / false). Ignored for Parquet / Arrow. | No | true | +| separator | CSV delimiter (, / tab / ;). Ignored for Parquet / Arrow. | No | , | +| null_format | String value treated as null in CSV. Ignored for Parquet / Arrow (native null). | No | | +| tag_columns | Tag columns (device identifiers / primary key). Supports virtual columns with DEFAULT value. | No | | +| time_column | Time column name | Yes | | +| source_columns | Column definitions mapping to source file columns | Yes | | + +> **Backward compatibility**: `id_columns` and `csv_columns` are still accepted as aliases for `tag_columns` and `source_columns`. -### sample data +### Column Concepts + +- **time_column**: Exactly one per table. Written as `time` column with type `TIMESTAMP` in TsFile. +- **tag_columns**: Device identifiers (composite primary key), 0 or more. Supports virtual columns not present in the source file via `DEFAULT` keyword. +- **source_columns**: Maps every column in the source file by position (CSV) or by name (Parquet / Arrow). Use `SKIP` to ignore a column. +- **FIELD** (derived, not configured): All columns in `source_columns` that are not `time_column`, not in `tag_columns`, and not `SKIP`. These are the measurement columns whose values change over time. 
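+
+For instance, in the following minimal sketch (illustrative names, not a full schema file), the derived FIELD set is just `Temperature`: `Time` is the time column, `Region` is a tag, and the bare `SKIP` entry drops the third source column.
+
+```
+tag_columns
+Region
+
+time_column=Time
+
+source_columns
+Region TEXT,
+Time INT64,
+SKIP,
+Temperature FLOAT,
+```
+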
+ +### Schema Example CSV file content: ``` Region,FactoryNumber,DeviceNumber,Model,MaintenanceCycle,Time,Temperature,Emission -hebei,1001, 1,10,1,1,80.0,1000.0 +hebei,1001,1,10,1,1,80.0,1000.0 hebei,1001,1,10,1,4,80.0,1000.0 hebei,1002,7,5,2,1,90.0,1200.0 ``` -Schema definition +Schema file (`import.schema`): ``` table_name=root.db1 time_precision=ms @@ -100,8 +86,7 @@ has_header=true separator=, null_format=\N - -id_columns +tag_columns Group DEFAULT Datang Region FactoryNumber @@ -109,8 +94,8 @@ DeviceNumber time_column=Time -csv_columns -RegionTEXT, +source_columns +Region TEXT, FactoryNumber TEXT, DeviceNumber TEXT, SKIP, @@ -119,9 +104,108 @@ Time INT64, Temperature FLOAT, Emission DOUBLE, ``` -## Commands +In this example: +- `Group` is a virtual tag column (not in CSV) with default value `Datang` +- `Region`, `FactoryNumber`, `DeviceNumber` are tag columns read from CSV +- `Model` and `MaintenanceCycle` are skipped via `SKIP` +- `Temperature` and `Emission` are automatically derived as FIELD columns + +For Parquet / Arrow in schema mode, `source_columns` matches by column **name** instead of position. Named SKIP is also supported: +``` +source_columns +Time INT64, +unused_col SKIP, +Temperature FLOAT, +Emission DOUBLE, +``` + +## CLI Parameters + +| Parameter | Description | Required | Default | +|-----------|------------|----------|---------| +| -s, --source | Input file or directory | Yes | | +| -t, --target | Output directory | Yes | | +| --schema | Schema file path. Omit for auto mode. | No | | +| --fail_dir | Directory for failed source files | No | failed | +| --format | Source format: csv / parquet / arrow. Auto-detected by file extension if omitted. | No | auto-detect | +| --table_name | Table name override (auto mode) | No | derived from filename | +| --time_precision | Time precision override (auto mode): ms / us / ns / s | No | ms | +| --separator | CSV delimiter (auto mode): , / tab / ; | No | , | +| -b, --block_size | CSV chunk size (e.g. 256M, 1G) | No | 256M | +| -tn, --thread_num | Thread count for parallel processing | No | 8 | + +## Modes + +### Schema Mode + +Provide a `--schema` file to explicitly define column mapping, types, tags, and time column. + +```sh +# CSV +csv2tsfile.sh --source ./data/csv --target ./output --fail_dir ./failed --schema ./schema/import.schema +csv2tsfile.bat --source .\data\csv --target .\output --fail_dir .\failed --schema .\schema\import.schema + +# Parquet +parquet2tsfile.sh --source ./data/parquet --target ./output --fail_dir ./failed --schema ./schema/import.schema +parquet2tsfile.bat --source .\data\parquet --target .\output --fail_dir .\failed --schema .\schema\import.schema + +# Arrow +arrow2tsfile.sh --source ./data/arrow --target ./output --fail_dir ./failed --schema ./schema/import.schema +arrow2tsfile.bat --source .\data\arrow --target .\output --fail_dir .\failed --schema .\schema\import.schema +``` + +### Auto Mode + +Omit `--schema` to automatically infer column types and detect the time column. + +**Auto mode rules:** +- Time column: must be named exactly `time` or `TIME` (case-sensitive, strict match) +- All other columns become FIELD (no tag inference) +- CSV type inference uses a 100-row sampling window with promotion chain: `BOOLEAN → INT64 → DOUBLE → STRING` +- Parquet / Arrow use native schema types directly +- Default table name: derived from source filename (e.g. 
`sensor.csv` → table `sensor`)
+- Default null tokens (CSV only): empty cell and `\N`
+
+**Auto mode example:**
+
+CSV file (`sensor.csv`):
+```
+time,temperature,humidity,status
+1000,25.5,60.0,true
+2000,26.1,55.3,false
+3000,27.0,58.1,true
+```
+
+Auto mode infers:
+```
+table name: sensor (from filename)
+time column: time
+fields: temperature DOUBLE, humidity DOUBLE, status BOOLEAN
+tags: (none)
+```
+
+**Commands:**
+```sh
+# CSV
+csv2tsfile.sh --source ./data/csv --target ./output --fail_dir ./failed
+csv2tsfile.bat --source .\data\csv --target .\output --fail_dir .\failed
+
+# CSV with options
+csv2tsfile.sh --source ./data/csv --target ./output --table_name my_table --separator tab --time_precision us
+
+# Parquet
+parquet2tsfile.sh --source ./data/parquet --target ./output --fail_dir ./failed
+parquet2tsfile.bat --source .\data\parquet --target .\output --fail_dir .\failed
+
+# Arrow (.arrow / .ipc / .feather)
+arrow2tsfile.sh --source ./data/arrow --target ./output --fail_dir ./failed
+arrow2tsfile.bat --source .\data\arrow --target .\output --fail_dir .\failed
+```
+
+### Output File Naming
+
+- Single batch: `{source_basename}.tsfile`
+- Multiple batches: `{source_basename}_1.tsfile`, `{source_basename}_2.tsfile`, ...
+- Table name and output filename are independent — table name comes from schema or `--table_name`, filename comes from source file.
+
diff --git a/java/tools/pom.xml b/java/tools/pom.xml
index 42ba58b0c..74c014b29 100644
--- a/java/tools/pom.xml
+++ b/java/tools/pom.xml
@@ -28,6 +28,9 @@
     <artifactId>tools</artifactId>
     <name>TsFile: Java: Tools</name>
+    <properties>
+        true
+    </properties>
 
     <dependencies>
         <dependency>
             <groupId>org.apache.tsfile</groupId>
@@ -52,6 +55,79 @@
             <groupId>org.slf4j</groupId>
             <artifactId>slf4j-api</artifactId>
         </dependency>
+        <dependency>
+            <groupId>org.apache.parquet</groupId>
+            <artifactId>parquet-hadoop</artifactId>
+            <version>1.14.4</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.hadoop</groupId>
+            <artifactId>hadoop-mapreduce-client-core</artifactId>
+            <version>3.3.6</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>org.apache.hadoop</groupId>
+                    <artifactId>hadoop-yarn-common</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.hadoop</groupId>
+            <artifactId>hadoop-common</artifactId>
+            <version>3.3.6</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>org.apache.hadoop</groupId>
+                    <artifactId>hadoop-auth</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.apache.curator</groupId>
+                    <artifactId>*</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.apache.zookeeper</groupId>
+                    <artifactId>*</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.apache.kerby</groupId>
+                    <artifactId>*</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.eclipse.jetty</groupId>
+                    <artifactId>*</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>javax.servlet</groupId>
+                    <artifactId>*</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>com.sun.jersey</groupId>
+                    <artifactId>*</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>ch.qos.reload4j</groupId>
+                    <artifactId>*</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.slf4j</groupId>
+                    <artifactId>slf4j-reload4j</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>dnsjava</groupId>
+                    <artifactId>*</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.arrow</groupId>
+            <artifactId>arrow-vector</artifactId>
+            <version>15.0.2</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.arrow</groupId>
+            <artifactId>arrow-memory-unsafe</artifactId>
+            <version>15.0.2</version>
+        </dependency>
         <dependency>
             <groupId>junit</groupId>
             <artifactId>junit</artifactId>
@@ -67,6 +143,8 @@
                 ch.qos.logback:logback-classic
+                org.apache.hadoop:hadoop-common
+                org.apache.arrow:arrow-memory-unsafe
diff --git a/java/tools/src/assembly/resources/tools/arrow2tsfile.bat b/java/tools/src/assembly/resources/tools/arrow2tsfile.bat
new file mode 100644
index 000000000..e096697fb
--- /dev/null
+++ b/java/tools/src/assembly/resources/tools/arrow2tsfile.bat
@@ -0,0 +1,54 @@
+@REM
+@REM Licensed to the Apache Software Foundation (ASF) under one
+@REM or more contributor license agreements. See the NOTICE file
+@REM distributed with this work for additional information
+@REM regarding copyright ownership. The ASF licenses this file
+@REM to you under the Apache License, Version 2.0 (the
+@REM "License"); you may not use this file except in compliance
+@REM with the License. You may obtain a copy of the License at
+@REM
+@REM     http://www.apache.org/licenses/LICENSE-2.0
+@REM
+@REM Unless required by applicable law or agreed to in writing,
+@REM software distributed under the License is distributed on an
+@REM "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+@REM KIND, either express or implied.
See the License for the +@REM specific language governing permissions and limitations +@REM under the License. +@REM + +@echo off +setlocal enabledelayedexpansion + +if "%OS%" == "Windows_NT" setlocal + +pushd %~dp0.. +if NOT DEFINED TSFILE_HOME set TSFILE_HOME=%CD% +popd + +set JAVA_OPTS=-ea^ + -DTSFILE_HOME="%TSFILE_HOME%" + +if NOT DEFINED JAVA_HOME goto :err + +echo ------------------------------------------ +echo Starting Arrow to TsFile Script +echo ------------------------------------------ + +set CLASSPATH="%TSFILE_HOME%\lib\*" +if NOT DEFINED MAIN_CLASS set MAIN_CLASS=org.apache.tsfile.tools.TsFileTool + +set TSFILE_CONF=%TSFILE_HOME%\conf +set "tsfile_params=-Dlogback.configurationFile=!TSFILE_CONF!\logback-cvs2tsfile.xml" + +set ARROW_OPTS= +"%JAVA_HOME%\bin\java" --add-opens=java.base/java.nio=ALL-UNNAMED -version >nul 2>&1 +if not errorlevel 1 set ARROW_OPTS=--add-opens=java.base/java.nio=ALL-UNNAMED + +start /B /WAIT "" cmd /C "("%JAVA_HOME%\bin\java" -DTSFILE_HOME=!TSFILE_HOME! !tsfile_params! !ARROW_OPTS! !JAVA_OPTS! -cp !CLASSPATH! !MAIN_CLASS! --format arrow %*)" +exit /b + +:err +echo JAVA_HOME environment variable must be set! +set ret_code=1 +exit /b diff --git a/java/tools/src/assembly/resources/tools/arrow2tsfile.sh b/java/tools/src/assembly/resources/tools/arrow2tsfile.sh new file mode 100644 index 000000000..1a68e875b --- /dev/null +++ b/java/tools/src/assembly/resources/tools/arrow2tsfile.sh @@ -0,0 +1,59 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +echo ------------------------------------------ +echo Starting Arrow to TsFile Script +echo ------------------------------------------ + +if [ -z "${TSFILE_HOME}" ]; then + export TSFILE_HOME="$(cd "`dirname "$0"`"/..; pwd)" +fi + +if [ -n "$JAVA_HOME" ]; then + for java in "$JAVA_HOME"/bin/amd64/java "$JAVA_HOME"/bin/java; do + if [ -x "$java" ]; then + JAVA="$java" + break + fi + done +else + JAVA=java +fi + +if [ -z $JAVA ] ; then + echo Unable to find java executable. Check JAVA_HOME and PATH environment variables. > /dev/stderr + exit 1; +fi + + +CLASSPATH=${TSFILE_HOME}/lib/* + +MAIN_CLASS=org.apache.tsfile.tools.TsFileTool + +TSFILE_CONF=${TSFILE_HOME}/conf +tsfile_params="-Dlogback.configurationFile=${TSFILE_CONF}/logback-cvs2tsfile.xml" + +ARROW_OPTS="" +java_major=$("$JAVA" -version 2>&1 | head -1 | cut -d'"' -f2 | cut -d'.' 
-f1) +if [ "$java_major" -ge 9 ] 2>/dev/null; then + ARROW_OPTS="--add-opens=java.base/java.nio=ALL-UNNAMED" +fi + +exec "$JAVA" -DTSFILE_HOME=${TSFILE_HOME} $tsfile_params $ARROW_OPTS -cp "$CLASSPATH" "$MAIN_CLASS" --format arrow "$@" diff --git a/java/tools/src/assembly/resources/tools/parquet2tsfile.bat b/java/tools/src/assembly/resources/tools/parquet2tsfile.bat new file mode 100644 index 000000000..241397885 --- /dev/null +++ b/java/tools/src/assembly/resources/tools/parquet2tsfile.bat @@ -0,0 +1,49 @@ +@REM +@REM Licensed to the Apache Software Foundation (ASF) under one +@REM or more contributor license agreements. See the NOTICE file +@REM distributed with this work for additional information +@REM regarding copyright ownership. The ASF licenses this file +@REM to you under the Apache License, Version 2.0 (the +@REM "License"); you may not use this file except in compliance +@REM with the License. You may obtain a copy of the License at +@REM +@REM http://www.apache.org/licenses/LICENSE-2.0 +@REM +@REM Unless required by applicable law or agreed to in writing, +@REM software distributed under the License is distributed on an +@REM "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +@REM KIND, either express or implied. See the License for the +@REM specific language governing permissions and limitations +@REM under the License. +@REM + +@echo off +setlocal enabledelayedexpansion + +if "%OS%" == "Windows_NT" setlocal + +pushd %~dp0.. +if NOT DEFINED TSFILE_HOME set TSFILE_HOME=%CD% +popd + +set JAVA_OPTS=-ea^ + -DTSFILE_HOME="%TSFILE_HOME%" + +if NOT DEFINED JAVA_HOME goto :err + +echo ------------------------------------------ +echo Starting Parquet to TsFile Script +echo ------------------------------------------ + +set CLASSPATH="%TSFILE_HOME%\lib\*" +if NOT DEFINED MAIN_CLASS set MAIN_CLASS=org.apache.tsfile.tools.TsFileTool + +set TSFILE_CONF=%TSFILE_HOME%\conf +set "tsfile_params=-Dlogback.configurationFile=!TSFILE_CONF!\logback-cvs2tsfile.xml" +start /B /WAIT "" cmd /C "("%JAVA_HOME%\bin\java" -DTSFILE_HOME=!TSFILE_HOME! !tsfile_params! !JAVA_OPTS! -cp !CLASSPATH! !MAIN_CLASS! --format parquet %*)" +exit /b + +:err +echo JAVA_HOME environment variable must be set! +set ret_code=1 +exit /b diff --git a/java/tools/src/assembly/resources/tools/parquet2tsfile.sh b/java/tools/src/assembly/resources/tools/parquet2tsfile.sh new file mode 100644 index 000000000..51bfa3428 --- /dev/null +++ b/java/tools/src/assembly/resources/tools/parquet2tsfile.sh @@ -0,0 +1,53 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+#
+
+echo ------------------------------------------
+echo Starting Parquet to TsFile Script
+echo ------------------------------------------
+
+if [ -z "${TSFILE_HOME}" ]; then
+  export TSFILE_HOME="$(cd "`dirname "$0"`"/..; pwd)"
+fi
+
+if [ -n "$JAVA_HOME" ]; then
+    for java in "$JAVA_HOME"/bin/amd64/java "$JAVA_HOME"/bin/java; do
+        if [ -x "$java" ]; then
+            JAVA="$java"
+            break
+        fi
+    done
+else
+    JAVA=java
+fi
+
+if [ -z $JAVA ] ; then
+    echo Unable to find java executable. Check JAVA_HOME and PATH environment variables. > /dev/stderr
+    exit 1;
+fi
+
+
+CLASSPATH=${TSFILE_HOME}/lib/*
+
+MAIN_CLASS=org.apache.tsfile.tools.TsFileTool
+
+TSFILE_CONF=${TSFILE_HOME}/conf
+tsfile_params="-Dlogback.configurationFile=${TSFILE_CONF}/logback-cvs2tsfile.xml"
+
+exec "$JAVA" -DTSFILE_HOME=${TSFILE_HOME} $tsfile_params -cp "$CLASSPATH" "$MAIN_CLASS" --format parquet "$@"
diff --git a/java/tools/src/assembly/tools.xml b/java/tools/src/assembly/tools.xml
index d03bca136..31e9c81a0 100644
--- a/java/tools/src/assembly/tools.xml
+++ b/java/tools/src/assembly/tools.xml
@@ -51,5 +51,23 @@
             <source>${maven.multiModuleProjectDirectory}/java/tools/src/assembly/resources/tools/csv2tsfile.bat</source>
             <destName>tools/csv2tsfile.bat</destName>
         </file>
+        <file>
+            <source>${maven.multiModuleProjectDirectory}/java/tools/src/assembly/resources/tools/parquet2tsfile.sh</source>
+            <destName>tools/parquet2tsfile.sh</destName>
+            <fileMode>0755</fileMode>
+        </file>
+        <file>
+            <source>${maven.multiModuleProjectDirectory}/java/tools/src/assembly/resources/tools/parquet2tsfile.bat</source>
+            <destName>tools/parquet2tsfile.bat</destName>
+        </file>
+        <file>
+            <source>${maven.multiModuleProjectDirectory}/java/tools/src/assembly/resources/tools/arrow2tsfile.sh</source>
+            <destName>tools/arrow2tsfile.sh</destName>
+            <fileMode>0755</fileMode>
+        </file>
+        <file>
+            <source>${maven.multiModuleProjectDirectory}/java/tools/src/assembly/resources/tools/arrow2tsfile.bat</source>
+            <destName>tools/arrow2tsfile.bat</destName>
+        </file>
diff --git a/java/tools/src/main/java/org/apache/tsfile/tools/ArrowSourceReader.java b/java/tools/src/main/java/org/apache/tsfile/tools/ArrowSourceReader.java
new file mode 100644
index 000000000..9a9bc4441
--- /dev/null
+++ b/java/tools/src/main/java/org/apache/tsfile/tools/ArrowSourceReader.java
@@ -0,0 +1,298 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.tsfile.tools;
+
+import org.apache.tsfile.enums.TSDataType;
+
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.memory.RootAllocator;
+import org.apache.arrow.vector.BigIntVector;
+import org.apache.arrow.vector.BitVector;
+import org.apache.arrow.vector.FieldVector;
+import org.apache.arrow.vector.Float4Vector;
+import org.apache.arrow.vector.Float8Vector;
+import org.apache.arrow.vector.IntVector;
+import org.apache.arrow.vector.VarBinaryVector;
+import org.apache.arrow.vector.VarCharVector;
+import org.apache.arrow.vector.VectorSchemaRoot;
+import org.apache.arrow.vector.ipc.ArrowFileReader;
+import org.apache.arrow.vector.ipc.message.ArrowBlock;
+import org.apache.arrow.vector.types.pojo.ArrowType;
+import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.types.pojo.Schema;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+public class ArrowSourceReader implements SourceReader {
+
+  private static final Logger LOGGER = LoggerFactory.getLogger(ArrowSourceReader.class);
+
+  private final File sourceFile;
+  private ImportSchema schema;
+  private BufferAllocator allocator;
+  private ArrowFileReader arrowReader;
+  private Schema arrowSchema;
+  private List<ArrowBlock> recordBatches;
+  private int currentBatchIndex;
+  private boolean exhausted;
+
+  private String overrideTableName;
+  private String overrideTimePrecision;
+
+  public ArrowSourceReader(File sourceFile, ImportSchema schema) {
+    this.sourceFile = sourceFile;
+    this.schema = schema;
+    this.exhausted = false;
+    this.currentBatchIndex = 0;
+  }
+
+  public ArrowSourceReader(File sourceFile) {
+    this.sourceFile = sourceFile;
+    this.schema = null;
+    this.exhausted = false;
+    this.currentBatchIndex = 0;
+  }
+
+  public void setOverrideTableName(String tableName) {
+    this.overrideTableName = tableName;
+  }
+
+  public void setOverrideTimePrecision(String timePrecision) {
+    this.overrideTimePrecision = timePrecision;
+  }
+
+  @Override
+  public ImportSchema inferSchema() {
+    if (schema != null) {
+      throw new UnsupportedOperationException("inferSchema() is only available in auto mode");
+    }
+
+    try {
+      ensureReaderOpen();
+
+      List<String> columnNames = new ArrayList<>();
+      List<TSDataType> columnTypes = new ArrayList<>();
+      String detectedTimePrecision = null;
+
+      for (Field field : arrowSchema.getFields()) {
+        String name = field.getName();
+        columnNames.add(name);
+        TSDataType tsType = mapArrowType(field.getType());
+        columnTypes.add(tsType);
+
+        if (("time".equals(name) || "TIME".equals(name)) && detectedTimePrecision == null) {
+          detectedTimePrecision = detectTimestampPrecision(field.getType());
+        }
+      }
+
+      String timeColumn = AutoSchemaInferer.detectTimeColumn(columnNames);
+      TSDataType[] types = columnTypes.toArray(new TSDataType[0]);
+
+      String tableName =
+          overrideTableName != null
+              ? overrideTableName
+              : AutoSchemaInferer.deriveTableName(sourceFile.getName(), "arrow_data");
+
+      String timePrecision;
+      if (overrideTimePrecision != null) {
+        timePrecision = overrideTimePrecision;
+      } else if (detectedTimePrecision != null) {
+        timePrecision = detectedTimePrecision;
+      } else {
+        timePrecision = "ms";
+      }
+
+      schema =
+          AutoSchemaInferer.buildAutoSchema(
+              tableName, timeColumn, columnNames, types, timePrecision);
+      return schema;
+    } catch (IOException e) {
+      throw new RuntimeException("Failed to infer schema from: " + sourceFile.getAbsolutePath(), e);
+    }
+  }
+
+  @Override
+  public SourceBatch readBatch() {
+    if (exhausted) {
+      return null;
+    }
+
+    try {
+      ensureReaderOpen();
+
+      if (currentBatchIndex >= recordBatches.size()) {
+        exhausted = true;
+        return null;
+      }
+
+      arrowReader.loadRecordBatch(recordBatches.get(currentBatchIndex));
+      currentBatchIndex++;
+
+      VectorSchemaRoot root = arrowReader.getVectorSchemaRoot();
+      int rowCount = root.getRowCount();
+      if (rowCount == 0) {
+        if (currentBatchIndex >= recordBatches.size()) {
+          exhausted = true;
+          return null;
+        }
+        return readBatch();
+      }
+
+      List<String> schemaColumnNames = getSchemaColumnNames();
+      Map<String, FieldVector> vectorMap = new HashMap<>();
+      for (FieldVector vec : root.getFieldVectors()) {
+        vectorMap.put(vec.getName(), vec);
+      }
+
+      int numCols = schemaColumnNames.size();
+      List<Object[]> rows = new ArrayList<>(rowCount);
+
+      for (int r = 0; r < rowCount; r++) {
+        Object[] row = new Object[numCols];
+        for (int c = 0; c < numCols; c++) {
+          String colName = schemaColumnNames.get(c);
+          FieldVector vec = vectorMap.get(colName);
+          if (vec == null || vec.isNull(r)) {
+            row[c] = null;
+          } else {
+            row[c] = extractValue(vec, r);
+          }
+        }
+        rows.add(row);
+      }
+
+      return SourceBatch.fromRows(schemaColumnNames, rows);
+    } catch (IOException e) {
+      LOGGER.error("Error reading Arrow file: " + sourceFile.getAbsolutePath(), e);
+      exhausted = true;
+      return null;
+    }
+  }
+
+  @Override
+  public void close() {
+    if (arrowReader != null) {
+      try {
+        arrowReader.close();
+      } catch (IOException e) {
+        LOGGER.error("Error closing Arrow reader", e);
+      }
+      arrowReader = null;
+    }
+    if (allocator != null) {
+      allocator.close();
+      allocator = null;
+    }
+  }
+
+  private void ensureReaderOpen() throws IOException {
+    if (arrowReader == null) {
+      allocator = new RootAllocator();
+      arrowReader = new ArrowFileReader(new FileInputStream(sourceFile).getChannel(), allocator);
+      arrowSchema = arrowReader.getVectorSchemaRoot().getSchema();
+      recordBatches = arrowReader.getRecordBlocks();
+    }
+  }
+
+  private List<String> getSchemaColumnNames() {
+    List<String> names = new ArrayList<>();
+    for (ImportSchema.SourceColumn col : schema.getSourceColumns()) {
+      if (!col.isSkip()) {
+        names.add(col.getName());
+      }
+    }
+    return names;
+  }
+
+  private Object extractValue(FieldVector vec, int row) {
+    if (vec instanceof BigIntVector) {
+      return ((BigIntVector) vec).get(row);
+    } else if (vec instanceof IntVector) {
+      return ((IntVector) vec).get(row);
+    } else if (vec instanceof Float4Vector) {
+      return ((Float4Vector) vec).get(row);
+    } else if (vec instanceof Float8Vector) {
+      return ((Float8Vector) vec).get(row);
+    } else if (vec instanceof BitVector) {
+      return ((BitVector) vec).get(row) != 0;
+    } else if (vec instanceof VarCharVector) {
+      byte[] bytes = ((VarCharVector) vec).get(row);
+      return new String(bytes, StandardCharsets.UTF_8);
+    } else if (vec instanceof VarBinaryVector) {
+      return ((VarBinaryVector) vec).get(row);
+    } else {
+      Object obj = vec.getObject(row);
+      return obj != null ? obj.toString() : null;
+    }
+  }
+
+  static TSDataType mapArrowType(ArrowType type) {
+    if (type instanceof ArrowType.Int) {
+      int bitWidth = ((ArrowType.Int) type).getBitWidth();
+      return bitWidth <= 32 ? TSDataType.INT32 : TSDataType.INT64;
+    } else if (type instanceof ArrowType.FloatingPoint) {
+      switch (((ArrowType.FloatingPoint) type).getPrecision()) {
+        case SINGLE:
+          return TSDataType.FLOAT;
+        case DOUBLE:
+          return TSDataType.DOUBLE;
+        default:
+          return TSDataType.DOUBLE;
+      }
+    } else if (type instanceof ArrowType.Bool) {
+      return TSDataType.BOOLEAN;
+    } else if (type instanceof ArrowType.Utf8 || type instanceof ArrowType.LargeUtf8) {
+      return TSDataType.STRING;
+    } else if (type instanceof ArrowType.Binary || type instanceof ArrowType.LargeBinary) {
+      return TSDataType.BLOB;
+    } else if (type instanceof ArrowType.Timestamp) {
+      return TSDataType.INT64;
+    } else if (type instanceof ArrowType.Date) {
+      return TSDataType.DATE;
+    }
+    return TSDataType.STRING;
+  }
+
+  static String detectTimestampPrecision(ArrowType type) {
+    if (type instanceof ArrowType.Timestamp) {
+      switch (((ArrowType.Timestamp) type).getUnit()) {
+        case MILLISECOND:
+          return "ms";
+        case MICROSECOND:
+          return "us";
+        case NANOSECOND:
+          return "ns";
+        case SECOND:
+          return "s";
+        default:
+          return null;
+      }
+    }
+    return null;
+  }
+}
diff --git a/java/tools/src/main/java/org/apache/tsfile/tools/AutoSchemaInferer.java b/java/tools/src/main/java/org/apache/tsfile/tools/AutoSchemaInferer.java
new file mode 100644
index 000000000..b9f41610a
--- /dev/null
+++ b/java/tools/src/main/java/org/apache/tsfile/tools/AutoSchemaInferer.java
@@ -0,0 +1,211 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.tsfile.tools;
+
+import org.apache.tsfile.enums.TSDataType;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+public class AutoSchemaInferer {
+
+  static final int DEFAULT_SAMPLE_SIZE = 100;
+  static final Set<String> DEFAULT_CSV_NULL_TOKENS =
+      Collections.unmodifiableSet(new HashSet<>(Arrays.asList("", "\\N")));
+
+  private static final Pattern INTEGER_PATTERN = Pattern.compile("-?\\d+");
+  private static final Pattern DECIMAL_PATTERN = Pattern.compile("-?\\d+\\.\\d*|-?\\.\\d+");
+
+  enum InferredType {
+    UNKNOWN,
+    BOOLEAN,
+    INT64,
+    DOUBLE,
+    STRING
+  }
+
+  public static String detectTimeColumn(List<String> columnNames) {
+    List<String> matches = new ArrayList<>();
+    for (String name : columnNames) {
+      if ("time".equals(name) || "TIME".equals(name)) {
+        matches.add(name);
+      }
+    }
+    if (matches.isEmpty()) {
+      throw new IllegalArgumentException(
+          "No time column found. Auto mode requires exactly one column named 'time' or 'TIME'.");
+    }
+    if (matches.size() > 1) {
+      throw new IllegalArgumentException(
+          "Ambiguous time column: found multiple columns matching 'time'/'TIME': " + matches);
+    }
+    return matches.get(0);
+  }
+
+  public static TSDataType[] inferColumnTypes(
+      List<String> columnNames,
+      List<Object[]> sampleRows,
+      String timeColumnName,
+      Set<String> nullTokens) {
+
+    int numCols = columnNames.size();
+    InferredType[] types = new InferredType[numCols];
+    Arrays.fill(types, InferredType.UNKNOWN);
+
+    for (Object[] row : sampleRows) {
+      for (int i = 0; i < numCols && i < row.length; i++) {
+        if (columnNames.get(i).equals(timeColumnName)) {
+          continue;
+        }
+        Object val = row[i];
+        if (val == null) {
+          continue;
+        }
+        String str = val.toString();
+        if (nullTokens.contains(str)) {
+          continue;
+        }
+        if (str.trim().isEmpty()) {
+          continue;
+        }
+        InferredType cellType = classifyCell(str);
+        types[i] = promote(types[i], cellType);
+      }
+    }
+
+    TSDataType[] result = new TSDataType[numCols];
+    for (int i = 0; i < numCols; i++) {
+      if (columnNames.get(i).equals(timeColumnName)) {
+        result[i] = TSDataType.INT64;
+      } else {
+        result[i] = toTSDataType(types[i]);
+      }
+    }
+    return result;
+  }
+
+  public static String deriveTableName(String fileName, String formatDefaultName) {
+    String name = removeExtension(fileName);
+    name = name.trim();
+    name = name.replaceAll("[^a-zA-Z0-9_.]", "_");
+    name = name.replaceAll("_+", "_");
+    name = name.replaceAll("^_+|_+$", "");
+    if (name.isEmpty()) {
+      return formatDefaultName;
+    }
+    if (Character.isDigit(name.charAt(0))) {
+      return "t_" + name;
+    }
+    return name;
+  }
+
+  public static ImportSchema buildAutoSchema(
+      String tableName,
+      String timeColumnName,
+      List<String> columnNames,
+      TSDataType[] columnTypes,
+      String timePrecision) {
+
+    ImportSchema schema = new ImportSchema();
+    schema.setTableName(tableName);
+    schema.setTimeColumnName(timeColumnName);
+    schema.setTimePrecision(timePrecision != null ? timePrecision : "ms");
+    schema.setTagColumns(new ArrayList<>());
+    schema.setHasHeader(true);
+
+    List<ImportSchema.SourceColumn> sourceColumns = new ArrayList<>();
+    for (int i = 0; i < columnNames.size(); i++) {
+      sourceColumns.add(new ImportSchema.SourceColumn(columnNames.get(i), columnTypes[i]));
+    }
+    schema.setSourceColumns(sourceColumns);
+
+    return schema;
+  }
+
+  static InferredType classifyCell(String value) {
+    String lower = value.toLowerCase();
+    if ("true".equals(lower) || "false".equals(lower)) {
+      return InferredType.BOOLEAN;
+    }
+    if (INTEGER_PATTERN.matcher(value).matches()) {
+      return InferredType.INT64;
+    }
+    if (DECIMAL_PATTERN.matcher(value).matches()) {
+      return InferredType.DOUBLE;
+    }
+    return InferredType.STRING;
+  }
+
+  static InferredType promote(InferredType current, InferredType incoming) {
+    if (current == InferredType.UNKNOWN) {
+      return incoming;
+    }
+    if (incoming == InferredType.UNKNOWN) {
+      return current;
+    }
+    if (current == incoming) {
+      return current;
+    }
+    if (current == InferredType.BOOLEAN || incoming == InferredType.BOOLEAN) {
+      return InferredType.STRING;
+    }
+    if ((current == InferredType.INT64 && incoming == InferredType.DOUBLE)
+        || (current == InferredType.DOUBLE && incoming == InferredType.INT64)) {
+      return InferredType.DOUBLE;
+    }
+    return InferredType.STRING;
+  }
+
+  static TSDataType toTSDataType(InferredType type) {
+    switch (type) {
+      case BOOLEAN:
+        return TSDataType.BOOLEAN;
+      case INT64:
+        return TSDataType.INT64;
+      case DOUBLE:
+        return TSDataType.DOUBLE;
+      case STRING:
+        return TSDataType.STRING;
+      case UNKNOWN:
+        return TSDataType.STRING;
+      default:
+        return TSDataType.STRING;
+    }
+  }
+
+  private static String removeExtension(String fileName) {
+    String[] extensions = {".csv", ".parquet", ".arrow", ".ipc", ".feather"};
+    String lower = fileName.toLowerCase();
+    for (String ext : extensions) {
+      if (lower.endsWith(ext)) {
+        return fileName.substring(0, fileName.length() - ext.length());
+      }
+    }
+    int dot = fileName.lastIndexOf('.');
+    if (dot > 0) {
+      return fileName.substring(0, dot);
+    }
+    return fileName;
+  }
+}
diff --git a/java/tools/src/main/java/org/apache/tsfile/tools/CsvSourceReader.java b/java/tools/src/main/java/org/apache/tsfile/tools/CsvSourceReader.java
new file mode 100644
index 000000000..4db24bba6
--- /dev/null
+++ b/java/tools/src/main/java/org/apache/tsfile/tools/CsvSourceReader.java
@@ -0,0 +1,290 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.tsfile.tools;
+
+import org.apache.tsfile.enums.TSDataType;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.util.ArrayList;
+import java.util.List;
+
+public class CsvSourceReader implements SourceReader {
+
+  private static final Logger LOGGER = LoggerFactory.getLogger(CsvSourceReader.class);
+  private static final long DEFAULT_CHUNK_SIZE = 256L * 1024 * 1024;
+
+  private final File sourceFile;
+  private ImportSchema schema;
+  private final long chunkSizeBytes;
+  private final String separator;
+
+  private BufferedReader reader;
+  private String[] columnNames;
+  private boolean headerConsumed;
+  private boolean exhausted;
+
+  private List<Object[]> bufferedSampleRows;
+  private String overrideTableName;
+  private String overrideTimePrecision;
+
+  public CsvSourceReader(File sourceFile, ImportSchema schema) {
+    this(sourceFile, schema, DEFAULT_CHUNK_SIZE);
+  }
+
+  public CsvSourceReader(File sourceFile, ImportSchema schema, long chunkSizeBytes) {
+    this.sourceFile = sourceFile;
+    this.schema = schema;
+    this.chunkSizeBytes = chunkSizeBytes;
+    this.separator = schema.getSeparator();
+    this.headerConsumed = false;
+    this.exhausted = false;
+  }
+
+  public CsvSourceReader(File sourceFile, String separator) {
+    this(sourceFile, separator, DEFAULT_CHUNK_SIZE);
+  }
+
+  public CsvSourceReader(File sourceFile, String separator, long chunkSizeBytes) {
+    this.sourceFile = sourceFile;
+    this.schema = null;
+    this.chunkSizeBytes = chunkSizeBytes;
+    this.separator = separator != null ? separator : ",";
+    this.headerConsumed = false;
+    this.exhausted = false;
+  }
+
+  public void setOverrideTableName(String tableName) {
+    this.overrideTableName = tableName;
+  }
+
+  public void setOverrideTimePrecision(String timePrecision) {
+    this.overrideTimePrecision = timePrecision;
+  }
+
+  @Override
+  public ImportSchema inferSchema() {
+    if (schema != null) {
+      throw new UnsupportedOperationException(
+          "inferSchema() is only available in auto mode (no schema provided)");
+    }
+
+    try {
+      ensureReaderOpen();
+
+      String headerLine = reader.readLine();
+      if (headerLine == null) {
+        throw new IllegalArgumentException("CSV file is empty: " + sourceFile.getAbsolutePath());
+      }
+      columnNames = splitLine(headerLine);
+      headerConsumed = true;
+
+      List<String> colNameList = new ArrayList<>(columnNames.length);
+      for (String name : columnNames) {
+        colNameList.add(name);
+      }
+
+      bufferedSampleRows = new ArrayList<>();
+      for (int i = 0; i < AutoSchemaInferer.DEFAULT_SAMPLE_SIZE; i++) {
+        String line = reader.readLine();
+        if (line == null) {
+          exhausted = true;
+          break;
+        }
+        bufferedSampleRows.add(parseLineAutoMode(line));
+      }
+
+      String timeColumn = AutoSchemaInferer.detectTimeColumn(colNameList);
+      TSDataType[] types =
+          AutoSchemaInferer.inferColumnTypes(
+              colNameList,
+              bufferedSampleRows,
+              timeColumn,
+              AutoSchemaInferer.DEFAULT_CSV_NULL_TOKENS);
+
+      String tableName =
+          overrideTableName != null
+              ? overrideTableName
+              : AutoSchemaInferer.deriveTableName(sourceFile.getName(), "csv_data");
+      String timePrecision = overrideTimePrecision != null ? overrideTimePrecision : "ms";
+
+      schema =
+          AutoSchemaInferer.buildAutoSchema(
+              tableName, timeColumn, colNameList, types, timePrecision);
+      schema.setNullFormat("\\N");
+
+      return schema;
+    } catch (IOException e) {
+      throw new RuntimeException("Failed to infer schema from: " + sourceFile.getAbsolutePath(), e);
+    }
+  }
+
+  @Override
+  public SourceBatch readBatch() {
+    boolean hasBuffered = bufferedSampleRows != null && !bufferedSampleRows.isEmpty();
+    if (exhausted && !hasBuffered) {
+      return null;
+    }
+
+    try {
+      ensureReaderOpen();
+
+      if (schema.isHasHeader() && !headerConsumed) {
+        String headerLine = reader.readLine();
+        if (headerLine == null) {
+          exhausted = true;
+          return null;
+        }
+        columnNames = splitLine(headerLine);
+        validateColumnCount();
+        headerConsumed = true;
+      } else if (!headerConsumed) {
+        columnNames = buildColumnNamesFromSchema();
+        headerConsumed = true;
+      }
+
+      List<Object[]> rows = new ArrayList<>();
+      long currentSize = 0;
+
+      if (hasBuffered) {
+        rows.addAll(bufferedSampleRows);
+        bufferedSampleRows = null;
+      }
+
+      if (!exhausted) {
+        String line;
+        while ((line = reader.readLine()) != null) {
+          byte[] lineBytes = line.getBytes(StandardCharsets.UTF_8);
+          long lineSize = lineBytes.length;
+
+          if (currentSize > 0 && currentSize + lineSize > chunkSizeBytes) {
+            rows.add(parseLine(line));
+            return buildBatch(rows);
+          }
+
+          rows.add(parseLine(line));
+          currentSize += lineSize;
+        }
+        exhausted = true;
+      }
+
+      if (rows.isEmpty()) {
+        return null;
+      }
+      return buildBatch(rows);
+
+    } catch (IOException e) {
+      LOGGER.error("Error reading CSV file: " + sourceFile.getAbsolutePath(), e);
+      exhausted = true;
+      return null;
+    }
+  }
+
+  @Override
+  public void close() {
+    if (reader != null) {
+      try {
+        reader.close();
+      } catch (IOException e) {
+        LOGGER.error("Error closing CSV reader", e);
+      }
+      reader = null;
+    }
+  }
+
+  private void ensureReaderOpen() throws IOException {
+    if (reader == null) {
+      reader =
+          new BufferedReader(
+              new InputStreamReader(
+                  Files.newInputStream(sourceFile.toPath()), StandardCharsets.UTF_8));
+    }
+  }
+
+  private String[] splitLine(String line) {
+    return line.split(separator, -1);
+  }
+
+  private Object[] parseLine(String line) {
+    String[] parts = splitLine(line);
+    Object[] row = new Object[columnNames.length];
+    for (int i = 0; i < row.length && i < parts.length; i++) {
+      String val = parts[i];
+      String nullFormat = schema.getNullFormat();
+      if (val.isEmpty() || (nullFormat != null && nullFormat.equals(val))) {
+        row[i] = null;
+      } else {
+        row[i] = val;
+      }
+    }
+    return row;
+  }
+
+  private Object[] parseLineAutoMode(String line) {
+    String[] parts = splitLine(line);
+    Object[] row = new Object[columnNames.length];
+    for (int i = 0; i < row.length && i < parts.length; i++) {
+      String val = parts[i];
+      if (AutoSchemaInferer.DEFAULT_CSV_NULL_TOKENS.contains(val)) {
+        row[i] = null;
+      } else {
+        row[i] = val;
+      }
+    }
+    return row;
+  }
+
+  private void validateColumnCount() {
+    int expected = schema.getSourceColumns().size();
+    if (columnNames.length != expected) {
+      throw new IllegalArgumentException(
+          "Column count mismatch: schema defines "
+              + expected
+              + " columns but CSV header has "
+              + columnNames.length
+              + " columns in "
+              + sourceFile.getAbsolutePath());
+    }
+  }
+
+  private String[] buildColumnNamesFromSchema() {
+    List<ImportSchema.SourceColumn> srcCols = schema.getSourceColumns();
+    String[] names = new String[srcCols.size()];
+    for (int i = 0; i < srcCols.size(); i++) {
+      ImportSchema.SourceColumn col = srcCols.get(i);
+      names[i] = col.isSkip() ? "_skip_" + i : col.getName();
+    }
+    return names;
+  }
+
+  private SourceBatch buildBatch(List<Object[]> rows) {
+    List<String> nameList = new ArrayList<>(columnNames.length);
+    for (String name : columnNames) {
+      nameList.add(name);
+    }
+    return SourceBatch.fromRows(nameList, rows);
+  }
+}
diff --git a/java/tools/src/main/java/org/apache/tsfile/tools/ImportExecutor.java b/java/tools/src/main/java/org/apache/tsfile/tools/ImportExecutor.java
new file mode 100644
index 000000000..80961019e
--- /dev/null
+++ b/java/tools/src/main/java/org/apache/tsfile/tools/ImportExecutor.java
@@ -0,0 +1,134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.tsfile.tools;
+
+import org.apache.tsfile.write.TsFileWriter;
+import org.apache.tsfile.write.record.Tablet;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+
+public class ImportExecutor {
+
+  private static final Logger LOGGER = LoggerFactory.getLogger(ImportExecutor.class);
+
+  private final ImportSchema importSchema;
+  private final TimeConverter timeConverter;
+  private final TabletBuilder tabletBuilder;
+
+  public ImportExecutor(ImportSchema importSchema) {
+    this.importSchema = importSchema;
+    this.timeConverter = new TimeConverter(importSchema.getTimePrecision());
+    this.tabletBuilder = new TabletBuilder(importSchema, timeConverter);
+  }
+
+  public boolean execute(SourceReader reader, String outputDir, String sourceBaseName) {
+    return execute(reader, outputDir, sourceBaseName, null);
+  }
+
+  public boolean execute(
+      SourceReader reader, String outputDir, String sourceBaseName, String failDir) {
+    try {
+      Files.createDirectories(Paths.get(outputDir));
+    } catch (IOException e) {
+      LOGGER.error("Failed to create output directory: " + outputDir, e);
+      return false;
+    }
+
+    int chunkIndex = 0;
+    boolean hasData = false;
+    boolean allSuccess = true;
+
+    SourceBatch batch;
+    while ((batch = reader.readBatch()) != null) {
+      if (batch.isEmpty()) {
+        continue;
+      }
+      hasData = true;
+      chunkIndex++;
+      String tsFileName = buildOutputFileName(sourceBaseName, chunkIndex);
+      boolean ok = writeTsFile(batch, outputDir, tsFileName);
+      if (!ok) {
+        allSuccess = false;
+        LOGGER.error("Failed to write chunk " + chunkIndex + " to " + tsFileName);
+      }
+    }
+
+    if (!hasData) {
+      LOGGER.warn("No data read from source: " + sourceBaseName);
+    }
+
+    if (chunkIndex == 1) {
+      String singleName = sourceBaseName + ".tsfile";
+      File indexed = new File(outputDir, buildOutputFileName(sourceBaseName, 1));
+      File single = new File(outputDir, singleName);
+      if (indexed.exists() && !single.exists()) {
+        if (!indexed.renameTo(single)) {
+          LOGGER.warn("Failed to rename " + indexed.getName() + " to " +
singleName); + } + } + } + + return allSuccess; + } + + private boolean writeTsFile(SourceBatch batch, String outputDir, String fileName) { + File tsFile = new File(outputDir, fileName); + TsFileWriter writer = null; + boolean success = false; + try { + writer = new TsFileWriter(tsFile); + writer.setGenerateTableSchema(true); + writer.registerTableSchema(tabletBuilder.getTableSchema()); + Tablet tablet = tabletBuilder.build(batch); + writer.writeTable(tablet); + success = true; + return true; + } catch (Exception e) { + LOGGER.error("Failed to write file: " + tsFile.getAbsolutePath(), e); + return false; + } finally { + if (writer != null) { + try { + writer.close(); + } catch (IOException e) { + LOGGER.error("Failed to close file: " + tsFile.getAbsolutePath(), e); + } + } + if (!success) { + deleteFile(tsFile); + } + } + } + + private static String buildOutputFileName(String baseName, int chunkIndex) { + return baseName + "_" + chunkIndex + ".tsfile"; + } + + private static void deleteFile(File file) { + if (file.exists() && !file.delete()) { + LOGGER.warn("Failed to delete: " + file.getAbsolutePath()); + } + } +} diff --git a/java/tools/src/main/java/org/apache/tsfile/tools/ImportSchema.java b/java/tools/src/main/java/org/apache/tsfile/tools/ImportSchema.java new file mode 100644 index 000000000..234cd6a20 --- /dev/null +++ b/java/tools/src/main/java/org/apache/tsfile/tools/ImportSchema.java @@ -0,0 +1,247 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.tsfile.tools; + +import org.apache.tsfile.enums.TSDataType; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +public class ImportSchema { + + private String tableName = ""; + private String timePrecision = "ms"; + private boolean hasHeader = true; + private String separator = ","; + private String nullFormat; + private String timeColumnName = ""; + private List<TagColumn> tagColumns = new ArrayList<>(); + private List<SourceColumn> sourceColumns = new ArrayList<>(); + + public static class TagColumn { + private final String name; + private final boolean hasDefault; + private final String defaultValue; + + public TagColumn(String name) { + this.name = name; + this.hasDefault = false; + this.defaultValue = null; + } + + public TagColumn(String name, String defaultValue) { + this.name = name; + this.hasDefault = true; + this.defaultValue = defaultValue; + } + + public String getName() { + return name; + } + + public boolean hasDefault() { + return hasDefault; + } + + public String getDefaultValue() { + return defaultValue; + } + + public boolean existsInSource() { + return !hasDefault; + } + + @Override + public String toString() { + if (hasDefault) { + return "TagColumn{name='" + name + "', default='" + defaultValue + "'}"; + } + return "TagColumn{name='" + name + "'}"; + } + } + + public static class SourceColumn { + private final String name; + private final TSDataType dataType; + private final boolean skip; + + public SourceColumn(String name, TSDataType dataType) { + this.name = name; + this.dataType = dataType; + this.skip = false; + } + + private SourceColumn() { + this.name = null; + this.dataType = null; + this.skip = true; + } + + public static SourceColumn skip() { + return new SourceColumn(); + } + + public static SourceColumn skip(String name) { + return new SourceColumn(name, true); + } + + private SourceColumn(String name, boolean skip) { + this.name = name; + this.dataType = null; + this.skip = skip; + } + + public String getName() { + return name; + } + + public TSDataType getDataType() { + return dataType; + } + + public boolean isSkip() { + return skip; + } + + @Override + public String toString() { + if (skip) { + return name != null ?
"SourceColumn{SKIP, name='" + name + "'}" : "SourceColumn{SKIP}"; + } + return "SourceColumn{name='" + name + "', type=" + dataType + "}"; + } + } + + public List fieldColumns() { + Set tagNames = new HashSet<>(); + for (TagColumn tag : tagColumns) { + if (tag.existsInSource()) { + tagNames.add(tag.getName()); + } + } + + List fields = new ArrayList<>(); + for (SourceColumn col : sourceColumns) { + if (col.isSkip()) { + continue; + } + if (col.getName().equals(timeColumnName)) { + continue; + } + if (tagNames.contains(col.getName())) { + continue; + } + fields.add(col); + } + return Collections.unmodifiableList(fields); + } + + // --- getters and setters --- + + public String getTableName() { + return tableName; + } + + public void setTableName(String tableName) { + this.tableName = tableName; + } + + public String getTimePrecision() { + return timePrecision; + } + + public void setTimePrecision(String timePrecision) { + this.timePrecision = timePrecision; + } + + public boolean isHasHeader() { + return hasHeader; + } + + public void setHasHeader(boolean hasHeader) { + this.hasHeader = hasHeader; + } + + public String getSeparator() { + return separator; + } + + public void setSeparator(String separator) { + this.separator = separator; + } + + public String getNullFormat() { + return nullFormat; + } + + public void setNullFormat(String nullFormat) { + this.nullFormat = nullFormat; + } + + public String getTimeColumnName() { + return timeColumnName; + } + + public void setTimeColumnName(String timeColumnName) { + this.timeColumnName = timeColumnName; + } + + public List getTagColumns() { + return tagColumns; + } + + public void setTagColumns(List tagColumns) { + this.tagColumns = tagColumns; + } + + public List getSourceColumns() { + return sourceColumns; + } + + public void setSourceColumns(List sourceColumns) { + this.sourceColumns = sourceColumns; + } + + @Override + public String toString() { + return "ImportSchema{" + + "tableName='" + + tableName + + "', timePrecision='" + + timePrecision + + "', hasHeader=" + + hasHeader + + ", separator='" + + separator + + "', nullFormat='" + + nullFormat + + "', timeColumnName='" + + timeColumnName + + "', tagColumns=" + + tagColumns + + ", sourceColumns=" + + sourceColumns + + ", fieldColumns=" + + fieldColumns() + + '}'; + } +} diff --git a/java/tools/src/main/java/org/apache/tsfile/tools/ImportSchemaParser.java b/java/tools/src/main/java/org/apache/tsfile/tools/ImportSchemaParser.java new file mode 100644 index 000000000..28410b421 --- /dev/null +++ b/java/tools/src/main/java/org/apache/tsfile/tools/ImportSchemaParser.java @@ -0,0 +1,214 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.tsfile.tools; + +import org.apache.tsfile.enums.TSDataType; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +public class ImportSchemaParser { + + private enum Section { + NONE, + TAG_COLUMNS, + SOURCE_COLUMNS + } + + public static ImportSchema parse(String filePath) throws IOException { + ImportSchema schema = new ImportSchema(); + List<ImportSchema.TagColumn> tagColumns = new ArrayList<>(); + List<ImportSchema.SourceColumn> sourceColumns = new ArrayList<>(); + + try (BufferedReader reader = new BufferedReader(new FileReader(filePath))) { + String line; + Section section = Section.NONE; + + while ((line = reader.readLine()) != null) { + line = line.trim(); + if (line.isEmpty() || line.startsWith("//")) { + continue; + } + + if (line.startsWith("table_name=")) { + schema.setTableName(extractValue(line)); + section = Section.NONE; + } else if (line.startsWith("time_precision=")) { + schema.setTimePrecision(extractValue(line)); + section = Section.NONE; + } else if (line.startsWith("has_header=")) { + String val = extractValue(line); + if (!"true".equals(val) && !"false".equals(val)) { + throw new IllegalArgumentException("has_header must be true or false"); + } + schema.setHasHeader(Boolean.parseBoolean(val)); + section = Section.NONE; + } else if (line.startsWith("separator=")) { + schema.setSeparator(extractValue(line)); + section = Section.NONE; + } else if (line.startsWith("null_format=")) { + schema.setNullFormat(extractValue(line)); + section = Section.NONE; + } else if (line.startsWith("time_column=")) { + schema.setTimeColumnName(extractValue(line)); + section = Section.NONE; + } else if (line.equals("tag_columns") || line.equals("id_columns")) { + section = Section.TAG_COLUMNS; + } else if (line.equals("source_columns") || line.equals("csv_columns")) { + section = Section.SOURCE_COLUMNS; + } else if (section == Section.TAG_COLUMNS) { + tagColumns.add(parseTagColumn(line)); + } else if (section == Section.SOURCE_COLUMNS) { + sourceColumns.add(parseSourceColumn(line)); + } + } + } + + schema.setTagColumns(tagColumns); + schema.setSourceColumns(sourceColumns); + + if ("tab".equals(schema.getSeparator())) { + schema.setSeparator("\t"); + } + + validate(schema); + return schema; + } + + private static String extractValue(String line) { + int index = line.indexOf('='); + return line.substring(index + 1); + } + + private static ImportSchema.TagColumn parseTagColumn(String line) { + String[] parts = line.split(" "); + if (parts.length == 3 && parts[1].trim().equalsIgnoreCase("DEFAULT")) { + return new ImportSchema.TagColumn(parts[0].trim(), parts[2].trim()); + } else if (parts.length == 1) { + return new ImportSchema.TagColumn(parts[0].trim()); + } + throw new IllegalArgumentException("Invalid tag_columns format: " + line); + } + + private static ImportSchema.SourceColumn parseSourceColumn(String line) { + String[] parts = line.split(" "); + String name = parts[0].trim(); + if (name.endsWith(",") || name.endsWith(";")) { + name = name.substring(0, name.length() - 1); + } + + if (parts.length == 2) { + String dataType = parts[1].trim(); + if (dataType.endsWith(",") || dataType.endsWith(";")) { + dataType = dataType.substring(0, dataType.length() - 1); + } + if (dataType.equalsIgnoreCase("SKIP")) { + return ImportSchema.SourceColumn.skip(name); + } + return new ImportSchema.SourceColumn(name, resolveDataType(dataType)); + } else if (parts.length == 1) { + if
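/* a single token is either the SKIP keyword or a column name whose type defaults to STRING */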
(name.equalsIgnoreCase("SKIP")) { + return ImportSchema.SourceColumn.skip(); + } + return new ImportSchema.SourceColumn(name, TSDataType.STRING); + } + throw new IllegalArgumentException("Invalid source_columns format: " + line); + } + + private static TSDataType resolveDataType(String typeStr) { + switch (typeStr.toUpperCase()) { + case "TEXT": + return TSDataType.TEXT; + case "STRING": + return TSDataType.STRING; + case "INT32": + return TSDataType.INT32; + case "INT64": + return TSDataType.INT64; + case "FLOAT": + return TSDataType.FLOAT; + case "DOUBLE": + return TSDataType.DOUBLE; + case "BOOLEAN": + return TSDataType.BOOLEAN; + case "BLOB": + return TSDataType.BLOB; + case "DATE": + return TSDataType.DATE; + case "TIMESTAMP": + return TSDataType.TIMESTAMP; + default: + throw new IllegalArgumentException("Unknown data type: " + typeStr); + } + } + + private static void validate(ImportSchema schema) { + String tp = schema.getTimePrecision(); + if (!"ms".equals(tp) && !"us".equals(tp) && !"ns".equals(tp)) { + throw new IllegalArgumentException("time_precision must be ms, us, or ns"); + } + + String sep = schema.getSeparator(); + if (!",".equals(sep) && !"\t".equals(sep) && !";".equals(sep)) { + throw new IllegalArgumentException("separator must be \",\", tab, or \";\""); + } + + if (schema.getTableName().isEmpty()) { + throw new IllegalArgumentException("table_name is required"); + } + + if (schema.getTimeColumnName().isEmpty()) { + throw new IllegalArgumentException("time_column is required"); + } + + if (schema.getSourceColumns().isEmpty()) { + throw new IllegalArgumentException("source_columns is required"); + } + + boolean timeFound = false; + for (ImportSchema.SourceColumn col : schema.getSourceColumns()) { + if (!col.isSkip() && col.getName().equals(schema.getTimeColumnName())) { + timeFound = true; + break; + } + } + if (!timeFound) { + throw new IllegalArgumentException( + "time_column '" + schema.getTimeColumnName() + "' not found in source_columns"); + } + + Set sourceNames = new HashSet<>(); + for (ImportSchema.SourceColumn col : schema.getSourceColumns()) { + if (!col.isSkip()) { + sourceNames.add(col.getName()); + } + } + for (ImportSchema.TagColumn tag : schema.getTagColumns()) { + if (tag.existsInSource() && !sourceNames.contains(tag.getName())) { + throw new IllegalArgumentException( + "tag_columns '" + tag.getName() + "' not found in source_columns"); + } + } + } +} diff --git a/java/tools/src/main/java/org/apache/tsfile/tools/ParquetSourceReader.java b/java/tools/src/main/java/org/apache/tsfile/tools/ParquetSourceReader.java new file mode 100644 index 000000000..5cc547d2b --- /dev/null +++ b/java/tools/src/main/java/org/apache/tsfile/tools/ParquetSourceReader.java @@ -0,0 +1,315 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.tsfile.tools; + +import org.apache.tsfile.enums.TSDataType; + +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.example.data.Group; +import org.apache.parquet.example.data.simple.convert.GroupRecordConverter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.io.ColumnIOFactory; +import org.apache.parquet.io.LocalInputFile; +import org.apache.parquet.io.MessageColumnIO; +import org.apache.parquet.io.RecordReader; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class ParquetSourceReader implements SourceReader { + + private static final Logger LOGGER = LoggerFactory.getLogger(ParquetSourceReader.class); + + private final File sourceFile; + private ImportSchema schema; + private ParquetFileReader parquetReader; + private MessageType parquetSchema; + private boolean exhausted; + + private String overrideTableName; + private String overrideTimePrecision; + + public ParquetSourceReader(File sourceFile, ImportSchema schema) { + this.sourceFile = sourceFile; + this.schema = schema; + this.exhausted = false; + } + + public ParquetSourceReader(File sourceFile) { + this.sourceFile = sourceFile; + this.schema = null; + this.exhausted = false; + } + + public void setOverrideTableName(String tableName) { + this.overrideTableName = tableName; + } + + public void setOverrideTimePrecision(String timePrecision) { + this.overrideTimePrecision = timePrecision; + } + + @Override + public ImportSchema inferSchema() { + if (schema != null) { + throw new UnsupportedOperationException("inferSchema() is only available in auto mode"); + } + + try { + ensureReaderOpen(); + + List<String> columnNames = new ArrayList<>(); + List<TSDataType> columnTypes = new ArrayList<>(); + String detectedTimePrecision = null; + + for (Type field : parquetSchema.getFields()) { + String name = field.getName(); + columnNames.add(name); + + if (field.isPrimitive()) { + PrimitiveType pt = field.asPrimitiveType(); + TSDataType tsType = mapParquetType(pt); + columnTypes.add(tsType); + + if (("time".equals(name) || "TIME".equals(name)) && detectedTimePrecision == null) { + detectedTimePrecision = detectTimestampPrecision(pt); + } + } else { + columnTypes.add(TSDataType.STRING); + } + } + + String timeColumn = AutoSchemaInferer.detectTimeColumn(columnNames); + TSDataType[] types = columnTypes.toArray(new TSDataType[0]); + + String tableName = + overrideTableName != null + ?
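/* the --table_name CLI override takes precedence over the name derived from the source file */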
overrideTableName + : AutoSchemaInferer.deriveTableName(sourceFile.getName(), "parquet_data"); + + String timePrecision; + if (overrideTimePrecision != null) { + timePrecision = overrideTimePrecision; + } else if (detectedTimePrecision != null) { + timePrecision = detectedTimePrecision; + } else { + timePrecision = "ms"; + } + + schema = + AutoSchemaInferer.buildAutoSchema( + tableName, timeColumn, columnNames, types, timePrecision); + return schema; + } catch (IOException e) { + throw new RuntimeException("Failed to infer schema from: " + sourceFile.getAbsolutePath(), e); + } + } + + @Override + public SourceBatch readBatch() { + if (exhausted) { + return null; + } + + try { + ensureReaderOpen(); + + PageReadStore rowGroup = parquetReader.readNextRowGroup(); + if (rowGroup == null) { + exhausted = true; + return null; + } + + long rowCount = rowGroup.getRowCount(); + MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(parquetSchema); + RecordReader<Group> recordReader = + columnIO.getRecordReader(rowGroup, new GroupRecordConverter(parquetSchema)); + + List<String> schemaColumnNames = getSchemaColumnNames(); + Map<String, Integer> parquetColIndex = buildParquetColumnIndex(); + + int numCols = schemaColumnNames.size(); + List<Object[]> rows = new ArrayList<>((int) rowCount); + + for (long r = 0; r < rowCount; r++) { + Group group = recordReader.read(); + Object[] row = new Object[numCols]; + + for (int c = 0; c < numCols; c++) { + String colName = schemaColumnNames.get(c); + Integer pIdx = parquetColIndex.get(colName); + if (pIdx == null) { + row[c] = null; + continue; + } + + try { + if (group.getFieldRepetitionCount(pIdx) == 0) { + row[c] = null; + } else { + row[c] = extractValue(group, pIdx); + } + } catch (RuntimeException e) { + row[c] = null; + } + } + rows.add(row); + } + + return SourceBatch.fromRows(schemaColumnNames, rows); + } catch (IOException e) { + LOGGER.error("Error reading Parquet file: " + sourceFile.getAbsolutePath(), e); + exhausted = true; + return null; + } + } + + @Override + public void close() { + if (parquetReader != null) { + try { + parquetReader.close(); + } catch (IOException e) { + LOGGER.error("Error closing Parquet reader", e); + } + parquetReader = null; + } + } + + private void ensureReaderOpen() throws IOException { + if (parquetReader == null) { + parquetReader = ParquetFileReader.open(new LocalInputFile(sourceFile.toPath())); + parquetSchema = parquetReader.getFooter().getFileMetaData().getSchema(); + } + } + + private List<String> getSchemaColumnNames() { + List<String> names = new ArrayList<>(); + for (ImportSchema.SourceColumn col : schema.getSourceColumns()) { + if (!col.isSkip()) { + names.add(col.getName()); + } + } + return names; + } + + private Map<String, Integer> buildParquetColumnIndex() { + Map<String, Integer> index = new HashMap<>(); + List<Type> fields = parquetSchema.getFields(); + for (int i = 0; i < fields.size(); i++) { + index.put(fields.get(i).getName(), i); + } + return index; + } + + private Object extractValue(Group group, int fieldIndex) { + Type fieldType = parquetSchema.getType(fieldIndex); + if (!fieldType.isPrimitive()) { + return group.getGroup(fieldIndex, 0).toString(); + } + + PrimitiveType pt = fieldType.asPrimitiveType(); + switch (pt.getPrimitiveTypeName()) { + case BOOLEAN: + return group.getBoolean(fieldIndex, 0); + case INT32: + return group.getInteger(fieldIndex, 0); + case INT64: + return group.getLong(fieldIndex, 0); + case FLOAT: + return group.getFloat(fieldIndex, 0); + case DOUBLE: + return group.getDouble(fieldIndex, 0); + case BINARY: + case FIXED_LEN_BYTE_ARRAY: + LogicalTypeAnnotation
logical = pt.getLogicalTypeAnnotation(); + if (logical instanceof LogicalTypeAnnotation.StringLogicalTypeAnnotation) { + return group.getString(fieldIndex, 0); + } + return group.getBinary(fieldIndex, 0).getBytes(); + case INT96: + return group.getBinary(fieldIndex, 0).getBytes(); + default: + return group.getValueToString(fieldIndex, 0); + } + } + + static TSDataType mapParquetType(PrimitiveType pt) { + LogicalTypeAnnotation logical = pt.getLogicalTypeAnnotation(); + + if (logical instanceof LogicalTypeAnnotation.TimestampLogicalTypeAnnotation) { + return TSDataType.INT64; + } + if (logical instanceof LogicalTypeAnnotation.DateLogicalTypeAnnotation) { + return TSDataType.DATE; + } + if (logical instanceof LogicalTypeAnnotation.StringLogicalTypeAnnotation) { + return TSDataType.STRING; + } + + switch (pt.getPrimitiveTypeName()) { + case BOOLEAN: + return TSDataType.BOOLEAN; + case INT32: + return TSDataType.INT32; + case INT64: + return TSDataType.INT64; + case FLOAT: + return TSDataType.FLOAT; + case DOUBLE: + return TSDataType.DOUBLE; + case BINARY: + case FIXED_LEN_BYTE_ARRAY: + return TSDataType.STRING; + case INT96: + return TSDataType.INT64; + default: + return TSDataType.STRING; + } + } + + static String detectTimestampPrecision(PrimitiveType pt) { + LogicalTypeAnnotation logical = pt.getLogicalTypeAnnotation(); + if (logical instanceof LogicalTypeAnnotation.TimestampLogicalTypeAnnotation) { + LogicalTypeAnnotation.TimestampLogicalTypeAnnotation ts = + (LogicalTypeAnnotation.TimestampLogicalTypeAnnotation) logical; + switch (ts.getUnit()) { + case MILLIS: + return "ms"; + case MICROS: + return "us"; + case NANOS: + return "ns"; + default: + return null; + } + } + return null; + } +} diff --git a/java/tools/src/main/java/org/apache/tsfile/tools/SourceBatch.java b/java/tools/src/main/java/org/apache/tsfile/tools/SourceBatch.java new file mode 100644 index 000000000..18d01b2bb --- /dev/null +++ b/java/tools/src/main/java/org/apache/tsfile/tools/SourceBatch.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.tsfile.tools; + +import java.util.Arrays; +import java.util.List; + +public class SourceBatch { + + private final String[] columnNames; + private final Object[][] columnData; + private final int rowCount; + + public SourceBatch(String[] columnNames, Object[][] columnData, int rowCount) { + this.columnNames = columnNames; + this.columnData = columnData; + this.rowCount = rowCount; + } + + public static SourceBatch fromRows(List<String> columnNames, List<Object[]> rows) { + int colCount = columnNames.size(); + int rowCount = rows.size(); + String[] names = columnNames.toArray(new String[0]); + Object[][] colData = new Object[colCount][rowCount]; + for (int r = 0; r < rowCount; r++) { + Object[] row = rows.get(r); + for (int c = 0; c < colCount; c++) { + colData[c][r] = c < row.length ? row[c] : null; + } + } + return new SourceBatch(names, colData, rowCount); + } + + public int getRowCount() { + return rowCount; + } + + public int getColumnCount() { + return columnNames.length; + } + + public String getColumnName(int columnIndex) { + return columnNames[columnIndex]; + } + + public String[] getColumnNames() { + return columnNames; + } + + public Object getValue(int rowIndex, int columnIndex) { + return columnData[columnIndex][rowIndex]; + } + + public Object[] getColumn(int columnIndex) { + return columnData[columnIndex]; + } + + public boolean isEmpty() { + return rowCount == 0; + } + + @Override + public String toString() { + return "SourceBatch{columns=" + Arrays.toString(columnNames) + ", rows=" + rowCount + '}'; + } +} diff --git a/java/tools/src/main/java/org/apache/tsfile/tools/SourceReader.java b/java/tools/src/main/java/org/apache/tsfile/tools/SourceReader.java new file mode 100644 index 000000000..aa6721005 --- /dev/null +++ b/java/tools/src/main/java/org/apache/tsfile/tools/SourceReader.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.tsfile.tools; + +/** + * Unified interface for reading source data from different formats (CSV, Parquet, Arrow). + + *

<p>Usage in schema mode: construct with an ImportSchema, then call readBatch() repeatedly. + + *
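<p>For example, a minimal schema-mode sketch (file and directory names here are illustrative only): + * <pre>{@code + * ImportSchema schema = ImportSchemaParser.parse("import.schema"); + * try (SourceReader reader = new CsvSourceReader(new File("data.csv"), schema, 256L * 1024 * 1024)) { + *   new ImportExecutor(schema).execute(reader, "out", "data"); + * } + * }</pre> + + *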

<p>Usage in auto mode: call inferSchema() first, then readBatch() repeatedly. + */ +public interface SourceReader extends AutoCloseable { + + /** + * Infer schema from the source data (auto mode). Examines column names and types to produce an + * ImportSchema where the time column is identified and all other columns become FIELD. + * + * @return inferred ImportSchema + */ + ImportSchema inferSchema(); + + /** + * Read the next batch of data. Returns null when no more data is available. + * + * @return next batch, or null if exhausted + */ + SourceBatch readBatch(); + + @Override + void close(); +} diff --git a/java/tools/src/main/java/org/apache/tsfile/tools/TabletBuilder.java b/java/tools/src/main/java/org/apache/tsfile/tools/TabletBuilder.java new file mode 100644 index 000000000..26a957fdb --- /dev/null +++ b/java/tools/src/main/java/org/apache/tsfile/tools/TabletBuilder.java @@ -0,0 +1,193 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.tsfile.tools; + +import org.apache.tsfile.enums.ColumnCategory; +import org.apache.tsfile.enums.TSDataType; +import org.apache.tsfile.file.metadata.TableSchema; +import org.apache.tsfile.write.record.Tablet; +import org.apache.tsfile.write.schema.IMeasurementSchema; +import org.apache.tsfile.write.schema.MeasurementSchema; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class TabletBuilder { + + private final ImportSchema importSchema; + private final TimeConverter timeConverter; + private final TableSchema tableSchema; + private final Map<String, String> tagDefaults; + private final Map<String, Integer> sourceColumnIndex; + private final int timeColumnSourceIndex; + + public TabletBuilder(ImportSchema importSchema, TimeConverter timeConverter) { + this.importSchema = importSchema; + this.timeConverter = timeConverter; + this.tagDefaults = new HashMap<>(); + this.sourceColumnIndex = new HashMap<>(); + this.tableSchema = buildTableSchema(); + this.timeColumnSourceIndex = resolveTimeColumnIndex(); + } + + public TableSchema getTableSchema() { + return tableSchema; + } + + public Tablet build(SourceBatch batch) { + int rowCount = batch.getRowCount(); + int[] sortedIndices = sortByTimestamp(batch); + + Tablet tablet = + new Tablet( + tableSchema.getTableName(), + IMeasurementSchema.getMeasurementNameList(tableSchema.getColumnSchemas()), + IMeasurementSchema.getDataTypeList(tableSchema.getColumnSchemas()), + tableSchema.getColumnTypes(), + rowCount); + + for (int i = 0; i < rowCount; i++) { + int row = sortedIndices[i]; + Object timeValue = batch.getValue(row, timeColumnSourceIndex); + long timestamp = timeConverter.convert(timeValue, importSchema.getTimePrecision()); + tablet.addTimestamp(i, timestamp); + + for (int col = 0; col <
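/* tablet columns follow the TableSchema order: TAG columns first, then FIELD columns */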
tableSchema.getColumnSchemas().size(); col++) { + IMeasurementSchema colSchema = tableSchema.getColumnSchemas().get(col); + String colName = colSchema.getMeasurementName(); + + if (tagDefaults.containsKey(colName)) { + tablet.addValue(colName, i, tagDefaults.get(colName)); + continue; + } + + Integer srcIdx = sourceColumnIndex.get(colName); + if (srcIdx == null) { + continue; + } + + Object rawValue = batch.getValue(row, srcIdx); + if (isNull(rawValue)) { + continue; + } + + boolean isMeasurement = tableSchema.getColumnTypes().get(col) == ColumnCategory.FIELD; + Object converted = ValueConverter.convert(rawValue, colSchema.getType(), isMeasurement); + tablet.addValue(colName, i, converted); + } + } + + tablet.setRowSize(rowCount); + return tablet; + } + + private int[] sortByTimestamp(SourceBatch batch) { + int rowCount = batch.getRowCount(); + Integer[] indices = new Integer[rowCount]; + for (int i = 0; i < rowCount; i++) { + indices[i] = i; + } + java.util.Arrays.sort( + indices, + (a, b) -> { + Object va = batch.getValue(a, timeColumnSourceIndex); + Object vb = batch.getValue(b, timeColumnSourceIndex); + long ta = timeConverter.convert(va, importSchema.getTimePrecision()); + long tb = timeConverter.convert(vb, importSchema.getTimePrecision()); + return Long.compare(ta, tb); + }); + int[] result = new int[rowCount]; + for (int i = 0; i < rowCount; i++) { + result[i] = indices[i]; + } + return result; + } + + private TableSchema buildTableSchema() { + List<IMeasurementSchema> schemas = new ArrayList<>(); + List<ColumnCategory> categories = new ArrayList<>(); + + for (ImportSchema.TagColumn tag : importSchema.getTagColumns()) { + if (tag.hasDefault()) { + tagDefaults.put(tag.getName(), tag.getDefaultValue()); + } + schemas.add( + new MeasurementSchema( + tag.getName(), + TSDataType.TEXT, + org.apache.tsfile.file.metadata.enums.TSEncoding.PLAIN, + org.apache.tsfile.file.metadata.enums.CompressionType.UNCOMPRESSED)); + categories.add(ColumnCategory.TAG); + } + + for (ImportSchema.SourceColumn field : importSchema.fieldColumns()) { + schemas.add( + new MeasurementSchema( + field.getName(), + field.getDataType(), + org.apache.tsfile.file.metadata.enums.TSEncoding.PLAIN, + org.apache.tsfile.file.metadata.enums.CompressionType.UNCOMPRESSED)); + categories.add(ColumnCategory.FIELD); + } + + return new TableSchema(importSchema.getTableName(), schemas, categories); + } + + private int resolveTimeColumnIndex() { + String[] colNames = findSourceColumnNames(); + String timeName = importSchema.getTimeColumnName(); + for (int i = 0; i < colNames.length; i++) { + if (colNames[i] != null && colNames[i].equals(timeName)) { + return i; + } + } + throw new IllegalArgumentException( + "Time column '" + timeName + "' not found in source columns"); + } + + private String[] findSourceColumnNames() { + List<ImportSchema.SourceColumn> srcCols = importSchema.getSourceColumns(); + String[] names = new String[srcCols.size()]; + for (int i = 0; i < srcCols.size(); i++) { + ImportSchema.SourceColumn col = srcCols.get(i); + names[i] = col.isSkip() ?
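/* SKIP columns keep their source position but are never mapped to a tablet column */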
null : col.getName(); + if (!col.isSkip()) { + sourceColumnIndex.put(col.getName(), i); + } + } + return names; + } + + private boolean isNull(Object value) { + if (value == null) { + return true; + } + if (value instanceof String) { + String s = (String) value; + String nullFormat = importSchema.getNullFormat(); + if (nullFormat != null && nullFormat.equals(s)) { + return true; + } + return s.isEmpty(); + } + return false; + } +} diff --git a/java/tools/src/main/java/org/apache/tsfile/tools/TimeConverter.java b/java/tools/src/main/java/org/apache/tsfile/tools/TimeConverter.java new file mode 100644 index 000000000..02e9aabc7 --- /dev/null +++ b/java/tools/src/main/java/org/apache/tsfile/tools/TimeConverter.java @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.tsfile.tools; + +import java.time.Instant; + +public class TimeConverter { + + private static final long THRESHOLD_NS = (long) 1e17; + private static final long THRESHOLD_US = (long) 1e14; + private static final long THRESHOLD_MS = (long) 1e11; + + private final String targetPrecision; + + public TimeConverter(String targetPrecision) { + this.targetPrecision = targetPrecision; + } + + public long convert(Object value) { + if (value == null) { + throw new IllegalArgumentException("Time value cannot be null"); + } + if (value instanceof Instant) { + return fromInstant((Instant) value); + } + if (value instanceof Number) { + return fromNumeric(((Number) value).longValue()); + } + return fromString(value.toString()); + } + + public long convert(Object value, String sourcePrecision) { + if (value == null) { + throw new IllegalArgumentException("Time value cannot be null"); + } + if (value instanceof Instant) { + return fromInstant((Instant) value); + } + if (value instanceof Number) { + return rescale(((Number) value).longValue(), sourcePrecision, targetPrecision); + } + return fromStringWithPrecision(value.toString(), sourcePrecision); + } + + private long fromStringWithPrecision(String value, String sourcePrecision) { + try { + long numeric = Long.parseLong(value); + return rescale(numeric, sourcePrecision, targetPrecision); + } catch (NumberFormatException e) { + return DateTimeUtils.convertTimestampOrDatetimeStrToLongWithDefaultZone( + value, targetPrecision); + } + } + + private long fromInstant(Instant instant) { + switch (targetPrecision) { + case "ns": + return Math.addExact( + Math.multiplyExact(instant.getEpochSecond(), 1_000_000_000L), instant.getNano()); + case "us": + return Math.addExact( + Math.multiplyExact(instant.getEpochSecond(), 1_000_000L), instant.getNano() / 1_000); + case "s": + return instant.getEpochSecond(); + case "ms": + default: + return instant.toEpochMilli(); + } + } + + private long fromNumeric(long value) { + String inferred = inferPrecision(value); + return
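/* scale from the magnitude-inferred source precision to the configured target precision */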
rescale(value, inferred, targetPrecision); + } + + private long fromString(String value) { + try { + long numeric = Long.parseLong(value); + return fromNumeric(numeric); + } catch (NumberFormatException e) { + return DateTimeUtils.convertTimestampOrDatetimeStrToLongWithDefaultZone( + value, targetPrecision); + } + } + + static String inferPrecision(long value) { + long abs = Math.abs(value); + if (abs > THRESHOLD_NS) { + return "ns"; + } else if (abs > THRESHOLD_US) { + return "us"; + } else if (abs > THRESHOLD_MS) { + return "ms"; + } + return "s"; + } + + static long rescale(long value, String from, String to) { + if (from.equals(to)) { + return value; + } + long valueInNs = toNanos(value, from); + return fromNanos(valueInNs, to); + } + + private static long toNanos(long value, String precision) { + switch (precision) { + case "s": + return Math.multiplyExact(value, 1_000_000_000L); + case "ms": + return Math.multiplyExact(value, 1_000_000L); + case "us": + return Math.multiplyExact(value, 1_000L); + case "ns": + return value; + default: + throw new IllegalArgumentException("Unknown precision: " + precision); + } + } + + private static long fromNanos(long nanos, String precision) { + switch (precision) { + case "s": + return nanos / 1_000_000_000L; + case "ms": + return nanos / 1_000_000L; + case "us": + return nanos / 1_000L; + case "ns": + return nanos; + default: + throw new IllegalArgumentException("Unknown precision: " + precision); + } + } +} diff --git a/java/tools/src/main/java/org/apache/tsfile/tools/TsFileTool.java b/java/tools/src/main/java/org/apache/tsfile/tools/TsFileTool.java index 26b699167..fb3251112 100644 --- a/java/tools/src/main/java/org/apache/tsfile/tools/TsFileTool.java +++ b/java/tools/src/main/java/org/apache/tsfile/tools/TsFileTool.java @@ -19,18 +19,7 @@ package org.apache.tsfile.tools; -import org.apache.tsfile.enums.ColumnCategory; -import org.apache.tsfile.enums.TSDataType; import org.apache.tsfile.external.commons.io.FilenameUtils; -import org.apache.tsfile.external.commons.lang3.StringUtils; -import org.apache.tsfile.file.metadata.TableSchema; -import org.apache.tsfile.file.metadata.enums.CompressionType; -import org.apache.tsfile.file.metadata.enums.TSEncoding; -import org.apache.tsfile.utils.Binary; -import org.apache.tsfile.write.TsFileWriter; -import org.apache.tsfile.write.record.Tablet; -import org.apache.tsfile.write.schema.IMeasurementSchema; -import org.apache.tsfile.write.schema.MeasurementSchema; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; @@ -41,38 +30,32 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.BufferedReader; -import java.io.BufferedWriter; import java.io.File; -import java.io.FileWriter; import java.io.IOException; -import java.io.InputStreamReader; -import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.StandardCopyOption; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicInteger; public class TsFileTool { + + private static final Logger LOGGER = LoggerFactory.getLogger(TsFileTool.class); + private static int THREAD_COUNT = 8; - // Default value 256MB - private static long CHUNK_SIZE_BYTE = 1024 * 1024 * 256; + private static long CHUNK_SIZE_BYTE = 256L * 
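/* default chunk size: 256 MB */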
1024 * 1024; private static String outputDirectoryStr = ""; private static String inputDirectoryStr = ""; private static String failedDirectoryStr = "failed"; private static String schemaPathStr = ""; + private static String tableNameStr = null; + private static String timePrecisionStr = null; + private static String separatorStr = null; + private static String formatStr = null; - private static SchemaParser.Schema schema = null; - - private static final Logger LOGGER = LoggerFactory.getLogger(TsFileTool.class); + private static ImportSchema importSchema = null; public static void main(String[] args) { if (System.getenv("TSFILE_HOME") != null) { @@ -83,14 +66,18 @@ public static void main(String[] args) { return; } createDir(); - try { - schema = SchemaParser.parseSchema(schemaPathStr); - } catch (Exception e) { - LOGGER.error("Failed to parse schema file: " + schemaPathStr, e); - System.exit(1); + + boolean isSchemaMode = schemaPathStr != null && !schemaPathStr.isEmpty(); + if (isSchemaMode) { + try { + importSchema = ImportSchemaParser.parse(schemaPathStr); + } catch (Exception e) { + LOGGER.error("Failed to parse schema file: " + schemaPathStr, e); + System.exit(1); + } } - File inputDirectory = new File(inputDirectoryStr); + File inputDirectory = new File(inputDirectoryStr); ExecutorService executor = Executors.newFixedThreadPool(THREAD_COUNT); try { @@ -106,384 +93,198 @@ public static void main(String[] args) { } } - private static TableSchema genTableSchema( - List idColumnList, - List columnList, - String tableName, - Map defaultMap) { - List measurementSchemas = new ArrayList<>(); - List columnCategories = new ArrayList<>(); - List idSchemaList = new ArrayList<>(); - for (SchemaParser.IDColumns idSchema : idColumnList) { - if (idSchema.isDefault) { - defaultMap.put(idSchema.name, idSchema.defaultValue); - } - idSchemaList.add(idSchema.name); - measurementSchemas.add( - new MeasurementSchema( - idSchema.name, TSDataType.TEXT, TSEncoding.PLAIN, CompressionType.UNCOMPRESSED)); - columnCategories.add(ColumnCategory.TAG); - } - List newColumnList = new ArrayList<>(); - - for (SchemaParser.Column column : columnList) { - if (!column.isSkip - && !idSchemaList.contains(column.name) - && !column.name.equals(schema.timeColumn)) { - newColumnList.add(column); + private static void processDirectory(File directory, ExecutorService executor) { + if (directory.isFile()) { + processFile(directory, executor); + } else { + File[] files = directory.listFiles(); + if (files != null) { + for (File file : files) { + if (file.isDirectory()) { + processDirectory(file, executor); + } else if (file.isFile() && isAcceptedFormat(file.getName())) { + processFile(file, executor); + } + } } } - - for (SchemaParser.Column column : newColumnList) { - measurementSchemas.add( - new MeasurementSchema( - column.name, - TSDataType.valueOf(column.type), - TSEncoding.PLAIN, - CompressionType.UNCOMPRESSED)); - columnCategories.add(ColumnCategory.FIELD); - } - return new TableSchema(tableName, measurementSchemas, columnCategories); } - private static boolean writeTsFile( - String sourceFilePath, String fileName, List lineList) { - String inputFileAbsolutePath = new File(inputDirectoryStr).getAbsolutePath(); - String soureFlieName = new File(sourceFilePath).getName(); - String fileOutPutDirStr = - outputDirectoryStr - + sourceFilePath.replace(inputFileAbsolutePath, "").replace(soureFlieName, ""); - final File tsFile = new File(fileOutPutDirStr, fileName); - TsFileWriter writer = null; - try { - writer = new 
TsFileWriter(tsFile); - writer.setGenerateTableSchema(true); - Map defaultMap = new HashMap<>(); - TableSchema tableSchema = - genTableSchema(schema.idColumns, schema.csvColumns, schema.tableName, defaultMap); - writer.registerTableSchema(tableSchema); - Tablet tablet = genTablet(tableSchema, lineList, defaultMap); - if (tablet != null) { - writer.writeTable(tablet); - return true; - } else { - return false; - } - } catch (Exception e) { - LOGGER.error("Failed to write file: " + tsFile.getAbsolutePath(), e); - return false; - } finally { - if (writer != null) { - try { - writer.close(); - } catch (IOException e) { - LOGGER.error("Failed to close file: " + tsFile.getAbsolutePath(), e); - } - } + private static boolean isAcceptedFormat(String fileName) { + String lower = fileName.toLowerCase(); + String fmt = resolveFormat(fileName); + if (formatStr != null) { + return fmt.equals(formatStr); } + return lower.endsWith(".csv") + || lower.endsWith(".parquet") + || lower.endsWith(".arrow") + || lower.endsWith(".ipc") + || lower.endsWith(".feather"); } - private static void deleteFile(File tsfile) { - if (!tsfile.delete()) { - LOGGER.error(tsfile.getAbsolutePath() + " delete failed"); + private static String resolveFormat(String fileName) { + if (formatStr != null) { + return formatStr; } + String lower = fileName.toLowerCase(); + if (lower.endsWith(".parquet")) { + return "parquet"; + } + if (lower.endsWith(".arrow") || lower.endsWith(".ipc") || lower.endsWith(".feather")) { + return "arrow"; + } + return "csv"; } - private static Tablet genTablet( - TableSchema tableSchema, List lineList, Map defaultMap) { - int num = lineList.size(); - Tablet tablet = - new Tablet( - tableSchema.getTableName(), - IMeasurementSchema.getMeasurementNameList(tableSchema.getColumnSchemas()), - IMeasurementSchema.getDataTypeList(tableSchema.getColumnSchemas()), - tableSchema.getColumnTypes(), - num); - - Map map = new HashMap<>(); - for (int i = 0; i < schema.csvColumns.size(); i++) { - SchemaParser.Column column = schema.csvColumns.get(i); - map.put(column.name, i); - } - try { - List parsedLines = sortAndParseLines(lineList); - for (int i = 0; i < num; i++) { - String[] lineArray = parsedLines.get(i); - long timestamp = - DateTimeUtils.convertTimestampOrDatetimeStrToLongWithDefaultZone( - lineArray[schema.timeColumnIndex], schema.timePrecision); + private static void processFile(File inputFile, ExecutorService executor) { + String baseName = FilenameUtils.getBaseName(inputFile.getName()); + String inputFileAbsolutePath = new File(inputDirectoryStr).getAbsolutePath(); + String sourceFileName = inputFile.getName(); + String relativePath = + inputFile.getAbsolutePath().replace(inputFileAbsolutePath, "").replace(sourceFileName, ""); + String outputDir = outputDirectoryStr + relativePath; + String format = resolveFormat(inputFile.getName()); - tablet.addTimestamp(i, timestamp); - List columnSchemas = tableSchema.getColumnSchemas(); - for (int j = 0; j < columnSchemas.size(); j++) { - IMeasurementSchema columnSchema = columnSchemas.get(j); - if (defaultMap.get(columnSchema.getMeasurementName()) != null) { - tablet.addValue( - columnSchema.getMeasurementName(), - i, - defaultMap.get(columnSchema.getMeasurementName())); - } else { - String value = lineArray[map.get(columnSchema.getMeasurementName())]; - if (value.equals(schema.nullFormat)) { - value = null; + executor.submit( + () -> { + try { + if (importSchema != null) { + processSchemaMode(inputFile, baseName, outputDir, format); + } else { + processAutoMode(inputFile, 
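/* no schema file supplied, so infer one from the source itself */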
baseName, outputDir, format); } - tablet.addValue( - columnSchema.getMeasurementName(), - i, - getValue(columnSchema.getType(), value, tableSchema.getColumnTypes().get(j))); + } catch (Exception e) { + LOGGER.error("Failed to process file: " + inputFile.getAbsolutePath(), e); + cpFile(inputFile.getAbsolutePath(), failedDirectoryStr); } - } + }); + } + + private static void processSchemaMode( + File inputFile, String baseName, String outputDir, String format) { + try (SourceReader reader = createSchemaReader(inputFile, format)) { + ImportExecutor importExecutor = new ImportExecutor(importSchema); + boolean success = importExecutor.execute(reader, outputDir, baseName, failedDirectoryStr); + if (success) { + LOGGER.info(baseName + ".tsfile successfully generated"); + } else { + cpFile(inputFile.getAbsolutePath(), failedDirectoryStr); } - tablet.setRowSize(num); - return tablet; } catch (Exception e) { - LOGGER.error("Failed to parse csv file", e); + LOGGER.error("Failed to process file: " + inputFile.getAbsolutePath(), e); + cpFile(inputFile.getAbsolutePath(), failedDirectoryStr); } - return null; } - public static List sortAndParseLines(List data) { - List parsedLines = new ArrayList<>(data.size()); - - for (String line : data) { - parsedLines.add(line.split(schema.separator)); + private static void processAutoMode( + File inputFile, String baseName, String outputDir, String format) { + try (SourceReader reader = createAutoReader(inputFile, format)) { + ImportSchema autoSchema = reader.inferSchema(); + ImportExecutor importExecutor = new ImportExecutor(autoSchema); + boolean success = importExecutor.execute(reader, outputDir, baseName, failedDirectoryStr); + if (success) { + LOGGER.info(baseName + ".tsfile successfully generated"); + } else { + cpFile(inputFile.getAbsolutePath(), failedDirectoryStr); + } + } catch (Exception e) { + LOGGER.error("Failed to process file: " + inputFile.getAbsolutePath(), e); + cpFile(inputFile.getAbsolutePath(), failedDirectoryStr); } - parsedLines.sort( - (o1, o2) -> { - long time1 = - DateTimeUtils.convertTimestampOrDatetimeStrToLongWithDefaultZone( - o1[schema.timeColumnIndex], schema.timePrecision); - long time2 = - DateTimeUtils.convertTimestampOrDatetimeStrToLongWithDefaultZone( - o2[schema.timeColumnIndex], schema.timePrecision); - return Long.compare(time1, time2); - }); - return parsedLines; } - public static Object getValue(TSDataType dataType, String i, ColumnCategory columnCategory) { - switch (dataType) { - case INT64: - return Long.valueOf(i); - case INT32: - return Integer.valueOf(i); - case BOOLEAN: - return Boolean.valueOf(i); - case TEXT: - if (columnCategory.equals(ColumnCategory.FIELD)) { - return new Binary(String.valueOf(i), StandardCharsets.UTF_8); - } else { - return String.valueOf(i); - } - case FLOAT: - return Float.valueOf(i); - case DOUBLE: - return Double.valueOf(i); - default: - return i; + private static SourceReader createSchemaReader(File inputFile, String format) { + if ("parquet".equals(format)) { + return new ParquetSourceReader(inputFile, importSchema); } + if ("arrow".equals(format)) { + return new ArrowSourceReader(inputFile, importSchema); + } + return new CsvSourceReader(inputFile, importSchema, CHUNK_SIZE_BYTE); } - private static void processDirectory(File directory, ExecutorService executor) { - if (directory.isFile()) { - processFile(directory, executor); - } else { - File[] files = directory.listFiles(); - if (files != null) { - for (File file : files) { - if (file.isDirectory()) { - processDirectory(file, 
executor); - } else if (file.isFile() && file.getName().endsWith(".csv")) { - processFile(file, executor); - } - } + private static SourceReader createAutoReader(File inputFile, String format) { + if ("parquet".equals(format)) { + ParquetSourceReader reader = new ParquetSourceReader(inputFile); + if (tableNameStr != null) { + reader.setOverrideTableName(tableNameStr); } + if (timePrecisionStr != null) { + reader.setOverrideTimePrecision(timePrecisionStr); + } + return reader; } + if ("arrow".equals(format)) { + ArrowSourceReader reader = new ArrowSourceReader(inputFile); + if (tableNameStr != null) { + reader.setOverrideTableName(tableNameStr); + } + if (timePrecisionStr != null) { + reader.setOverrideTimePrecision(timePrecisionStr); + } + return reader; + } + String sep = separatorStr != null ? separatorStr : ","; + CsvSourceReader reader = new CsvSourceReader(inputFile, sep, CHUNK_SIZE_BYTE); + if (tableNameStr != null) { + reader.setOverrideTableName(tableNameStr); + } + if (timePrecisionStr != null) { + reader.setOverrideTimePrecision(timePrecisionStr); + } + return reader; } private static void cpFile(String sourceFilePath, String targetDirectoryPath) { try { String inputFileAbsolutePath = new File(inputDirectoryStr).getAbsolutePath(); - String soureFlieName = new File(sourceFilePath).getName(); - String fileOutPutDirStr = + String sourceFileName = new File(sourceFilePath).getName(); + String relativeDir = targetDirectoryPath - + sourceFilePath.replace(inputFileAbsolutePath, "").replace(soureFlieName, ""); - Files.createDirectories(Paths.get(fileOutPutDirStr)); + + sourceFilePath.replace(inputFileAbsolutePath, "").replace(sourceFileName, ""); + Files.createDirectories(Paths.get(relativeDir)); Path sourcePath = Paths.get(sourceFilePath); - Path targetPath = Paths.get(fileOutPutDirStr, sourcePath.getFileName().toString()); + Path targetPath = Paths.get(relativeDir, sourcePath.getFileName().toString()); Files.copy(sourcePath, targetPath, StandardCopyOption.REPLACE_EXISTING); } catch (IOException e) { LOGGER.error("Failed to copy file: " + sourceFilePath, e); } } - public static void writeToNewCSV( - String headerLine, String fileAbsolutePath, List data, String newFileName) { - if (schema.hasHeader && StringUtils.isNotEmpty(headerLine)) { - data.add(0, headerLine); - } - String inputFileAbsolutePath = new File(inputDirectoryStr).getAbsolutePath(); - String soureFlieName = new File(fileAbsolutePath).getName(); - String fileOutPutDirStr = - failedDirectoryStr - + fileAbsolutePath.replace(inputFileAbsolutePath, "").replace(soureFlieName, ""); - try { - Files.createDirectories(Paths.get(fileOutPutDirStr)); - } catch (IOException e) { - throw new RuntimeException(e); - } - String path = Paths.get(fileOutPutDirStr, newFileName).toFile().getAbsolutePath(); - try (BufferedWriter writer = new BufferedWriter(new FileWriter(path))) { - for (String line : data) { - writer.write(line); - writer.newLine(); - } - } catch (IOException e) { - LOGGER.error("Error writing to CSV file", e); - } - } - - private static void processFile(File inputFile, ExecutorService executor) { - AtomicInteger fileCounter = new AtomicInteger(1); - String fileName = FilenameUtils.getBaseName(inputFile.getName()); - String fileAbsolutePath = inputFile.getAbsolutePath(); - try (BufferedReader reader = - new BufferedReader( - new InputStreamReader( - Files.newInputStream(inputFile.toPath()), StandardCharsets.UTF_8))) { - String line; - long currentChunkSize = 0; - int chunkLines = 0; - int index = 0; - List lineList = new 
ArrayList<>(); - boolean isSingleFile = true; - String headerLine = null; - while ((line = reader.readLine()) != null) { - if (index == 0) { - if (schema.timeColumnIndex == -1) { - LOGGER.error(inputFile.getAbsolutePath() + " not found:" + schema.timeColumn); - cpFile(inputFile.getAbsolutePath(), failedDirectoryStr); - break; - } - String[] csvCloumns = line.split(schema.separator); - if (csvCloumns.length != schema.csvColumns.size()) { - LOGGER.error( - "The number of columns defined in the schema file is not equal to the number of columns in the csv file(" - + inputFile.getAbsolutePath() - + ")."); - cpFile(inputFile.getAbsolutePath(), failedDirectoryStr); - break; - } - } - - if (schema.hasHeader && index == 0) { - headerLine = line; - index++; - continue; - } - index++; - byte[] lineBytes = line.getBytes(StandardCharsets.UTF_8); - long lineSize = lineBytes.length; - if (currentChunkSize + lineSize > CHUNK_SIZE_BYTE) { - isSingleFile = false; - if (chunkLines > 0) { - submitChunk( - headerLine, - lineList, - fileCounter.getAndIncrement(), - executor, - fileName, - isSingleFile, - fileAbsolutePath); - lineList = new ArrayList<>(); - currentChunkSize = 0; - chunkLines = 0; - } else { - lineList.add(line); - submitChunk( - headerLine, - lineList, - fileCounter.getAndIncrement(), - executor, - fileName, - isSingleFile, - fileAbsolutePath); - lineList = new ArrayList<>(); - currentChunkSize = 0; - chunkLines = 0; - } - } - lineList.add(line); - currentChunkSize += lineSize; - chunkLines++; - } - if (lineList.size() > 0) { - submitChunk( - headerLine, - lineList, - fileCounter.getAndIncrement(), - executor, - fileName, - isSingleFile, - fileAbsolutePath); - } - - } catch (IOException e) { - LOGGER.error("Error reading file", e); - } - } - - private static void submitChunk( - String headerLine, - List lineList, - int fileNumber, - ExecutorService executor, - String fileName, - boolean isSingleFile, - String fileAbsolutePath) { - executor.submit( - () -> { - boolean isSuccess; - if (isSingleFile) { - isSuccess = writeTsFile(fileAbsolutePath, fileName + ".tsfile", lineList); - } else { - isSuccess = - writeTsFile(fileAbsolutePath, fileName + "_" + fileNumber + ".tsfile", lineList); - } - if (!isSuccess) { - if (isSingleFile) { - cpFile(fileAbsolutePath, failedDirectoryStr); - File tsfile = new File(outputDirectoryStr, fileName + ".tsfile"); - deleteFile(tsfile); - } else { - writeToNewCSV( - headerLine, fileAbsolutePath, lineList, fileName + "_" + fileNumber + ".csv"); - File tsfile = new File(outputDirectoryStr, fileName + "_" + fileNumber + ".tsfile"); - deleteFile(tsfile); - } - } else { - String tsFileName = fileName + "_" + fileNumber + ".tsfile"; - if (isSingleFile) { - tsFileName = fileName + ".tsfile"; - } - LOGGER.info(tsFileName + " successfully generated"); - } - }); - } - private static void printHelp(Options options) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp("csv2tsfile.sh/csv2tsfile.bat", options); } private static void parseCommandLineParams(String[] args) { + THREAD_COUNT = 8; + CHUNK_SIZE_BYTE = 256L * 1024 * 1024; + outputDirectoryStr = ""; + inputDirectoryStr = ""; + failedDirectoryStr = "failed"; + schemaPathStr = ""; + tableNameStr = null; + timePrecisionStr = null; + separatorStr = null; + formatStr = null; + importSchema = null; + Options options = new Options(); - options.addOption("s", "source", true, "Input directory"); + options.addOption("s", "source", true, "Input directory or file"); options.addOption("t", "target", true, "Output 
directory"); options.addOption("fd", "fail_dir", true, "Failed file directory"); - options.addOption("b", "block_size", true, "Block size default value 256M"); - options.addOption("tn", "thread_num", true, "Thread number"); - options.addOption("schema", "schema", true, "Schema file path"); + options.addOption("b", "block_size", true, "Block size (default 256M)"); + options.addOption("tn", "thread_num", true, "Thread count (default 8)"); + options.addOption("schema", "schema", true, "Schema file path (omit for auto mode)"); + options.addOption(null, "table_name", true, "Table name override (auto mode)"); + options.addOption(null, "time_precision", true, "Time precision: ms, us, ns, s (auto mode)"); + options.addOption(null, "separator", true, "CSV separator: , / tab / ; (auto mode, default ,)"); + options.addOption( + null, + "format", + true, + "Source format: csv / parquet / arrow (default: auto-detect by extension)"); options.addOption("h", "help", false, "Show help"); try { @@ -494,7 +295,6 @@ private static void parseCommandLineParams(String[] args) { printHelp(options); System.exit(0); } - if (cmd.hasOption("s")) { inputDirectoryStr = cmd.getOptionValue("s"); } @@ -513,8 +313,24 @@ private static void parseCommandLineParams(String[] args) { if (cmd.hasOption("schema")) { schemaPathStr = cmd.getOptionValue("schema"); } - - if (failedDirectoryStr == null || failedDirectoryStr.equals("")) { + if (cmd.hasOption("table_name")) { + tableNameStr = cmd.getOptionValue("table_name"); + } + if (cmd.hasOption("time_precision")) { + timePrecisionStr = cmd.getOptionValue("time_precision"); + } + if (cmd.hasOption("separator")) { + String sep = cmd.getOptionValue("separator"); + if ("tab".equalsIgnoreCase(sep)) { + separatorStr = "\t"; + } else { + separatorStr = sep; + } + } + if (cmd.hasOption("format")) { + formatStr = cmd.getOptionValue("format").toLowerCase(); + } + if (failedDirectoryStr == null || failedDirectoryStr.isEmpty()) { failedDirectoryStr = "failed"; } } catch (ParseException e) { @@ -522,27 +338,21 @@ private static void parseCommandLineParams(String[] args) { } } - private static long parseBlockSize(String blockSizeValue) { - long size; + static long parseBlockSize(String blockSizeValue) { blockSizeValue = blockSizeValue.toUpperCase(); - if (blockSizeValue.endsWith("K")) { - size = Long.parseLong(blockSizeValue.substring(0, blockSizeValue.length() - 1)) * 1024; + return Long.parseLong(blockSizeValue.substring(0, blockSizeValue.length() - 1)) * 1024; } else if (blockSizeValue.endsWith("M")) { - size = Long.parseLong(blockSizeValue.substring(0, blockSizeValue.length() - 1)) * 1024 * 1024; + return Long.parseLong(blockSizeValue.substring(0, blockSizeValue.length() - 1)) * 1024 * 1024; } else if (blockSizeValue.endsWith("G")) { - size = - Long.parseLong(blockSizeValue.substring(0, blockSizeValue.length() - 1)) - * 1024 - * 1024 - * 1024; + return Long.parseLong(blockSizeValue.substring(0, blockSizeValue.length() - 1)) + * 1024 + * 1024 + * 1024; } else if (blockSizeValue.endsWith("T") || blockSizeValue.endsWith("B")) { throw new IllegalArgumentException("block_size only supports units of K, M, G, or numbers"); - } else { - size = Long.parseLong(blockSizeValue); } - - return size; + return Long.parseLong(blockSizeValue); } private static void createDir() { @@ -560,15 +370,11 @@ private static void createDir() { private static boolean validateParams() { if (inputDirectoryStr == null || inputDirectoryStr.isEmpty()) { - LOGGER.error("Missing required parameters.--source/-s is a required"); 
+ LOGGER.error("Missing required parameters. --source/-s is required"); return false; } if (outputDirectoryStr == null || outputDirectoryStr.isEmpty()) { - LOGGER.error("Missing required parameters. --target/-t is a required"); - return false; - } - if (schemaPathStr == null || schemaPathStr.isEmpty()) { - LOGGER.error("Missing required parameters. --schema is a required"); + LOGGER.error("Missing required parameters. --target/-t is required"); return false; } File sourceDir = new File(inputDirectoryStr); @@ -576,16 +382,17 @@ private static boolean validateParams() { LOGGER.error(sourceDir + " directory or file does not exist."); return false; } - File schemaFile = new File(schemaPathStr); - if (!schemaFile.exists()) { - LOGGER.error(schemaPathStr + " schema file does not exist."); - return false; + if (schemaPathStr != null && !schemaPathStr.isEmpty()) { + File schemaFile = new File(schemaPathStr); + if (!schemaFile.exists()) { + LOGGER.error(schemaPathStr + " schema file does not exist."); + return false; + } } if (THREAD_COUNT <= 0) { LOGGER.error("Invalid thread number. Thread number must be greater than 0."); return false; } - return true; } } diff --git a/java/tools/src/main/java/org/apache/tsfile/tools/ValueConverter.java b/java/tools/src/main/java/org/apache/tsfile/tools/ValueConverter.java new file mode 100644 index 000000000..d9435eb1c --- /dev/null +++ b/java/tools/src/main/java/org/apache/tsfile/tools/ValueConverter.java @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.tsfile.tools; + +import org.apache.tsfile.enums.TSDataType; +import org.apache.tsfile.utils.Binary; + +import java.nio.charset.StandardCharsets; + +public class ValueConverter { + + public static Object convert(Object value, TSDataType targetType, boolean isMeasurement) { + if (value == null) { + return null; + } + if (value instanceof String) { + return fromString((String) value, targetType, isMeasurement); + } + return fromObject(value, targetType, isMeasurement); + } + + private static Object fromString(String value, TSDataType targetType, boolean isMeasurement) { + switch (targetType) { + case BOOLEAN: + return Boolean.valueOf(value); + case INT32: + return Integer.valueOf(value); + case INT64: + return Long.valueOf(value); + case FLOAT: + return Float.valueOf(value); + case DOUBLE: + return Double.valueOf(value); + case TEXT: + case STRING: + if (isMeasurement) { + return new Binary(value, StandardCharsets.UTF_8); + } + return value; + case BLOB: + return new Binary(value.getBytes(StandardCharsets.UTF_8)); + default: + return value; + } + } + + private static Object fromObject(Object value, TSDataType targetType, boolean isMeasurement) { + switch (targetType) { + case BOOLEAN: + if (value instanceof Boolean) { + return value; + } + return Boolean.valueOf(value.toString()); + case INT32: + if (value instanceof Integer) { + return value; + } + if (value instanceof Number) { + return ((Number) value).intValue(); + } + return Integer.valueOf(value.toString()); + case INT64: + if (value instanceof Long) { + return value; + } + if (value instanceof Number) { + return ((Number) value).longValue(); + } + return Long.valueOf(value.toString()); + case FLOAT: + if (value instanceof Float) { + return value; + } + if (value instanceof Number) { + return ((Number) value).floatValue(); + } + return Float.valueOf(value.toString()); + case DOUBLE: + if (value instanceof Double) { + return value; + } + if (value instanceof Number) { + return ((Number) value).doubleValue(); + } + return Double.valueOf(value.toString()); + case TEXT: + case STRING: + if (isMeasurement) { + if (value instanceof Binary) { + return value; + } + return new Binary(value.toString(), StandardCharsets.UTF_8); + } + if (value instanceof String) { + return value; + } + return value.toString(); + case BLOB: + if (value instanceof Binary) { + return value; + } + if (value instanceof byte[]) { + return new Binary((byte[]) value); + } + return new Binary(value.toString().getBytes(StandardCharsets.UTF_8)); + default: + return value; + } + } +} diff --git a/java/tools/src/test/java/org/apache/tsfile/tools/ArrowSourceReaderTest.java b/java/tools/src/test/java/org/apache/tsfile/tools/ArrowSourceReaderTest.java new file mode 100644 index 000000000..81cb603f3 --- /dev/null +++ b/java/tools/src/test/java/org/apache/tsfile/tools/ArrowSourceReaderTest.java @@ -0,0 +1,499 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.tsfile.tools;
+
+import org.apache.tsfile.enums.TSDataType;
+
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.memory.RootAllocator;
+import org.apache.arrow.vector.BigIntVector;
+import org.apache.arrow.vector.BitVector;
+import org.apache.arrow.vector.Float4Vector;
+import org.apache.arrow.vector.Float8Vector;
+import org.apache.arrow.vector.IntVector;
+import org.apache.arrow.vector.VarCharVector;
+import org.apache.arrow.vector.VectorSchemaRoot;
+import org.apache.arrow.vector.ipc.ArrowFileWriter;
+import org.apache.arrow.vector.types.FloatingPointPrecision;
+import org.apache.arrow.vector.types.pojo.ArrowType;
+import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.types.pojo.FieldType;
+import org.apache.arrow.vector.types.pojo.Schema;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+public class ArrowSourceReaderTest {
+
+  private final String testDir = "target" + File.separator + "arrowReaderTest";
+  private BufferAllocator allocator;
+
+  @Before
+  public void setUp() {
+    new File(testDir).mkdirs();
+    allocator = new RootAllocator();
+  }
+
+  @After
+  public void tearDown() {
+    allocator.close();
+    deleteRecursive(new File(testDir));
+  }
+
+  private void deleteRecursive(File dir) {
+    File[] files = dir.listFiles();
+    if (files != null) {
+      for (File f : files) {
+        if (f.isDirectory()) {
+          deleteRecursive(f);
+        }
+        f.delete();
+      }
+    }
+    dir.delete();
+  }
+
+  private Schema buildBasicSchema() {
+    List<Field> fields = new ArrayList<>();
+    fields.add(new Field("time", FieldType.notNullable(new ArrowType.Int(64, true)), null));
+    fields.add(
+        new Field(
+            "temperature",
+            FieldType.notNullable(new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)),
+            null));
+    fields.add(
+        new Field(
+            "humidity",
+            FieldType.notNullable(new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)),
+            null));
+    return new Schema(fields);
+  }
+
+  private File writeArrowFile(String name, Schema schema, WriteCallback callback)
+      throws IOException {
+    File file = new File(testDir, name);
+    try (VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator);
+        FileOutputStream fos = new FileOutputStream(file);
+        ArrowFileWriter writer = new ArrowFileWriter(root, null, fos.getChannel())) {
+      writer.start();
+      callback.write(root, writer);
+      writer.end();
+    }
+    return file;
+  }
+
+  interface WriteCallback {
+    void write(VectorSchemaRoot root, ArrowFileWriter writer) throws IOException;
+  }
+
+  @Test
+  public void testAutoModeInferSchema() throws Exception {
+    Schema arrowSchema = buildBasicSchema();
+    File file =
+        writeArrowFile(
+            "auto.arrow",
+            arrowSchema,
+            (root, writer) -> {
+              BigIntVector timeVec = (BigIntVector) root.getVector("time");
+              Float4Vector tempVec = (Float4Vector) root.getVector("temperature");
+              Float8Vector humVec = (Float8Vector) root.getVector("humidity");
+              timeVec.allocateNew(2);
+              tempVec.allocateNew(2);
+              humVec.allocateNew(2);
+              timeVec.set(0, 1000L);
+              tempVec.set(0, 25.5f);
+              humVec.set(0, 60.0);
+              timeVec.set(1, 2000L);
+              tempVec.set(1, 26.0f);
+              humVec.set(1, 55.0);
+              root.setRowCount(2);
+              writer.writeBatch();
+            });
+
+    try (ArrowSourceReader reader = new ArrowSourceReader(file)) {
+      ImportSchema schema = reader.inferSchema();
+      assertEquals("auto", schema.getTableName());
+      assertEquals("time", schema.getTimeColumnName());
+      assertTrue(schema.getTagColumns().isEmpty());
+
+      List<ImportSchema.SourceColumn> fields = schema.fieldColumns();
+      assertEquals(2, fields.size());
+      assertEquals("temperature", fields.get(0).getName());
+      assertEquals(TSDataType.FLOAT, fields.get(0).getDataType());
+      assertEquals("humidity", fields.get(1).getName());
+      assertEquals(TSDataType.DOUBLE, fields.get(1).getDataType());
+    }
+  }
+
+  @Test
+  public void testAutoModeReadBatch() throws Exception {
+    Schema arrowSchema = buildBasicSchema();
+    File file =
+        writeArrowFile(
+            "batch.arrow",
+            arrowSchema,
+            (root, writer) -> {
+              BigIntVector timeVec = (BigIntVector) root.getVector("time");
+              Float4Vector tempVec = (Float4Vector) root.getVector("temperature");
+              Float8Vector humVec = (Float8Vector) root.getVector("humidity");
+              timeVec.allocateNew(3);
+              tempVec.allocateNew(3);
+              humVec.allocateNew(3);
+              for (int i = 0; i < 3; i++) {
+                timeVec.set(i, 1000L + i);
+                tempVec.set(i, 20.0f + i);
+                humVec.set(i, 50.0 + i);
+              }
+              root.setRowCount(3);
+              writer.writeBatch();
+            });
+
+    try (ArrowSourceReader reader = new ArrowSourceReader(file)) {
+      reader.inferSchema();
+      SourceBatch batch = reader.readBatch();
+      assertNotNull(batch);
+      assertEquals(3, batch.getRowCount());
+      assertEquals(1000L, batch.getValue(0, 0));
+      assertEquals(20.0f, (float) batch.getValue(0, 1), 0.001f);
+      assertEquals(50.0, (double) batch.getValue(0, 2), 0.001);
+      assertNull(reader.readBatch());
+    }
+  }
+
+  @Test
+  public void testAutoModeUppercaseTIME() throws Exception {
+    List<Field> fields = new ArrayList<>();
+    fields.add(new Field("TIME", FieldType.notNullable(new ArrowType.Int(64, true)), null));
+    fields.add(
+        new Field(
+            "value",
+            FieldType.notNullable(new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)),
+            null));
+    Schema arrowSchema = new Schema(fields);
+
+    File file =
+        writeArrowFile(
+            "upper.arrow",
+            arrowSchema,
+            (root, writer) -> {
+              BigIntVector tv = (BigIntVector) root.getVector("TIME");
+              Float4Vector vv = (Float4Vector) root.getVector("value");
+              tv.allocateNew(1);
+              vv.allocateNew(1);
+              tv.set(0, 1000L);
+              vv.set(0, 1.0f);
+              root.setRowCount(1);
+              writer.writeBatch();
+            });
+
+    try (ArrowSourceReader reader = new ArrowSourceReader(file)) {
+      ImportSchema schema = reader.inferSchema();
+      assertEquals("TIME", schema.getTimeColumnName());
+    }
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void testAutoModeMixedCaseTimeFails() throws Exception {
+    List<Field> fields = new ArrayList<>();
+    fields.add(new Field("Time", FieldType.notNullable(new ArrowType.Int(64, true)), null));
+    fields.add(
+        new Field(
+            "value",
+            FieldType.notNullable(new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)),
+            null));
+
+    File file =
+        writeArrowFile(
+            "mixed.arrow",
+            new Schema(fields),
+            (root, writer) -> {
+              BigIntVector tv = (BigIntVector) root.getVector("Time");
+              Float4Vector vv = (Float4Vector) root.getVector("value");
+              tv.allocateNew(1);
+              vv.allocateNew(1);
+              tv.set(0, 1000L);
+              vv.set(0, 1.0f);
+              root.setRowCount(1);
+              writer.writeBatch();
+            });
+
+    try (ArrowSourceReader reader = new ArrowSourceReader(file)) {
+      reader.inferSchema();
+    }
+  }
+
+  @Test
+  public void testAutoModeTableNameFromFilename() throws Exception {
+    Schema arrowSchema = buildBasicSchema();
+    File file =
+        writeArrowFile(
+            "sensor_data.arrow",
+            arrowSchema,
+            (root, writer) -> {
+              BigIntVector tv = (BigIntVector) root.getVector("time");
+              Float4Vector tp = (Float4Vector) root.getVector("temperature");
+              Float8Vector hm = (Float8Vector) root.getVector("humidity");
+              tv.allocateNew(1);
+              tp.allocateNew(1);
+              hm.allocateNew(1);
+              tv.set(0, 1000L);
+              tp.set(0, 25.0f);
+              hm.set(0, 60.0);
+              root.setRowCount(1);
+              writer.writeBatch();
+            });
+
+    try (ArrowSourceReader reader = new ArrowSourceReader(file)) {
+      ImportSchema schema = reader.inferSchema();
+      assertEquals("sensor_data", schema.getTableName());
+    }
+  }
+
+  @Test
+  public void testAutoModeTableNameOverride() throws Exception {
+    Schema arrowSchema = buildBasicSchema();
+    File file =
+        writeArrowFile(
+            "data.arrow",
+            arrowSchema,
+            (root, writer) -> {
+              BigIntVector tv = (BigIntVector) root.getVector("time");
+              Float4Vector tp = (Float4Vector) root.getVector("temperature");
+              Float8Vector hm = (Float8Vector) root.getVector("humidity");
+              tv.allocateNew(1);
+              tp.allocateNew(1);
+              hm.allocateNew(1);
+              tv.set(0, 1000L);
+              tp.set(0, 25.0f);
+              hm.set(0, 60.0);
+              root.setRowCount(1);
+              writer.writeBatch();
+            });
+
+    try (ArrowSourceReader reader = new ArrowSourceReader(file)) {
+      reader.setOverrideTableName("custom");
+      ImportSchema schema = reader.inferSchema();
+      assertEquals("custom", schema.getTableName());
+    }
+  }
+
+  @Test
+  public void testSchemaMode() throws Exception {
+    Schema arrowSchema = buildBasicSchema();
+    File file =
+        writeArrowFile(
+            "schema_mode.arrow",
+            arrowSchema,
+            (root, writer) -> {
+              BigIntVector tv = (BigIntVector) root.getVector("time");
+              Float4Vector tp = (Float4Vector) root.getVector("temperature");
+              Float8Vector hm = (Float8Vector) root.getVector("humidity");
+              tv.allocateNew(2);
+              tp.allocateNew(2);
+              hm.allocateNew(2);
+              tv.set(0, 1000L);
+              tp.set(0, 25.5f);
+              hm.set(0, 60.0);
+              tv.set(1, 2000L);
+              tp.set(1, 26.0f);
+              hm.set(1, 55.0);
+              root.setRowCount(2);
+              writer.writeBatch();
+            });
+
+    ImportSchema importSchema = new ImportSchema();
+    importSchema.setTableName("test");
+    importSchema.setTimeColumnName("time");
+    importSchema.setTimePrecision("ms");
+    importSchema.setTagColumns(new ArrayList<>());
+    importSchema.setSourceColumns(
+        Arrays.asList(
+            new ImportSchema.SourceColumn("time", TSDataType.INT64),
+            new ImportSchema.SourceColumn("temperature", TSDataType.FLOAT),
+            new ImportSchema.SourceColumn("humidity", TSDataType.DOUBLE)));
+
+    try (ArrowSourceReader reader = new ArrowSourceReader(file, importSchema)) {
+      SourceBatch batch = reader.readBatch();
+      assertNotNull(batch);
+      assertEquals(2, batch.getRowCount());
+      assertEquals(1000L, batch.getValue(0, 0));
+      assertEquals(25.5f, (float) batch.getValue(0, 1), 0.001f);
+      assertNull(reader.readBatch());
+    }
+  }
+
+  @Test
+  public void testNativeNullHandling() throws Exception {
+    List<Field> fields = new ArrayList<>();
+    fields.add(new Field("time", FieldType.notNullable(new ArrowType.Int(64, true)), null));
+    fields.add(
+        new Field(
+            "value",
+            FieldType.nullable(new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)),
+            null));
+
+    File file =
+        writeArrowFile(
+            "nulls.arrow",
+            new Schema(fields),
+            (root, writer) -> {
+              BigIntVector tv = (BigIntVector) root.getVector("time");
+              Float4Vector vv = (Float4Vector) root.getVector("value");
+              tv.allocateNew(2);
+              vv.allocateNew(2);
+              tv.set(0, 1000L);
+              vv.set(0, 3.14f);
+              tv.set(1, 2000L);
+              vv.setNull(1);
+              root.setRowCount(2);
+              writer.writeBatch();
+            });
+
+    try (ArrowSourceReader reader = new ArrowSourceReader(file)) {
+      reader.inferSchema();
+      SourceBatch batch = reader.readBatch();
+      assertNotNull(batch);
+      assertEquals(2, batch.getRowCount());
+      assertEquals(3.14f, (float) batch.getValue(0, 1), 0.001f);
+      assertNull(batch.getValue(1, 1));
+    }
+  }
+
+  @Test
+  public void testTypeMapping() throws Exception {
+    List<Field> fields = new ArrayList<>();
+    fields.add(new Field("time", FieldType.notNullable(new ArrowType.Int(64, true)), null));
+    fields.add(new Field("flag", FieldType.notNullable(new ArrowType.Bool()), null));
+    fields.add(new Field("count", FieldType.notNullable(new ArrowType.Int(32, true)), null));
+    fields.add(new Field("name", FieldType.notNullable(new ArrowType.Utf8()), null));
+
+    File file =
+        writeArrowFile(
+            "types.arrow",
+            new Schema(fields),
+            (root, writer) -> {
+              BigIntVector tv = (BigIntVector) root.getVector("time");
+              BitVector bv = (BitVector) root.getVector("flag");
+              IntVector iv = (IntVector) root.getVector("count");
+              VarCharVector sv = (VarCharVector) root.getVector("name");
+              tv.allocateNew(1);
+              bv.allocateNew(1);
+              iv.allocateNew(1);
+              sv.allocateNew(1);
+              tv.set(0, 1000L);
+              bv.set(0, 1);
+              iv.set(0, 42);
+              sv.set(0, "hello".getBytes(StandardCharsets.UTF_8));
+              root.setRowCount(1);
+              writer.writeBatch();
+            });
+
+    try (ArrowSourceReader reader = new ArrowSourceReader(file)) {
+      ImportSchema schema = reader.inferSchema();
+      List<ImportSchema.SourceColumn> schemaFields = schema.fieldColumns();
+      assertEquals(3, schemaFields.size());
+      assertEquals(TSDataType.BOOLEAN, schemaFields.get(0).getDataType());
+      assertEquals(TSDataType.INT32, schemaFields.get(1).getDataType());
+      assertEquals(TSDataType.STRING, schemaFields.get(2).getDataType());
+
+      SourceBatch batch = reader.readBatch();
+      assertNotNull(batch);
+      assertEquals(true, batch.getValue(0, 1));
+      assertEquals(42, batch.getValue(0, 2));
+      assertEquals("hello", batch.getValue(0, 3));
+    }
+  }
+
+  @Test
+  public void testEmptyFile() throws Exception {
+    Schema arrowSchema = buildBasicSchema();
+    File file = writeArrowFile("empty.arrow", arrowSchema, (root, writer) -> {});
+
+    try (ArrowSourceReader reader = new ArrowSourceReader(file)) {
+      reader.inferSchema();
+      SourceBatch batch = reader.readBatch();
+      assertNull(batch);
+    }
+  }
+
+  @Test
+  public void testMultipleRecordBatches() throws Exception {
+    Schema arrowSchema = buildBasicSchema();
+    File file =
+        writeArrowFile(
+            "multi.arrow",
+            arrowSchema,
+            (root, writer) -> {
+              BigIntVector tv = (BigIntVector) root.getVector("time");
+              Float4Vector tp = (Float4Vector) root.getVector("temperature");
+              Float8Vector hm = (Float8Vector) root.getVector("humidity");
+
+              tv.allocateNew(2);
+              tp.allocateNew(2);
+              hm.allocateNew(2);
+              tv.set(0, 1000L);
+              tp.set(0, 20.0f);
+              hm.set(0, 50.0);
+              tv.set(1, 2000L);
+              tp.set(1, 21.0f);
+              hm.set(1, 51.0);
+              root.setRowCount(2);
+              writer.writeBatch();
+
+              tv.allocateNew(3);
+              tp.allocateNew(3);
+              hm.allocateNew(3);
+              for (int i = 0; i < 3; i++) {
+                tv.set(i, 3000L + i);
+                tp.set(i, 30.0f + i);
+                hm.set(i, 60.0 + i);
+              }
+              root.setRowCount(3);
+              writer.writeBatch();
+            });
+
+    try (ArrowSourceReader reader = new ArrowSourceReader(file)) {
+      reader.inferSchema();
+
+      SourceBatch batch1 = reader.readBatch();
+      assertNotNull(batch1);
+      assertEquals(2, batch1.getRowCount());
+
+      SourceBatch batch2 = reader.readBatch();
+      assertNotNull(batch2);
+      assertEquals(3, batch2.getRowCount());
+
+      assertNull(reader.readBatch());
+    }
+  }
+}
diff --git a/java/tools/src/test/java/org/apache/tsfile/tools/AutoSchemaInfererTest.java b/java/tools/src/test/java/org/apache/tsfile/tools/AutoSchemaInfererTest.java
new file mode 100644
index 000000000..7e9ab118c
--- /dev/null
+++ b/java/tools/src/test/java/org/apache/tsfile/tools/AutoSchemaInfererTest.java
@@ -0,0 +1,342 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.tsfile.tools;
+
+import org.apache.tsfile.enums.TSDataType;
+
+import org.junit.Test;
+
+import java.util.Arrays;
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+public class AutoSchemaInfererTest {
+
+  // ===== Time column detection =====
+
+  @Test
+  public void testDetectTimeColumnLowercase() {
+    assertEquals("time", AutoSchemaInferer.detectTimeColumn(Arrays.asList("time", "value")));
+  }
+
+  @Test
+  public void testDetectTimeColumnUppercase() {
+    assertEquals("TIME", AutoSchemaInferer.detectTimeColumn(Arrays.asList("TIME", "value")));
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void testDetectTimeColumnMixedCaseFails() {
+    AutoSchemaInferer.detectTimeColumn(Arrays.asList("Time", "value"));
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void testDetectTimeColumnRandomCaseFails() {
+    AutoSchemaInferer.detectTimeColumn(Arrays.asList("tIME", "value"));
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void testDetectTimeColumnBothTimeAndTIMEFails() {
+    AutoSchemaInferer.detectTimeColumn(Arrays.asList("time", "TIME", "value"));
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void testDetectTimeColumnDuplicateTimeFails() {
+    AutoSchemaInferer.detectTimeColumn(Arrays.asList("time", "time", "value"));
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void testDetectTimeColumnNoMatchFails() {
+    AutoSchemaInferer.detectTimeColumn(Arrays.asList("ts", "value"));
+  }
+
+  // ===== Type inference =====
+
+  @Test
+  public void testInferAllIntegers() {
+    List<String> cols = Arrays.asList("time", "count");
+    List<Object[]> rows =
+        Arrays.asList(
+            new Object[] {"1000", "42"}, new Object[] {"2000", "99"}, new Object[] {"3000", "-7"});
+
+    TSDataType[] types =
+        AutoSchemaInferer.inferColumnTypes(
+            cols, rows, "time", AutoSchemaInferer.DEFAULT_CSV_NULL_TOKENS);
+    assertEquals(TSDataType.INT64, types[0]);
+    assertEquals(TSDataType.INT64, types[1]);
+  }
+
+  @Test
+  public void testInferIntegerThenDecimalPromotesToDouble() {
+    List<String> cols = Arrays.asList("time", "value");
+    List<Object[]> rows =
+        Arrays.asList(new Object[] {"1000", "42"}, new Object[] {"2000", "3.14"});
+
+    TSDataType[] types =
+        AutoSchemaInferer.inferColumnTypes(
+            cols, rows, "time", AutoSchemaInferer.DEFAULT_CSV_NULL_TOKENS);
+    assertEquals(TSDataType.DOUBLE, types[1]);
+  }
+
+  @Test
+  public void testInferNumericThenNonNumericPromotesToString() {
+    List<String> cols = Arrays.asList("time", "mixed");
+    List<Object[]> rows =
+        Arrays.asList(new Object[] {"1000", "42"}, new Object[] {"2000", "hello"});
+
+    TSDataType[] types =
+        AutoSchemaInferer.inferColumnTypes(
+            cols, rows, "time", AutoSchemaInferer.DEFAULT_CSV_NULL_TOKENS);
+    assertEquals(TSDataType.STRING, types[1]);
+  }
+
+  @Test
+  public void testInferBooleanThenNonBooleanPromotesToString() {
+    List<String> cols = Arrays.asList("time", "flag");
+    List<Object[]> rows =
+        Arrays.asList(new Object[] {"1000", "true"}, new Object[] {"2000", "42"});
+
+    TSDataType[] types =
+        AutoSchemaInferer.inferColumnTypes(
+            cols, rows, "time", AutoSchemaInferer.DEFAULT_CSV_NULL_TOKENS);
+    assertEquals(TSDataType.STRING, types[1]);
+  }
+
+  @Test
+  public void testInferAllBoolean() {
+    List<String> cols = Arrays.asList("time", "flag");
+    List<Object[]> rows =
+        Arrays.asList(
+            new Object[] {"1000", "true"},
+            new Object[] {"2000", "false"},
+            new Object[] {"3000", "True"},
+            new Object[] {"4000", "FALSE"});
+
+    TSDataType[] types =
+        AutoSchemaInferer.inferColumnTypes(
+            cols, rows, "time", AutoSchemaInferer.DEFAULT_CSV_NULL_TOKENS);
+    assertEquals(TSDataType.BOOLEAN, types[1]);
+  }
+
+  @Test
+  public void testInferEmptyStringSkipped() {
+    List<String> cols = Arrays.asList("time", "value");
+    List<Object[]> rows = Arrays.asList(new Object[] {"1000", ""}, new Object[] {"2000", "42"});
+
+    TSDataType[] types =
+        AutoSchemaInferer.inferColumnTypes(
+            cols, rows, "time", AutoSchemaInferer.DEFAULT_CSV_NULL_TOKENS);
+    assertEquals(TSDataType.INT64, types[1]);
+  }
+
+  @Test
+  public void testInferBackslashNSkipped() {
+    List<String> cols = Arrays.asList("time", "value");
+    List<Object[]> rows =
+        Arrays.asList(new Object[] {"1000", "\\N"}, new Object[] {"2000", "3.14"});
+
+    TSDataType[] types =
+        AutoSchemaInferer.inferColumnTypes(
+            cols, rows, "time", AutoSchemaInferer.DEFAULT_CSV_NULL_TOKENS);
+    assertEquals(TSDataType.DOUBLE, types[1]);
+  }
+
+  @Test
+  public void testInferAllUnknownDefaultsToString() {
+    List<String> cols = Arrays.asList("time", "empty_col");
+    List<Object[]> rows = Arrays.asList(new Object[] {"1000", ""}, new Object[] {"2000", null});
+
+    TSDataType[] types =
+        AutoSchemaInferer.inferColumnTypes(
+            cols, rows, "time", AutoSchemaInferer.DEFAULT_CSV_NULL_TOKENS);
+    assertEquals(TSDataType.STRING, types[1]);
+  }
+
+  // ===== Null token recognition =====
+
+  @Test
+  public void testEmptyCellIsNullToken() {
+    assertTrue(AutoSchemaInferer.DEFAULT_CSV_NULL_TOKENS.contains(""));
+  }
+
+  @Test
+  public void testBackslashNIsNullToken() {
+    assertTrue(AutoSchemaInferer.DEFAULT_CSV_NULL_TOKENS.contains("\\N"));
+  }
+
+  @Test
+  public void testUppercaseNULLNotNullToken() {
+    assertTrue(!AutoSchemaInferer.DEFAULT_CSV_NULL_TOKENS.contains("NULL"));
+  }
+
+  @Test
+  public void testLowercaseNullNotNullToken() {
+    assertTrue(!AutoSchemaInferer.DEFAULT_CSV_NULL_TOKENS.contains("null"));
+  }
+
+  @Test
+  public void testNaNNotNullToken() {
+    assertTrue(!AutoSchemaInferer.DEFAULT_CSV_NULL_TOKENS.contains("NaN"));
+  }
+
+  // ===== Default table name =====
+
+  @Test
+  public void testDeriveTableNameFromCsvFile() {
+    assertEquals("sensor_data", AutoSchemaInferer.deriveTableName("sensor_data.csv", "csv_data"));
+  }
+
+  @Test
+  public void testDeriveTableNameFromParquetFile() {
+    assertEquals("motor", AutoSchemaInferer.deriveTableName("motor.parquet", "parquet_data"));
+  }
+
+  @Test
+  public void testDeriveTableNameSpecialChars() {
+    assertEquals("my_data_2025", AutoSchemaInferer.deriveTableName("my-data@2025.csv", "csv_data"));
+  }
+
+  @Test
+  public void testDeriveTableNameEmptyAfterClean() {
+    assertEquals("csv_data", AutoSchemaInferer.deriveTableName("@#$.csv", "csv_data"));
+  }
+
+  @Test
+  public void testDeriveTableNameDigitPrefix() {
+    assertEquals("t_123abc", AutoSchemaInferer.deriveTableName("123abc.csv", "csv_data"));
+  }
+
+  @Test
+  public void testDeriveTableNameDigitOnly() {
+    assertEquals("t_123", AutoSchemaInferer.deriveTableName("123.csv", "csv_data"));
+  }
+
+  @Test
+  public void testDeriveTableNameNoExtension() {
+    assertEquals("datafile", AutoSchemaInferer.deriveTableName("datafile", "csv_data"));
+  }
+
+  @Test
+  public void testDeriveTableNameUnknownExtension() {
+    assertEquals("data", AutoSchemaInferer.deriveTableName("data.txt", "csv_data"));
+  }
+
+  // ===== buildAutoSchema =====
+
+  @Test
+  public void testBuildAutoSchemaBasic() {
+    List<String> cols = Arrays.asList("time", "temp", "humidity");
+    TSDataType[] types = {TSDataType.INT64, TSDataType.DOUBLE, TSDataType.DOUBLE};
+
+    ImportSchema schema = AutoSchemaInferer.buildAutoSchema("test", "time", cols, types, "ms");
+
+    assertEquals("test", schema.getTableName());
+    assertEquals("time", schema.getTimeColumnName());
+    assertEquals("ms", schema.getTimePrecision());
+    assertTrue(schema.getTagColumns().isEmpty());
+    assertEquals(3, schema.getSourceColumns().size());
+
+    List<ImportSchema.SourceColumn> fields = schema.fieldColumns();
+    assertEquals(2, fields.size());
+    assertEquals("temp", fields.get(0).getName());
+    assertEquals(TSDataType.DOUBLE, fields.get(0).getDataType());
+    assertEquals("humidity", fields.get(1).getName());
+  }
+
+  @Test
+  public void testBuildAutoSchemaDefaultPrecision() {
+    List<String> cols = Arrays.asList("time", "val");
+    TSDataType[] types = {TSDataType.INT64, TSDataType.STRING};
+
+    ImportSchema schema = AutoSchemaInferer.buildAutoSchema("t", "time", cols, types, null);
+    assertEquals("ms", schema.getTimePrecision());
+  }
+
+  // ===== Cell classification =====
+
+  @Test
+  public void testClassifyCellBoolean() {
+    assertEquals(AutoSchemaInferer.InferredType.BOOLEAN, AutoSchemaInferer.classifyCell("true"));
+    assertEquals(AutoSchemaInferer.InferredType.BOOLEAN, AutoSchemaInferer.classifyCell("false"));
+    assertEquals(AutoSchemaInferer.InferredType.BOOLEAN, AutoSchemaInferer.classifyCell("True"));
+    assertEquals(AutoSchemaInferer.InferredType.BOOLEAN, AutoSchemaInferer.classifyCell("FALSE"));
+  }
+
+  @Test
+  public void testClassifyCellInteger() {
+    assertEquals(AutoSchemaInferer.InferredType.INT64, AutoSchemaInferer.classifyCell("42"));
+    assertEquals(AutoSchemaInferer.InferredType.INT64, AutoSchemaInferer.classifyCell("-7"));
+    assertEquals(AutoSchemaInferer.InferredType.INT64, AutoSchemaInferer.classifyCell("0"));
+  }
+
+  @Test
+  public void testClassifyCellDouble() {
+    assertEquals(AutoSchemaInferer.InferredType.DOUBLE, AutoSchemaInferer.classifyCell("3.14"));
+    assertEquals(AutoSchemaInferer.InferredType.DOUBLE, AutoSchemaInferer.classifyCell("-1.5"));
+    assertEquals(AutoSchemaInferer.InferredType.DOUBLE, AutoSchemaInferer.classifyCell(".5"));
+  }
+
+  @Test
+  public void testClassifyCellString() {
+    assertEquals(AutoSchemaInferer.InferredType.STRING, AutoSchemaInferer.classifyCell("hello"));
+    assertEquals(AutoSchemaInferer.InferredType.STRING,
AutoSchemaInferer.classifyCell("abc123")); + } + + // ===== Type promotion ===== + + @Test + public void testPromoteUnknownTakesIncoming() { + assertEquals( + AutoSchemaInferer.InferredType.INT64, + AutoSchemaInferer.promote( + AutoSchemaInferer.InferredType.UNKNOWN, AutoSchemaInferer.InferredType.INT64)); + } + + @Test + public void testPromoteSameTypeStays() { + assertEquals( + AutoSchemaInferer.InferredType.INT64, + AutoSchemaInferer.promote( + AutoSchemaInferer.InferredType.INT64, AutoSchemaInferer.InferredType.INT64)); + } + + @Test + public void testPromoteInt64DoubleBecomesDouble() { + assertEquals( + AutoSchemaInferer.InferredType.DOUBLE, + AutoSchemaInferer.promote( + AutoSchemaInferer.InferredType.INT64, AutoSchemaInferer.InferredType.DOUBLE)); + } + + @Test + public void testPromoteBooleanInt64BecomesString() { + assertEquals( + AutoSchemaInferer.InferredType.STRING, + AutoSchemaInferer.promote( + AutoSchemaInferer.InferredType.BOOLEAN, AutoSchemaInferer.InferredType.INT64)); + } + + @Test + public void testPromoteDoubleStringBecomesString() { + assertEquals( + AutoSchemaInferer.InferredType.STRING, + AutoSchemaInferer.promote( + AutoSchemaInferer.InferredType.DOUBLE, AutoSchemaInferer.InferredType.STRING)); + } +} diff --git a/java/tools/src/test/java/org/apache/tsfile/tools/CsvSourceReaderTest.java b/java/tools/src/test/java/org/apache/tsfile/tools/CsvSourceReaderTest.java new file mode 100644 index 000000000..ea89e6aef --- /dev/null +++ b/java/tools/src/test/java/org/apache/tsfile/tools/CsvSourceReaderTest.java @@ -0,0 +1,435 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+
+package org.apache.tsfile.tools;
+
+import org.apache.tsfile.enums.TSDataType;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+public class CsvSourceReaderTest {
+
+  private final String testDir = "target" + File.separator + "csvReaderTest";
+
+  @Before
+  public void setUp() {
+    new File(testDir).mkdirs();
+  }
+
+  @After
+  public void tearDown() {
+    File dir = new File(testDir);
+    File[] files = dir.listFiles();
+    if (files != null) {
+      for (File f : files) {
+        f.delete();
+      }
+    }
+    dir.delete();
+  }
+
+  private File writeCsv(String name, String content) throws IOException {
+    File file = new File(testDir, name);
+    try (BufferedWriter w = new BufferedWriter(new FileWriter(file))) {
+      w.write(content);
+    }
+    return file;
+  }
+
+  private ImportSchema buildSchema(String timeCol, List<String[]> tagCols, String[]... srcColDefs) {
+    ImportSchema schema = new ImportSchema();
+    schema.setTableName("test");
+    schema.setTimeColumnName(timeCol);
+
+    List<ImportSchema.TagColumn> tags = new ArrayList<>();
+    if (tagCols != null) {
+      for (String[] t : tagCols) {
+        tags.add(new ImportSchema.TagColumn(t[0]));
+      }
+    }
+    schema.setTagColumns(tags);
+
+    List<ImportSchema.SourceColumn> sources = new ArrayList<>();
+    for (String[] sc : srcColDefs) {
+      if ("SKIP".equals(sc[1])) {
+        sources.add(ImportSchema.SourceColumn.skip());
+      } else {
+        sources.add(new ImportSchema.SourceColumn(sc[0], TSDataType.valueOf(sc[1])));
+      }
+    }
+    schema.setSourceColumns(sources);
+
+    return schema;
+  }
+
+  @Test
+  public void testReadWithHeader() throws Exception {
+    File csv =
+        writeCsv(
+            "data.csv",
+            "time,device,value\n" + "1000,dev1,3.14\n" + "2000,dev1,2.71\n" + "3000,dev2,1.0\n");
+
+    ImportSchema schema =
+        buildSchema(
+            "time",
+            null,
+            new String[] {"time", "INT64"},
+            new String[] {"device", "TEXT"},
+            new String[] {"value", "FLOAT"});
+
+    try (CsvSourceReader reader = new CsvSourceReader(csv, schema)) {
+      SourceBatch batch = reader.readBatch();
+      assertNotNull(batch);
+      assertEquals(3, batch.getRowCount());
+      assertEquals(3, batch.getColumnCount());
+      assertEquals("time", batch.getColumnName(0));
+      assertEquals("device", batch.getColumnName(1));
+      assertEquals("value", batch.getColumnName(2));
+      assertEquals("1000", batch.getValue(0, 0));
+      assertEquals("dev1", batch.getValue(0, 1));
+      assertEquals("3.14", batch.getValue(0, 2));
+
+      assertNull(reader.readBatch());
+    }
+  }
+
+  @Test
+  public void testReadWithoutHeader() throws Exception {
+    File csv = writeCsv("noheader.csv", "1000,dev1,3.14\n" + "2000,dev1,2.71\n");
+
+    ImportSchema schema =
+        buildSchema(
+            "time",
+            null,
+            new String[] {"time", "INT64"},
+            new String[] {"device", "TEXT"},
+            new String[] {"value", "FLOAT"});
+    schema.setHasHeader(false);
+
+    try (CsvSourceReader reader = new CsvSourceReader(csv, schema)) {
+      SourceBatch batch = reader.readBatch();
+      assertNotNull(batch);
+      assertEquals(2, batch.getRowCount());
+      assertEquals("1000", batch.getValue(0, 0));
+    }
+  }
+
+  @Test
+  public void testNullFormatHandling() throws Exception {
+    File csv = writeCsv("nulls.csv", "time,value\n" + "1000,3.14\n" + "2000,\\N\n" + "3000,\n");
+
+    ImportSchema schema =
+        buildSchema("time", null, new String[] {"time", "INT64"}, new String[]
{"value", "FLOAT"}); + schema.setNullFormat("\\N"); + + try (CsvSourceReader reader = new CsvSourceReader(csv, schema)) { + SourceBatch batch = reader.readBatch(); + assertNotNull(batch); + assertEquals(3, batch.getRowCount()); + assertEquals("3.14", batch.getValue(0, 1)); + assertNull(batch.getValue(1, 1)); + assertNull(batch.getValue(2, 1)); + } + } + + @Test + public void testChunking() throws Exception { + StringBuilder sb = new StringBuilder(); + sb.append("time,value\n"); + for (int i = 0; i < 100; i++) { + sb.append(i).append(",").append(i * 1.0).append("\n"); + } + File csv = writeCsv("large.csv", sb.toString()); + + ImportSchema schema = + buildSchema("time", null, new String[] {"time", "INT64"}, new String[] {"value", "FLOAT"}); + + // Very small chunk size to force multiple batches + try (CsvSourceReader reader = new CsvSourceReader(csv, schema, 100)) { + int totalRows = 0; + int batchCount = 0; + SourceBatch batch; + while ((batch = reader.readBatch()) != null) { + totalRows += batch.getRowCount(); + batchCount++; + } + assertEquals(100, totalRows); + assertTrue("Should have multiple batches", batchCount > 1); + } + } + + @Test(expected = IllegalArgumentException.class) + public void testColumnCountMismatch() throws Exception { + File csv = writeCsv("mismatch.csv", "time,value,extra\n" + "1000,3.14,x\n"); + + ImportSchema schema = + buildSchema("time", null, new String[] {"time", "INT64"}, new String[] {"value", "FLOAT"}); + + try (CsvSourceReader reader = new CsvSourceReader(csv, schema)) { + reader.readBatch(); + } + } + + @Test + public void testEmptyFile() throws Exception { + File csv = writeCsv("empty.csv", "time,value\n"); + + ImportSchema schema = + buildSchema("time", null, new String[] {"time", "INT64"}, new String[] {"value", "FLOAT"}); + + try (CsvSourceReader reader = new CsvSourceReader(csv, schema)) { + SourceBatch batch = reader.readBatch(); + assertNull(batch); + } + } + + @Test + public void testSkipColumn() throws Exception { + File csv = writeCsv("skip.csv", "time,unused,value\n" + "1000,x,3.14\n" + "2000,y,2.71\n"); + + ImportSchema schema = + buildSchema( + "time", + null, + new String[] {"time", "INT64"}, + new String[] {"SKIP", "SKIP"}, + new String[] {"value", "FLOAT"}); + + try (CsvSourceReader reader = new CsvSourceReader(csv, schema)) { + SourceBatch batch = reader.readBatch(); + assertNotNull(batch); + assertEquals(2, batch.getRowCount()); + assertEquals(3, batch.getColumnCount()); + assertEquals("x", batch.getValue(0, 1)); + } + } + + @Test + public void testSemicolonSeparator() throws Exception { + File csv = writeCsv("semi.csv", "time;value\n" + "1000;3.14\n" + "2000;2.71\n"); + + ImportSchema schema = + buildSchema("time", null, new String[] {"time", "INT64"}, new String[] {"value", "FLOAT"}); + schema.setSeparator(";"); + + try (CsvSourceReader reader = new CsvSourceReader(csv, schema)) { + SourceBatch batch = reader.readBatch(); + assertNotNull(batch); + assertEquals(2, batch.getRowCount()); + assertEquals("1000", batch.getValue(0, 0)); + assertEquals("3.14", batch.getValue(0, 1)); + } + } + + @Test + public void testTabSeparator() throws Exception { + File csv = writeCsv("tab.csv", "time\tvalue\n" + "1000\t3.14\n" + "2000\t2.71\n"); + + ImportSchema schema = + buildSchema("time", null, new String[] {"time", "INT64"}, new String[] {"value", "FLOAT"}); + schema.setSeparator("\t"); + + try (CsvSourceReader reader = new CsvSourceReader(csv, schema)) { + SourceBatch batch = reader.readBatch(); + assertNotNull(batch); + assertEquals(2, 
batch.getRowCount()); + assertEquals("1000", batch.getValue(0, 0)); + assertEquals("3.14", batch.getValue(0, 1)); + } + } + + // ===== Auto mode tests ===== + + @Test + public void testAutoModeInferSchema() throws Exception { + File csv = + writeCsv("auto.csv", "time,temp,status\n" + "1000,25.5,true\n" + "2000,30.1,false\n"); + + try (CsvSourceReader reader = new CsvSourceReader(csv, ",")) { + ImportSchema schema = reader.inferSchema(); + assertEquals("auto", schema.getTableName()); + assertEquals("time", schema.getTimeColumnName()); + assertTrue(schema.getTagColumns().isEmpty()); + + List fields = schema.fieldColumns(); + assertEquals(2, fields.size()); + assertEquals("temp", fields.get(0).getName()); + assertEquals(TSDataType.DOUBLE, fields.get(0).getDataType()); + assertEquals("status", fields.get(1).getName()); + assertEquals(TSDataType.BOOLEAN, fields.get(1).getDataType()); + } + } + + @Test + public void testAutoModeReadBatchIncludesSampleRows() throws Exception { + File csv = writeCsv("autoread.csv", "time,value\n" + "1000,10\n" + "2000,20\n" + "3000,30\n"); + + try (CsvSourceReader reader = new CsvSourceReader(csv, ",")) { + reader.inferSchema(); + SourceBatch batch = reader.readBatch(); + assertNotNull(batch); + assertEquals(3, batch.getRowCount()); + assertNull(reader.readBatch()); + } + } + + @Test + public void testAutoModeTableNameFromFilename() throws Exception { + File csv = writeCsv("sensor_data.csv", "time,val\n1000,1\n"); + + try (CsvSourceReader reader = new CsvSourceReader(csv, ",")) { + ImportSchema schema = reader.inferSchema(); + assertEquals("sensor_data", schema.getTableName()); + } + } + + @Test + public void testAutoModeTableNameOverride() throws Exception { + File csv = writeCsv("data.csv", "time,val\n1000,1\n"); + + try (CsvSourceReader reader = new CsvSourceReader(csv, ",")) { + reader.setOverrideTableName("my_table"); + ImportSchema schema = reader.inferSchema(); + assertEquals("my_table", schema.getTableName()); + } + } + + @Test + public void testAutoModeTimePrecisionDefault() throws Exception { + File csv = writeCsv("prec.csv", "time,val\n1000,1\n"); + + try (CsvSourceReader reader = new CsvSourceReader(csv, ",")) { + ImportSchema schema = reader.inferSchema(); + assertEquals("ms", schema.getTimePrecision()); + } + } + + @Test + public void testAutoModeTimePrecisionOverride() throws Exception { + File csv = writeCsv("precov.csv", "time,val\n1000,1\n"); + + try (CsvSourceReader reader = new CsvSourceReader(csv, ",")) { + reader.setOverrideTimePrecision("us"); + ImportSchema schema = reader.inferSchema(); + assertEquals("us", schema.getTimePrecision()); + } + } + + @Test + public void testAutoModeNullFormatSet() throws Exception { + File csv = writeCsv("autonull.csv", "time,val\n1000,\\N\n2000,3.14\n"); + + try (CsvSourceReader reader = new CsvSourceReader(csv, ",")) { + ImportSchema schema = reader.inferSchema(); + assertEquals("\\N", schema.getNullFormat()); + + SourceBatch batch = reader.readBatch(); + assertNotNull(batch); + assertEquals(2, batch.getRowCount()); + assertNull(batch.getValue(0, 1)); + assertEquals("3.14", batch.getValue(1, 1)); + } + } + + @Test(expected = IllegalArgumentException.class) + public void testAutoModeNoTimeColumnFails() throws Exception { + File csv = writeCsv("notime.csv", "ts,val\n1000,1\n"); + + try (CsvSourceReader reader = new CsvSourceReader(csv, ",")) { + reader.inferSchema(); + } + } + + @Test(expected = IllegalArgumentException.class) + public void testAutoModeEmptyFileFails() throws Exception { + File csv = 
writeCsv("emptyauto.csv", ""); + + try (CsvSourceReader reader = new CsvSourceReader(csv, ",")) { + reader.inferSchema(); + } + } + + @Test + public void testAutoModeSemicolonSeparator() throws Exception { + File csv = writeCsv("semi_auto.csv", "time;value\n1000;3.14\n2000;2.71\n"); + + try (CsvSourceReader reader = new CsvSourceReader(csv, ";")) { + ImportSchema schema = reader.inferSchema(); + assertEquals("time", schema.getTimeColumnName()); + + SourceBatch batch = reader.readBatch(); + assertNotNull(batch); + assertEquals(2, batch.getRowCount()); + } + } + + @Test + public void testAutoModeTabSeparator() throws Exception { + File csv = writeCsv("tab_auto.csv", "time\tvalue\n1000\t3.14\n2000\t2.71\n"); + + try (CsvSourceReader reader = new CsvSourceReader(csv, "\t")) { + ImportSchema schema = reader.inferSchema(); + assertEquals("time", schema.getTimeColumnName()); + + SourceBatch batch = reader.readBatch(); + assertNotNull(batch); + assertEquals(2, batch.getRowCount()); + } + } + + @Test + public void testMultipleBatchesReturnAllData() throws Exception { + StringBuilder sb = new StringBuilder(); + sb.append("time,value\n"); + for (int i = 0; i < 50; i++) { + sb.append((1000 + i)).append(",").append(i * 0.1).append("\n"); + } + File csv = writeCsv("multi.csv", sb.toString()); + + ImportSchema schema = + buildSchema("time", null, new String[] {"time", "INT64"}, new String[] {"value", "FLOAT"}); + + try (CsvSourceReader reader = new CsvSourceReader(csv, schema, 200)) { + List batches = new ArrayList<>(); + SourceBatch batch; + while ((batch = reader.readBatch()) != null) { + batches.add(batch); + } + int totalRows = 0; + for (SourceBatch b : batches) { + totalRows += b.getRowCount(); + } + assertEquals(50, totalRows); + } + } +} diff --git a/java/tools/src/test/java/org/apache/tsfile/tools/ImportExecutorTest.java b/java/tools/src/test/java/org/apache/tsfile/tools/ImportExecutorTest.java new file mode 100644 index 000000000..4239f72cc --- /dev/null +++ b/java/tools/src/test/java/org/apache/tsfile/tools/ImportExecutorTest.java @@ -0,0 +1,197 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.tsfile.tools; + +import org.apache.tsfile.enums.TSDataType; + +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +public class ImportExecutorTest { + + private final String testDir = "target" + File.separator + "executorTest"; + + @Before + public void setUp() { + new File(testDir).mkdirs(); + } + + @After + public void tearDown() { + deleteRecursive(new File(testDir)); + } + + private void deleteRecursive(File dir) { + File[] files = dir.listFiles(); + if (files != null) { + for (File f : files) { + if (f.isDirectory()) { + deleteRecursive(f); + } + f.delete(); + } + } + dir.delete(); + } + + private ImportSchema buildSchema() { + ImportSchema schema = new ImportSchema(); + schema.setTableName("test"); + schema.setTimeColumnName("time"); + schema.setTimePrecision("ms"); + schema.setTagColumns(new ArrayList()); + schema.setSourceColumns( + Arrays.asList( + new ImportSchema.SourceColumn("time", TSDataType.INT64), + new ImportSchema.SourceColumn("value", TSDataType.FLOAT))); + return schema; + } + + private SourceBatch makeBatch(Object[]... rows) { + return SourceBatch.fromRows(Arrays.asList("time", "value"), Arrays.asList(rows)); + } + + @Test + public void testSingleBatchOutputFileName() { + ImportSchema schema = buildSchema(); + ImportExecutor executor = new ImportExecutor(schema); + String outputDir = testDir + File.separator + "single"; + + SourceBatch batch = makeBatch(new Object[] {"1000", "3.14"}, new Object[] {"2000", "2.71"}); + TestSourceReader reader = new TestSourceReader(batch); + + boolean ok = executor.execute(reader, outputDir, "sensor"); + assertTrue(ok); + assertTrue(new File(outputDir, "sensor.tsfile").exists()); + assertFalse(new File(outputDir, "sensor_1.tsfile").exists()); + } + + @Test + public void testMultiBatchOutputFileNames() { + ImportSchema schema = buildSchema(); + ImportExecutor executor = new ImportExecutor(schema); + String outputDir = testDir + File.separator + "multi"; + + SourceBatch batch1 = makeBatch(new Object[] {"1000", "1.0"}, new Object[] {"2000", "2.0"}); + SourceBatch batch2 = makeBatch(new Object[] {"3000", "3.0"}, new Object[] {"4000", "4.0"}); + TestSourceReader reader = new TestSourceReader(batch1, batch2); + + boolean ok = executor.execute(reader, outputDir, "sensor"); + assertTrue(ok); + assertTrue(new File(outputDir, "sensor_1.tsfile").exists()); + assertTrue(new File(outputDir, "sensor_2.tsfile").exists()); + assertFalse(new File(outputDir, "sensor.tsfile").exists()); + } + + @Test + public void testNullBatchEndsNormally() { + ImportSchema schema = buildSchema(); + ImportExecutor executor = new ImportExecutor(schema); + String outputDir = testDir + File.separator + "nullbatch"; + + TestSourceReader reader = new TestSourceReader(); + boolean ok = executor.execute(reader, outputDir, "empty"); + assertTrue(ok); + } + + @Test + public void testEmptyBatchSkipped() { + ImportSchema schema = buildSchema(); + ImportExecutor executor = new ImportExecutor(schema); + String outputDir = testDir + File.separator + "emptybatch"; + + SourceBatch emptyBatch = + SourceBatch.fromRows(Arrays.asList("time", "value"), new ArrayList()); + SourceBatch realBatch = makeBatch(new Object[] {"1000", "1.0"}); + TestSourceReader reader = new TestSourceReader(emptyBatch, realBatch); + + boolean ok = executor.execute(reader, 
outputDir, "data"); + assertTrue(ok); + assertTrue(new File(outputDir, "data.tsfile").exists()); + } + + @Test + public void testOutputDirAutoCreated() { + ImportSchema schema = buildSchema(); + ImportExecutor executor = new ImportExecutor(schema); + String outputDir = testDir + File.separator + "auto" + File.separator + "created"; + + assertFalse(new File(outputDir).exists()); + + SourceBatch batch = makeBatch(new Object[] {"1000", "1.0"}); + TestSourceReader reader = new TestSourceReader(batch); + + boolean ok = executor.execute(reader, outputDir, "out"); + assertTrue(ok); + assertTrue(new File(outputDir).exists()); + assertTrue(new File(outputDir, "out.tsfile").exists()); + } + + @Test + public void testExecutorWritesCorrectData() { + ImportSchema schema = buildSchema(); + ImportExecutor executor = new ImportExecutor(schema); + String outputDir = testDir + File.separator + "verify"; + + SourceBatch batch = makeBatch(new Object[] {"1000", "3.14"}, new Object[] {"2000", "2.71"}); + TestSourceReader reader = new TestSourceReader(batch); + + boolean ok = executor.execute(reader, outputDir, "check"); + assertTrue(ok); + + File tsfile = new File(outputDir, "check.tsfile"); + assertTrue(tsfile.exists()); + assertTrue(tsfile.length() > 0); + } + + private static class TestSourceReader implements SourceReader { + private final List batches; + private int index = 0; + + TestSourceReader(SourceBatch... batches) { + this.batches = Arrays.asList(batches); + } + + @Override + public ImportSchema inferSchema() { + throw new UnsupportedOperationException(); + } + + @Override + public SourceBatch readBatch() { + if (index >= batches.size()) { + return null; + } + return batches.get(index++); + } + + @Override + public void close() {} + } +} diff --git a/java/tools/src/test/java/org/apache/tsfile/tools/ImportSchemaParserTest.java b/java/tools/src/test/java/org/apache/tsfile/tools/ImportSchemaParserTest.java new file mode 100644 index 000000000..1efa66c2d --- /dev/null +++ b/java/tools/src/test/java/org/apache/tsfile/tools/ImportSchemaParserTest.java @@ -0,0 +1,514 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.tsfile.tools; + +import org.apache.tsfile.enums.TSDataType; + +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.util.List; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +public class ImportSchemaParserTest { + + private final String testDir = "target" + File.separator + "schemaParserTest"; + + @Before + public void setUp() { + new File(testDir).mkdirs(); + } + + @After + public void tearDown() { + File dir = new File(testDir); + File[] files = dir.listFiles(); + if (files != null) { + for (File f : files) { + f.delete(); + } + } + dir.delete(); + } + + private String writeSchema(String content) throws IOException { + String path = testDir + File.separator + "test.schema"; + try (BufferedWriter writer = new BufferedWriter(new FileWriter(path))) { + writer.write(content); + } + return path; + } + + @Test + public void testNewSchemaFormat() throws Exception { + String path = + writeSchema( + "table_name=root.db1\n" + + "time_precision=ms\n" + + "has_header=true\n" + + "separator=,\n" + + "null_format=\\N\n" + + "\n" + + "tag_columns\n" + + "Group DEFAULT Datang\n" + + "Region\n" + + "FactoryNumber\n" + + "\n" + + "time_column=Time\n" + + "\n" + + "source_columns\n" + + "Region TEXT,\n" + + "FactoryNumber TEXT,\n" + + "SKIP,\n" + + "Time INT64,\n" + + "Temperature FLOAT,\n" + + "Emission DOUBLE,\n"); + + ImportSchema schema = ImportSchemaParser.parse(path); + + assertEquals("root.db1", schema.getTableName()); + assertEquals("ms", schema.getTimePrecision()); + assertTrue(schema.isHasHeader()); + assertEquals(",", schema.getSeparator()); + assertEquals("\\N", schema.getNullFormat()); + assertEquals("Time", schema.getTimeColumnName()); + + List tags = schema.getTagColumns(); + assertEquals(3, tags.size()); + assertEquals("Group", tags.get(0).getName()); + assertTrue(tags.get(0).hasDefault()); + assertEquals("Datang", tags.get(0).getDefaultValue()); + assertFalse(tags.get(0).existsInSource()); + assertEquals("Region", tags.get(1).getName()); + assertFalse(tags.get(1).hasDefault()); + assertTrue(tags.get(1).existsInSource()); + assertEquals("FactoryNumber", tags.get(2).getName()); + + List srcCols = schema.getSourceColumns(); + assertEquals(6, srcCols.size()); + assertEquals("Region", srcCols.get(0).getName()); + assertEquals(TSDataType.TEXT, srcCols.get(0).getDataType()); + assertFalse(srcCols.get(0).isSkip()); + assertTrue(srcCols.get(2).isSkip()); + assertEquals("Time", srcCols.get(3).getName()); + assertEquals(TSDataType.INT64, srcCols.get(3).getDataType()); + assertEquals("Temperature", srcCols.get(4).getName()); + assertEquals(TSDataType.FLOAT, srcCols.get(4).getDataType()); + assertEquals("Emission", srcCols.get(5).getName()); + assertEquals(TSDataType.DOUBLE, srcCols.get(5).getDataType()); + + List fields = schema.fieldColumns(); + assertEquals(2, fields.size()); + assertEquals("Temperature", fields.get(0).getName()); + assertEquals("Emission", fields.get(1).getName()); + } + + @Test + public void testOldSchemaFormat() throws Exception { + String path = + writeSchema( + "table_name=root.db1\n" + + "time_precision=ms\n" + + "has_header=true\n" + + "separator=,\n" + + "null_format=\\N\n" + + "\n" + + "id_columns\n" + + "tmp1\n" + + "\n" + + 
"time_column=time\n" + + "\n" + + "csv_columns\n" + + "time INT64,\n" + + "tmp1 TEXT,\n" + + "tmp2 FLOAT,\n" + + "tmp3 FLOAT,\n" + + "SKIP,\n" + + "tmp5 FLOAT\n"); + + ImportSchema schema = ImportSchemaParser.parse(path); + + assertEquals("root.db1", schema.getTableName()); + assertEquals("time", schema.getTimeColumnName()); + + List tags = schema.getTagColumns(); + assertEquals(1, tags.size()); + assertEquals("tmp1", tags.get(0).getName()); + assertFalse(tags.get(0).hasDefault()); + + List srcCols = schema.getSourceColumns(); + assertEquals(6, srcCols.size()); + assertTrue(srcCols.get(4).isSkip()); + + List fields = schema.fieldColumns(); + assertEquals(3, fields.size()); + assertEquals("tmp2", fields.get(0).getName()); + assertEquals(TSDataType.FLOAT, fields.get(0).getDataType()); + assertEquals("tmp3", fields.get(1).getName()); + assertEquals("tmp5", fields.get(2).getName()); + } + + @Test + public void testFieldDerivationExcludesTimeTagSkip() throws Exception { + String path = + writeSchema( + "table_name=test\n" + + "time_column=ts\n" + + "\n" + + "tag_columns\n" + + "device\n" + + "\n" + + "source_columns\n" + + "ts INT64,\n" + + "device TEXT,\n" + + "SKIP,\n" + + "value1 FLOAT,\n" + + "value2 DOUBLE\n"); + + ImportSchema schema = ImportSchemaParser.parse(path); + + List fields = schema.fieldColumns(); + assertEquals(2, fields.size()); + assertEquals("value1", fields.get(0).getName()); + assertEquals(TSDataType.FLOAT, fields.get(0).getDataType()); + assertEquals("value2", fields.get(1).getName()); + assertEquals(TSDataType.DOUBLE, fields.get(1).getDataType()); + } + + @Test + public void testNoTagColumns() throws Exception { + String path = + writeSchema( + "table_name=test\n" + + "time_column=time\n" + + "\n" + + "source_columns\n" + + "time INT64,\n" + + "temp FLOAT,\n" + + "humidity DOUBLE\n"); + + ImportSchema schema = ImportSchemaParser.parse(path); + + assertTrue(schema.getTagColumns().isEmpty()); + + List fields = schema.fieldColumns(); + assertEquals(2, fields.size()); + assertEquals("temp", fields.get(0).getName()); + assertEquals("humidity", fields.get(1).getName()); + } + + @Test + public void testDefaultTagColumn() throws Exception { + String path = + writeSchema( + "table_name=test\n" + + "time_column=time\n" + + "\n" + + "tag_columns\n" + + "region DEFAULT beijing\n" + + "site\n" + + "\n" + + "source_columns\n" + + "time INT64,\n" + + "site TEXT,\n" + + "value FLOAT\n"); + + ImportSchema schema = ImportSchemaParser.parse(path); + + List tags = schema.getTagColumns(); + assertEquals(2, tags.size()); + + assertTrue(tags.get(0).hasDefault()); + assertEquals("beijing", tags.get(0).getDefaultValue()); + assertFalse(tags.get(0).existsInSource()); + + assertFalse(tags.get(1).hasDefault()); + assertTrue(tags.get(1).existsInSource()); + + List fields = schema.fieldColumns(); + assertEquals(1, fields.size()); + assertEquals("value", fields.get(0).getName()); + } + + @Test + public void testTabSeparator() throws Exception { + String path = + writeSchema( + "table_name=test\n" + + "separator=tab\n" + + "time_column=time\n" + + "\n" + + "source_columns\n" + + "time INT64,\n" + + "value FLOAT\n"); + + ImportSchema schema = ImportSchemaParser.parse(path); + assertEquals("\t", schema.getSeparator()); + } + + @Test + public void testSemicolonSeparator() throws Exception { + String path = + writeSchema( + "table_name=test\n" + + "separator=;\n" + + "time_column=time\n" + + "\n" + + "source_columns\n" + + "time INT64,\n" + + "value FLOAT\n"); + + ImportSchema schema = 
+ @Test
+ public void testMissingTableName() throws Exception {
+ String path =
+ writeSchema("time_column=time\n" + "source_columns\n" + "time INT64,\n" + "value FLOAT\n");
+
+ try {
+ ImportSchemaParser.parse(path);
+ fail("Expected exception for missing table_name");
+ } catch (IllegalArgumentException e) {
+ assertTrue(e.getMessage().contains("table_name"));
+ }
+ }
+
+ @Test
+ public void testMissingTimeColumn() throws Exception {
+ String path =
+ writeSchema("table_name=test\n" + "source_columns\n" + "time INT64,\n" + "value FLOAT\n");
+
+ try {
+ ImportSchemaParser.parse(path);
+ fail("Expected exception for missing time_column");
+ } catch (IllegalArgumentException e) {
+ assertTrue(e.getMessage().contains("time_column"));
+ }
+ }
+
+ @Test
+ public void testTimeColumnNotInSourceColumns() throws Exception {
+ String path =
+ writeSchema(
+ "table_name=test\n"
+ + "time_column=missing_col\n"
+ + "\n"
+ + "source_columns\n"
+ + "time INT64,\n"
+ + "value FLOAT\n");
+
+ try {
+ ImportSchemaParser.parse(path);
+ fail("Expected exception for time_column not in source_columns");
+ } catch (IllegalArgumentException e) {
+ assertTrue(e.getMessage().contains("missing_col"));
+ }
+ }
+
+ @Test
+ public void testTagColumnNotInSourceColumns() throws Exception {
+ String path =
+ writeSchema(
+ "table_name=test\n"
+ + "time_column=time\n"
+ + "\n"
+ + "tag_columns\n"
+ + "missing_tag\n"
+ + "\n"
+ + "source_columns\n"
+ + "time INT64,\n"
+ + "value FLOAT\n");
+
+ try {
+ ImportSchemaParser.parse(path);
+ fail("Expected exception for tag not in source_columns");
+ } catch (IllegalArgumentException e) {
+ assertTrue(e.getMessage().contains("missing_tag"));
+ }
+ }
+
+ @Test
+ public void testInvalidTimePrecision() throws Exception {
+ String path =
+ writeSchema(
+ "table_name=test\n"
+ + "time_precision=sec\n"
+ + "time_column=time\n"
+ + "\n"
+ + "source_columns\n"
+ + "time INT64,\n"
+ + "value FLOAT\n");
+
+ try {
+ ImportSchemaParser.parse(path);
+ fail("Expected exception for invalid time_precision");
+ } catch (IllegalArgumentException e) {
+ assertTrue(e.getMessage().contains("time_precision"));
+ }
+ }
+
+ @Test
+ public void testNamedSkipForParquetArrow() throws Exception {
+ String path =
+ writeSchema(
+ "table_name=test\n"
+ + "time_column=time\n"
+ + "\n"
+ + "source_columns\n"
+ + "time INT64,\n"
+ + "unused_col SKIP,\n"
+ + "value FLOAT\n");
+
+ ImportSchema schema = ImportSchemaParser.parse(path);
+
+ List<ImportSchema.SourceColumn> srcCols = schema.getSourceColumns();
+ assertEquals(3, srcCols.size());
+ assertTrue(srcCols.get(1).isSkip());
+ assertEquals("unused_col", srcCols.get(1).getName());
+
+ List<ImportSchema.SourceColumn> fields = schema.fieldColumns();
+ assertEquals(1, fields.size());
+ assertEquals("value", fields.get(0).getName());
+ }
+
+ @Test
+ public void testAllDataTypes() throws Exception {
+ String path =
+ writeSchema(
+ "table_name=test\n"
+ + "time_column=ts\n"
+ + "\n"
+ + "source_columns\n"
+ + "ts TIMESTAMP,\n"
+ + "b BOOLEAN,\n"
+ + "i32 INT32,\n"
+ + "i64 INT64,\n"
+ + "f FLOAT,\n"
+ + "d DOUBLE,\n"
+ + "s STRING,\n"
+ + "t TEXT,\n"
+ + "bl BLOB,\n"
+ + "dt DATE\n");
+
+ ImportSchema schema = ImportSchemaParser.parse(path);
+
+ List<ImportSchema.SourceColumn> srcCols = schema.getSourceColumns();
+ assertEquals(10, srcCols.size());
+ assertEquals(TSDataType.TIMESTAMP, srcCols.get(0).getDataType());
+ assertEquals(TSDataType.BOOLEAN, srcCols.get(1).getDataType());
+ assertEquals(TSDataType.INT32, srcCols.get(2).getDataType());
+ 
assertEquals(TSDataType.INT64, srcCols.get(3).getDataType()); + assertEquals(TSDataType.FLOAT, srcCols.get(4).getDataType()); + assertEquals(TSDataType.DOUBLE, srcCols.get(5).getDataType()); + assertEquals(TSDataType.STRING, srcCols.get(6).getDataType()); + assertEquals(TSDataType.TEXT, srcCols.get(7).getDataType()); + assertEquals(TSDataType.BLOB, srcCols.get(8).getDataType()); + assertEquals(TSDataType.DATE, srcCols.get(9).getDataType()); + } + + @Test + public void testCommentsAndEmptyLines() throws Exception { + String path = + writeSchema( + "// This is a comment\n" + + "table_name=test\n" + + "\n" + + "// Another comment\n" + + "time_column=time\n" + + "\n" + + "source_columns\n" + + "// Column definitions\n" + + "time INT64,\n" + + "\n" + + "value FLOAT\n"); + + ImportSchema schema = ImportSchemaParser.parse(path); + assertEquals("test", schema.getTableName()); + assertEquals(2, schema.getSourceColumns().size()); + } + + @Test + public void testDefaultValues() throws Exception { + String path = + writeSchema( + "table_name=test\n" + + "time_column=time\n" + + "\n" + + "source_columns\n" + + "time INT64,\n" + + "value FLOAT\n"); + + ImportSchema schema = ImportSchemaParser.parse(path); + + assertEquals("ms", schema.getTimePrecision()); + assertTrue(schema.isHasHeader()); + assertEquals(",", schema.getSeparator()); + assertNull(schema.getNullFormat()); + } + + @Test + public void testNewSchemaEndToEndCsv() throws Exception { + String schemaPath = + writeSchema( + "table_name=root.test\n" + + "time_precision=ms\n" + + "has_header=true\n" + + "separator=,\n" + + "\n" + + "tag_columns\n" + + "device\n" + + "\n" + + "time_column=ts\n" + + "\n" + + "source_columns\n" + + "ts INT64,\n" + + "device TEXT,\n" + + "temperature FLOAT,\n" + + "humidity DOUBLE\n"); + + String csvPath = testDir + File.separator + "sensor.csv"; + try (BufferedWriter w = new BufferedWriter(new FileWriter(csvPath))) { + w.write("ts,device,temperature,humidity\n"); + long ts = System.currentTimeMillis(); + for (int i = 0; i < 10; i++) { + w.write((ts + i) + ",dev1," + (20.0f + i) + "," + (50.0 + i) + "\n"); + } + } + + String targetPath = testDir + File.separator + "output"; + String[] args = new String[] {"-s" + csvPath, "-schema" + schemaPath, "-t" + targetPath}; + TsFileTool.main(args); + + String tsfilePath = targetPath + File.separator + "sensor.tsfile"; + assertTrue("TsFile should exist", new File(tsfilePath).exists()); + } +} diff --git a/java/tools/src/test/java/org/apache/tsfile/tools/ParquetSourceReaderTest.java b/java/tools/src/test/java/org/apache/tsfile/tools/ParquetSourceReaderTest.java new file mode 100644 index 000000000..984be8aa7 --- /dev/null +++ b/java/tools/src/test/java/org/apache/tsfile/tools/ParquetSourceReaderTest.java @@ -0,0 +1,467 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.tsfile.tools;
+
+import org.apache.tsfile.enums.TSDataType;
+
+import org.apache.parquet.example.data.Group;
+import org.apache.parquet.example.data.simple.SimpleGroupFactory;
+import org.apache.parquet.hadoop.ParquetWriter;
+import org.apache.parquet.hadoop.example.ExampleParquetWriter;
+import org.apache.parquet.hadoop.metadata.CompressionCodecName;
+import org.apache.parquet.io.LocalOutputFile;
+import org.apache.parquet.schema.LogicalTypeAnnotation;
+import org.apache.parquet.schema.MessageType;
+import org.apache.parquet.schema.PrimitiveType;
+import org.apache.parquet.schema.Types;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+public class ParquetSourceReaderTest {
+
+ private final String testDir = "target" + File.separator + "parquetReaderTest";
+
+ @Before
+ public void setUp() {
+ new File(testDir).mkdirs();
+ }
+
+ @After
+ public void tearDown() {
+ deleteRecursive(new File(testDir));
+ }
+
+ private void deleteRecursive(File dir) {
+ File[] files = dir.listFiles();
+ if (files != null) {
+ for (File f : files) {
+ if (f.isDirectory()) {
+ deleteRecursive(f);
+ }
+ f.delete();
+ }
+ }
+ dir.delete();
+ }
+
+ private File writeParquetFile(String name, MessageType schema, List<Group> rows)
+ throws IOException {
+ File file = new File(testDir, name);
+ if (file.exists()) {
+ file.delete();
+ }
+
+ try (ParquetWriter<Group> writer =
+ ExampleParquetWriter.builder(new LocalOutputFile(file.toPath()))
+ .withType(schema)
+ .withCompressionCodec(CompressionCodecName.UNCOMPRESSED)
+ .build()) {
+ for (Group row : rows) {
+ writer.write(row);
+ }
+ }
+ return file;
+ }
+
+ private MessageType buildBasicSchema() {
+ return Types.buildMessage()
+ .required(PrimitiveType.PrimitiveTypeName.INT64)
+ .named("time")
+ .required(PrimitiveType.PrimitiveTypeName.FLOAT)
+ .named("temperature")
+ .required(PrimitiveType.PrimitiveTypeName.DOUBLE)
+ .named("humidity")
+ .named("test");
+ }
+
+ @Test
+ public void testAutoModeInferSchema() throws Exception {
+ MessageType pqSchema = buildBasicSchema();
+ SimpleGroupFactory factory = new SimpleGroupFactory(pqSchema);
+
+ List<Group> rows = new ArrayList<>();
+ rows.add(
+ factory
+ .newGroup()
+ .append("time", 1000L)
+ .append("temperature", 25.5f)
+ .append("humidity", 60.0));
+ rows.add(
+ factory
+ .newGroup()
+ .append("time", 2000L)
+ .append("temperature", 26.0f)
+ .append("humidity", 55.0));
+
+ File file = writeParquetFile("auto.parquet", pqSchema, rows);
+
+ try (ParquetSourceReader reader = new ParquetSourceReader(file)) {
+ ImportSchema schema = reader.inferSchema();
+ assertEquals("auto", schema.getTableName());
+ assertEquals("time", schema.getTimeColumnName());
+ assertTrue(schema.getTagColumns().isEmpty());
+
+ List<ImportSchema.SourceColumn> fields = schema.fieldColumns();
+ assertEquals(2, fields.size());
+ assertEquals("temperature", fields.get(0).getName());
+ assertEquals("humidity", fields.get(1).getName());
+ }
+ }
+
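+ // Batches expose raw typed values addressed as (row, column).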
+ @Test
+ public void testAutoModeReadBatch() throws Exception {
+ MessageType pqSchema = buildBasicSchema();
+ SimpleGroupFactory factory = new SimpleGroupFactory(pqSchema);
+
+ List<Group> rows = new ArrayList<>();
+ for (int i = 0; i < 5; i++) {
+ rows.add(
+ factory
+ .newGroup()
+ .append("time", 1000L + i)
+ .append("temperature", 20.0f + i)
+ .append("humidity", 50.0 + i));
+ }
+
+ File file = writeParquetFile("batch.parquet", pqSchema, rows);
+
+ try (ParquetSourceReader reader = new ParquetSourceReader(file)) {
+ reader.inferSchema();
+ SourceBatch batch = reader.readBatch();
+ assertNotNull(batch);
+ assertEquals(5, batch.getRowCount());
+
+ assertEquals(1000L, batch.getValue(0, 0));
+ assertEquals(20.0f, (float) batch.getValue(0, 1), 0.001f);
+ assertEquals(50.0, (double) batch.getValue(0, 2), 0.001);
+
+ assertNull(reader.readBatch());
+ }
+ }
+
+ @Test
+ public void testAutoModeUppercaseTIME() throws Exception {
+ MessageType pqSchema =
+ Types.buildMessage()
+ .required(PrimitiveType.PrimitiveTypeName.INT64)
+ .named("TIME")
+ .required(PrimitiveType.PrimitiveTypeName.FLOAT)
+ .named("value")
+ .named("test");
+ SimpleGroupFactory factory = new SimpleGroupFactory(pqSchema);
+
+ List<Group> rows = new ArrayList<>();
+ rows.add(factory.newGroup().append("TIME", 1000L).append("value", 1.0f));
+
+ File file = writeParquetFile("upper_time.parquet", pqSchema, rows);
+
+ try (ParquetSourceReader reader = new ParquetSourceReader(file)) {
+ ImportSchema schema = reader.inferSchema();
+ assertEquals("TIME", schema.getTimeColumnName());
+ }
+ }
+
+ @Test(expected = IllegalArgumentException.class)
+ public void testAutoModeMixedCaseTimeFails() throws Exception {
+ MessageType pqSchema =
+ Types.buildMessage()
+ .required(PrimitiveType.PrimitiveTypeName.INT64)
+ .named("Time")
+ .required(PrimitiveType.PrimitiveTypeName.FLOAT)
+ .named("value")
+ .named("test");
+ SimpleGroupFactory factory = new SimpleGroupFactory(pqSchema);
+
+ List<Group> rows = new ArrayList<>();
+ rows.add(factory.newGroup().append("Time", 1000L).append("value", 1.0f));
+
+ File file = writeParquetFile("mixed_time.parquet", pqSchema, rows);
+
+ try (ParquetSourceReader reader = new ParquetSourceReader(file)) {
+ reader.inferSchema();
+ }
+ }
+
+ @Test
+ public void testAutoModeTableNameFromFilename() throws Exception {
+ MessageType pqSchema = buildBasicSchema();
+ SimpleGroupFactory factory = new SimpleGroupFactory(pqSchema);
+
+ List<Group> rows = new ArrayList<>();
+ rows.add(
+ factory
+ .newGroup()
+ .append("time", 1000L)
+ .append("temperature", 25.0f)
+ .append("humidity", 60.0));
+
+ File file = writeParquetFile("sensor_data.parquet", pqSchema, rows);
+
+ try (ParquetSourceReader reader = new ParquetSourceReader(file)) {
+ ImportSchema schema = reader.inferSchema();
+ assertEquals("sensor_data", schema.getTableName());
+ }
+ }
+
+ @Test
+ public void testAutoModeTableNameOverride() throws Exception {
+ MessageType pqSchema = buildBasicSchema();
+ SimpleGroupFactory factory = new SimpleGroupFactory(pqSchema);
+
+ List<Group> rows = new ArrayList<>();
+ rows.add(
+ factory
+ .newGroup()
+ .append("time", 1000L)
+ .append("temperature", 25.0f)
+ .append("humidity", 60.0));
+
+ File file = writeParquetFile("data.parquet", pqSchema, rows);
+
+ try (ParquetSourceReader reader = new ParquetSourceReader(file)) {
+ reader.setOverrideTableName("custom_table");
+ ImportSchema schema = reader.inferSchema();
+ assertEquals("custom_table", schema.getTableName());
+ }
+ }
+
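+ // A Parquet TIMESTAMP logical type names its unit, so time_precision can be
+ // taken from file metadata instead of being inferred from value magnitude.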
.named("time") + .required(PrimitiveType.PrimitiveTypeName.FLOAT) + .named("value") + .named("test"); + SimpleGroupFactory factory = new SimpleGroupFactory(pqSchema); + + List rows = new ArrayList<>(); + rows.add(factory.newGroup().append("time", 1000L).append("value", 1.0f)); + + File file = writeParquetFile("ts_millis.parquet", pqSchema, rows); + + try (ParquetSourceReader reader = new ParquetSourceReader(file)) { + ImportSchema schema = reader.inferSchema(); + assertEquals("ms", schema.getTimePrecision()); + } + } + + @Test + public void testAutoModeTimestampPrecisionMicros() throws Exception { + MessageType pqSchema = + Types.buildMessage() + .required(PrimitiveType.PrimitiveTypeName.INT64) + .as(LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MICROS)) + .named("time") + .required(PrimitiveType.PrimitiveTypeName.FLOAT) + .named("value") + .named("test"); + SimpleGroupFactory factory = new SimpleGroupFactory(pqSchema); + + List rows = new ArrayList<>(); + rows.add(factory.newGroup().append("time", 1000000L).append("value", 1.0f)); + + File file = writeParquetFile("ts_micros.parquet", pqSchema, rows); + + try (ParquetSourceReader reader = new ParquetSourceReader(file)) { + ImportSchema schema = reader.inferSchema(); + assertEquals("us", schema.getTimePrecision()); + } + } + + @Test + public void testSchemaMode() throws Exception { + MessageType pqSchema = buildBasicSchema(); + SimpleGroupFactory factory = new SimpleGroupFactory(pqSchema); + + List rows = new ArrayList<>(); + rows.add( + factory + .newGroup() + .append("time", 1000L) + .append("temperature", 25.5f) + .append("humidity", 60.0)); + rows.add( + factory + .newGroup() + .append("time", 2000L) + .append("temperature", 26.0f) + .append("humidity", 55.0)); + + File file = writeParquetFile("schema_mode.parquet", pqSchema, rows); + + ImportSchema importSchema = new ImportSchema(); + importSchema.setTableName("test_table"); + importSchema.setTimeColumnName("time"); + importSchema.setTimePrecision("ms"); + importSchema.setTagColumns(new ArrayList()); + importSchema.setSourceColumns( + Arrays.asList( + new ImportSchema.SourceColumn("time", TSDataType.INT64), + new ImportSchema.SourceColumn("temperature", TSDataType.FLOAT), + new ImportSchema.SourceColumn("humidity", TSDataType.DOUBLE))); + + try (ParquetSourceReader reader = new ParquetSourceReader(file, importSchema)) { + SourceBatch batch = reader.readBatch(); + assertNotNull(batch); + assertEquals(2, batch.getRowCount()); + assertEquals(1000L, batch.getValue(0, 0)); + assertEquals(25.5f, (float) batch.getValue(0, 1), 0.001f); + assertEquals(60.0, (double) batch.getValue(0, 2), 0.001); + + assertNull(reader.readBatch()); + } + } + + @Test + public void testSchemaModeNamedSkip() throws Exception { + MessageType pqSchema = + Types.buildMessage() + .required(PrimitiveType.PrimitiveTypeName.INT64) + .named("time") + .required(PrimitiveType.PrimitiveTypeName.BINARY) + .as(LogicalTypeAnnotation.stringType()) + .named("unused") + .required(PrimitiveType.PrimitiveTypeName.FLOAT) + .named("value") + .named("test"); + SimpleGroupFactory factory = new SimpleGroupFactory(pqSchema); + + List rows = new ArrayList<>(); + rows.add(factory.newGroup().append("time", 1000L).append("unused", "x").append("value", 3.14f)); + + File file = writeParquetFile("skip.parquet", pqSchema, rows); + + ImportSchema importSchema = new ImportSchema(); + importSchema.setTableName("test"); + importSchema.setTimeColumnName("time"); + importSchema.setTimePrecision("ms"); + 
+ @Test
+ public void testSchemaModeNamedSkip() throws Exception {
+ MessageType pqSchema =
+ Types.buildMessage()
+ .required(PrimitiveType.PrimitiveTypeName.INT64)
+ .named("time")
+ .required(PrimitiveType.PrimitiveTypeName.BINARY)
+ .as(LogicalTypeAnnotation.stringType())
+ .named("unused")
+ .required(PrimitiveType.PrimitiveTypeName.FLOAT)
+ .named("value")
+ .named("test");
+ SimpleGroupFactory factory = new SimpleGroupFactory(pqSchema);
+
+ List<Group> rows = new ArrayList<>();
+ rows.add(factory.newGroup().append("time", 1000L).append("unused", "x").append("value", 3.14f));
+
+ File file = writeParquetFile("skip.parquet", pqSchema, rows);
+
+ ImportSchema importSchema = new ImportSchema();
+ importSchema.setTableName("test");
+ importSchema.setTimeColumnName("time");
+ importSchema.setTimePrecision("ms");
+ importSchema.setTagColumns(new ArrayList<>());
+ importSchema.setSourceColumns(
+ Arrays.asList(
+ new ImportSchema.SourceColumn("time", TSDataType.INT64),
+ ImportSchema.SourceColumn.skip("unused"),
+ new ImportSchema.SourceColumn("value", TSDataType.FLOAT)));
+
+ try (ParquetSourceReader reader = new ParquetSourceReader(file, importSchema)) {
+ SourceBatch batch = reader.readBatch();
+ assertNotNull(batch);
+ assertEquals(1, batch.getRowCount());
+ }
+ }
+
+ @Test
+ public void testNativeNullHandling() throws Exception {
+ MessageType pqSchema =
+ Types.buildMessage()
+ .required(PrimitiveType.PrimitiveTypeName.INT64)
+ .named("time")
+ .optional(PrimitiveType.PrimitiveTypeName.FLOAT)
+ .named("value")
+ .named("test");
+ SimpleGroupFactory factory = new SimpleGroupFactory(pqSchema);
+
+ List<Group> rows = new ArrayList<>();
+ rows.add(factory.newGroup().append("time", 1000L).append("value", 3.14f));
+ rows.add(factory.newGroup().append("time", 2000L));
+
+ File file = writeParquetFile("nulls.parquet", pqSchema, rows);
+
+ try (ParquetSourceReader reader = new ParquetSourceReader(file)) {
+ reader.inferSchema();
+ SourceBatch batch = reader.readBatch();
+ assertNotNull(batch);
+ assertEquals(2, batch.getRowCount());
+ assertEquals(3.14f, (float) batch.getValue(0, 1), 0.001f);
+ assertNull(batch.getValue(1, 1));
+ }
+ }
+
+ @Test
+ public void testTypeMapping() throws Exception {
+ MessageType pqSchema =
+ Types.buildMessage()
+ .required(PrimitiveType.PrimitiveTypeName.INT64)
+ .named("time")
+ .required(PrimitiveType.PrimitiveTypeName.BOOLEAN)
+ .named("flag")
+ .required(PrimitiveType.PrimitiveTypeName.INT32)
+ .named("count")
+ .required(PrimitiveType.PrimitiveTypeName.BINARY)
+ .as(LogicalTypeAnnotation.stringType())
+ .named("name")
+ .named("test");
+ SimpleGroupFactory factory = new SimpleGroupFactory(pqSchema);
+
+ List<Group> rows = new ArrayList<>();
+ rows.add(
+ factory
+ .newGroup()
+ .append("time", 1000L)
+ .append("flag", true)
+ .append("count", 42)
+ .append("name", "hello"));
+
+ File file = writeParquetFile("types.parquet", pqSchema, rows);
+
+ try (ParquetSourceReader reader = new ParquetSourceReader(file)) {
+ ImportSchema schema = reader.inferSchema();
+
+ List<ImportSchema.SourceColumn> fields = schema.fieldColumns();
+ assertEquals(3, fields.size());
+ assertEquals(TSDataType.BOOLEAN, fields.get(0).getDataType());
+ assertEquals(TSDataType.INT32, fields.get(1).getDataType());
+ assertEquals(TSDataType.STRING, fields.get(2).getDataType());
+
+ SourceBatch batch = reader.readBatch();
+ assertNotNull(batch);
+ assertEquals(true, batch.getValue(0, 1));
+ assertEquals(42, batch.getValue(0, 2));
+ assertEquals("hello", batch.getValue(0, 3));
+ }
+ }
+
+ @Test
+ public void testEmptyFile() throws Exception {
+ MessageType pqSchema = buildBasicSchema();
+
+ File file = writeParquetFile("empty.parquet", pqSchema, new ArrayList<>());
+
+ try (ParquetSourceReader reader = new ParquetSourceReader(file)) {
+ reader.inferSchema();
+ SourceBatch batch = reader.readBatch();
+ assertNull(batch);
+ }
+ }
+}
diff --git a/java/tools/src/test/java/org/apache/tsfile/tools/TabletBuilderTest.java b/java/tools/src/test/java/org/apache/tsfile/tools/TabletBuilderTest.java
new file mode 100644
index 000000000..a5b4c3e71
--- /dev/null
+++ b/java/tools/src/test/java/org/apache/tsfile/tools/TabletBuilderTest.java
@@ -0,0 +1,239 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.tsfile.tools;
+
+import org.apache.tsfile.enums.TSDataType;
+import org.apache.tsfile.write.record.Tablet;
+
+import org.junit.Test;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+public class TabletBuilderTest {
+
+ private ImportSchema buildSchema(
+ String tableName,
+ String timeCol,
+ List<ImportSchema.TagColumn> tags,
+ ImportSchema.SourceColumn... srcCols) {
+ ImportSchema schema = new ImportSchema();
+ schema.setTableName(tableName);
+ schema.setTimeColumnName(timeCol);
+ schema.setTagColumns(tags != null ? tags : new ArrayList<>());
+ schema.setSourceColumns(Arrays.asList(srcCols));
+ return schema;
+ }
+
+ @Test
+ public void testBasicBuild() {
+ ImportSchema schema =
+ buildSchema(
+ "test",
+ "time",
+ null,
+ new ImportSchema.SourceColumn("time", TSDataType.INT64),
+ new ImportSchema.SourceColumn("value", TSDataType.FLOAT));
+
+ TabletBuilder builder = new TabletBuilder(schema, new TimeConverter("ms"));
+
+ SourceBatch batch =
+ SourceBatch.fromRows(
+ Arrays.asList("time", "value"),
+ Arrays.asList(new Object[] {"1000", "3.14"}, new Object[] {"2000", "2.71"}));
+
+ Tablet tablet = builder.build(batch);
+ assertEquals(2, tablet.getRowSize());
+ }
+
+ @Test
+ public void testTimeSorting() {
+ ImportSchema schema =
+ buildSchema(
+ "test",
+ "time",
+ null,
+ new ImportSchema.SourceColumn("time", TSDataType.INT64),
+ new ImportSchema.SourceColumn("value", TSDataType.FLOAT));
+
+ TabletBuilder builder = new TabletBuilder(schema, new TimeConverter("ms"));
+
+ SourceBatch batch =
+ SourceBatch.fromRows(
+ Arrays.asList("time", "value"),
+ Arrays.asList(
+ new Object[] {"3000", "3.0"},
+ new Object[] {"1000", "1.0"},
+ new Object[] {"2000", "2.0"}));
+
+ Tablet tablet = builder.build(batch);
+ assertEquals(3, tablet.getRowSize());
+ assertTrue(tablet.getTimestamps()[0] <= tablet.getTimestamps()[1]);
+ assertTrue(tablet.getTimestamps()[1] <= tablet.getTimestamps()[2]);
+ assertEquals(1000L, tablet.getTimestamps()[0]);
+ assertEquals(2000L, tablet.getTimestamps()[1]);
+ assertEquals(3000L, tablet.getTimestamps()[2]);
+ }
+
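+ // Tag values may come from a source column or from a schema DEFAULT;
+ // the next two tests cover both paths.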
+ @Test
+ public void testTagColumns() {
+ List<ImportSchema.TagColumn> tags = new ArrayList<>();
+ tags.add(new ImportSchema.TagColumn("device"));
+
+ ImportSchema schema =
+ buildSchema(
+ "test",
+ "time",
+ tags,
+ new ImportSchema.SourceColumn("time", TSDataType.INT64),
+ new ImportSchema.SourceColumn("device", TSDataType.TEXT),
+ new ImportSchema.SourceColumn("value", TSDataType.FLOAT));
+
+ TabletBuilder builder = new TabletBuilder(schema, new TimeConverter("ms"));
+
+ SourceBatch batch =
+ SourceBatch.fromRows(
+ Arrays.asList("time", "device", "value"),
+ Collections.singletonList(new Object[] {"1000", "dev1", "3.14"}));
+
+ Tablet tablet = builder.build(batch);
+ assertEquals(1, tablet.getRowSize());
+ }
+
+ @Test
+ public void testTagDefaultValue() {
+ List<ImportSchema.TagColumn> tags = new ArrayList<>();
+ tags.add(new ImportSchema.TagColumn("region", "beijing"));
+
+ ImportSchema schema =
+ buildSchema(
+ "test",
+ "time",
+ tags,
+ new ImportSchema.SourceColumn("time", TSDataType.INT64),
+ new ImportSchema.SourceColumn("value", TSDataType.FLOAT));
+
+ TabletBuilder builder = new TabletBuilder(schema, new TimeConverter("ms"));
+
+ SourceBatch batch =
+ SourceBatch.fromRows(
+ Arrays.asList("time", "value"),
+ Collections.singletonList(new Object[] {"1000", "3.14"}));
+
+ Tablet tablet = builder.build(batch);
+ assertEquals(1, tablet.getRowSize());
+ }
+
+ @Test
+ public void testNullValues() {
+ ImportSchema schema =
+ buildSchema(
+ "test",
+ "time",
+ null,
+ new ImportSchema.SourceColumn("time", TSDataType.INT64),
+ new ImportSchema.SourceColumn("value", TSDataType.FLOAT));
+
+ TabletBuilder builder = new TabletBuilder(schema, new TimeConverter("ms"));
+
+ SourceBatch batch =
+ SourceBatch.fromRows(
+ Arrays.asList("time", "value"),
+ Arrays.asList(new Object[] {"1000", null}, new Object[] {"2000", "2.71"}));
+
+ Tablet tablet = builder.build(batch);
+ assertEquals(2, tablet.getRowSize());
+ }
+
+ @Test
+ public void testNullFormatRecognition() {
+ ImportSchema schema =
+ buildSchema(
+ "test",
+ "time",
+ null,
+ new ImportSchema.SourceColumn("time", TSDataType.INT64),
+ new ImportSchema.SourceColumn("value", TSDataType.FLOAT));
+ schema.setNullFormat("\\N");
+
+ TabletBuilder builder = new TabletBuilder(schema, new TimeConverter("ms"));
+
+ SourceBatch batch =
+ SourceBatch.fromRows(
+ Arrays.asList("time", "value"),
+ Arrays.asList(new Object[] {"1000", "\\N"}, new Object[] {"2000", "2.71"}));
+
+ Tablet tablet = builder.build(batch);
+ assertEquals(2, tablet.getRowSize());
+ }
+
+ @Test
+ public void testSkipColumn() {
+ ImportSchema schema =
+ buildSchema(
+ "test",
+ "time",
+ null,
+ new ImportSchema.SourceColumn("time", TSDataType.INT64),
+ ImportSchema.SourceColumn.skip(),
+ new ImportSchema.SourceColumn("value", TSDataType.FLOAT));
+
+ TabletBuilder builder = new TabletBuilder(schema, new TimeConverter("ms"));
+
+ SourceBatch batch =
+ SourceBatch.fromRows(
+ Arrays.asList("time", "unused", "value"),
+ Collections.singletonList(new Object[] {"1000", "x", "3.14"}));
+
+ Tablet tablet = builder.build(batch);
+ assertEquals(1, tablet.getRowSize());
+
+ List<ImportSchema.SourceColumn> fields = schema.fieldColumns();
+ assertEquals(1, fields.size());
+ assertEquals("value", fields.get(0).getName());
+ }
+
+ @Test
+ public void testTableSchemaStructure() {
+ List<ImportSchema.TagColumn> tags = new ArrayList<>();
+ tags.add(new ImportSchema.TagColumn("device"));
+
+ ImportSchema schema =
+ buildSchema(
+ "myTable",
+ "ts",
+ tags,
+ new ImportSchema.SourceColumn("ts", TSDataType.INT64),
+ new ImportSchema.SourceColumn("device", TSDataType.TEXT),
+ new ImportSchema.SourceColumn("temp", TSDataType.FLOAT),
+ new ImportSchema.SourceColumn("humidity", TSDataType.DOUBLE));
+
+ TabletBuilder builder = new TabletBuilder(schema, new TimeConverter("ms"));
+
+ assertEquals("mytable", builder.getTableSchema().getTableName());
+ // TAG: device, FIELD: temp, humidity → 3 column schemas
+ assertEquals(3, builder.getTableSchema().getColumnSchemas().size());
+ }
+}
diff --git a/java/tools/src/test/java/org/apache/tsfile/tools/TimeConverterTest.java b/java/tools/src/test/java/org/apache/tsfile/tools/TimeConverterTest.java
new file mode 100644
index 000000000..edc757208
--- /dev/null
+++ 
b/java/tools/src/test/java/org/apache/tsfile/tools/TimeConverterTest.java @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.tsfile.tools; + +import org.junit.Test; + +import java.time.Instant; + +import static org.junit.Assert.assertEquals; + +public class TimeConverterTest { + + @Test + public void testStringNumericWithSourcePrecisionMs() { + TimeConverter converter = new TimeConverter("ms"); + long result = converter.convert("1745231234567", "ms"); + assertEquals(1745231234567L, result); + } + + @Test + public void testStringNumericWithSourcePrecisionUs() { + TimeConverter converter = new TimeConverter("ms"); + long result = converter.convert("1745231234567000", "us"); + assertEquals(1745231234567L, result); + } + + @Test + public void testStringNumericWithSourcePrecisionNs() { + TimeConverter converter = new TimeConverter("ms"); + long result = converter.convert("1745231234567000000", "ns"); + assertEquals(1745231234567L, result); + } + + @Test + public void testSourcePrecisionMatchesTarget() { + TimeConverter converter = new TimeConverter("us"); + long result = converter.convert("12345678", "us"); + assertEquals(12345678L, result); + } + + @Test + public void testLongWithSourcePrecision() { + TimeConverter converter = new TimeConverter("ms"); + long result = converter.convert(1745231234567000L, "us"); + assertEquals(1745231234567L, result); + } + + @Test + public void testIntegerWithSourcePrecision() { + TimeConverter converter = new TimeConverter("ms"); + long result = converter.convert(1000, "ms"); + assertEquals(1000L, result); + } + + @Test + public void testInferPrecisionNanoseconds() { + assertEquals("ns", TimeConverter.inferPrecision(1745231234567000000L)); + assertEquals("ns", TimeConverter.inferPrecision(2000000000000000L)); + } + + @Test + public void testInferPrecisionMicroseconds() { + // >1e12 and <=1e15 → us + assertEquals("us", TimeConverter.inferPrecision(1745231234567L)); + assertEquals("us", TimeConverter.inferPrecision(2000000000000L)); + } + + @Test + public void testInferPrecisionMilliseconds() { + // >1e11 and <=1e12 → ms + assertEquals("ms", TimeConverter.inferPrecision(200000000000L)); + assertEquals("ms", TimeConverter.inferPrecision(999999999999L)); + } + + @Test + public void testInferPrecisionSeconds() { + assertEquals("s", TimeConverter.inferPrecision(1745231234L)); + assertEquals("s", TimeConverter.inferPrecision(100000000000L)); + } + + @Test + public void testConvertWithoutSourcePrecisionUsesInference() { + TimeConverter converter = new TimeConverter("ms"); + // 1745231234567 is >1e12, inferred as "us", should be rescaled to ms + long result = converter.convert("1745231234567"); + assertEquals(1745231234L, result); + } + + @Test + public void 
testConvertWithSourcePrecisionSkipsInference() { + TimeConverter converter = new TimeConverter("ms"); + // Same value, but with explicit source precision "ms", should NOT rescale + long result = converter.convert("1745231234567", "ms"); + assertEquals(1745231234567L, result); + } + + @Test + public void testRescaleMsToUs() { + assertEquals(1000000L, TimeConverter.rescale(1000L, "ms", "us")); + } + + @Test + public void testRescaleUsToMs() { + assertEquals(1L, TimeConverter.rescale(1000L, "us", "ms")); + } + + @Test + public void testRescaleNsToMs() { + assertEquals(1L, TimeConverter.rescale(1000000L, "ns", "ms")); + } + + @Test + public void testRescaleSToMs() { + assertEquals(1000L, TimeConverter.rescale(1L, "s", "ms")); + } + + @Test + public void testRescaleSameUnit() { + assertEquals(42L, TimeConverter.rescale(42L, "ms", "ms")); + } + + @Test + public void testInstantToMs() { + TimeConverter converter = new TimeConverter("ms"); + Instant instant = Instant.ofEpochMilli(1745231234567L); + long result = converter.convert(instant); + assertEquals(1745231234567L, result); + } + + @Test + public void testInstantToUs() { + TimeConverter converter = new TimeConverter("us"); + Instant instant = Instant.ofEpochSecond(1000, 500000000L); + long result = converter.convert(instant); + assertEquals(1000500000L, result); + } + + @Test + public void testInstantToNs() { + TimeConverter converter = new TimeConverter("ns"); + Instant instant = Instant.ofEpochSecond(1, 123456789L); + long result = converter.convert(instant); + assertEquals(1123456789L, result); + } + + @Test + public void testDatetimeString() { + TimeConverter converter = new TimeConverter("ms"); + long result = converter.convert("2025-01-01T00:00:00+00:00"); + assertEquals(1735689600000L, result); + } + + @Test + public void testDatetimeStringWithSourcePrecision() { + TimeConverter converter = new TimeConverter("ms"); + long result = converter.convert("2025-01-01T00:00:00+00:00", "ms"); + assertEquals(1735689600000L, result); + } + + @Test(expected = IllegalArgumentException.class) + public void testNullValueThrows() { + TimeConverter converter = new TimeConverter("ms"); + converter.convert(null); + } + + @Test(expected = IllegalArgumentException.class) + public void testNullValueWithPrecisionThrows() { + TimeConverter converter = new TimeConverter("ms"); + converter.convert(null, "ms"); + } +} diff --git a/java/tools/src/test/java/org/apache/tsfile/tools/TsFileToolCliTest.java b/java/tools/src/test/java/org/apache/tsfile/tools/TsFileToolCliTest.java new file mode 100644 index 000000000..fed5b437b --- /dev/null +++ b/java/tools/src/test/java/org/apache/tsfile/tools/TsFileToolCliTest.java @@ -0,0 +1,591 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+
+package org.apache.tsfile.tools;
+
+import org.apache.tsfile.external.commons.io.FileUtils;
+import org.apache.tsfile.read.TsFileSequenceReader;
+import org.apache.tsfile.read.controller.CachedChunkLoaderImpl;
+import org.apache.tsfile.read.controller.MetadataQuerierByFileImpl;
+import org.apache.tsfile.read.query.executor.TableQueryExecutor;
+import org.apache.tsfile.read.reader.block.TsBlockReader;
+
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.memory.RootAllocator;
+import org.apache.arrow.vector.BigIntVector;
+import org.apache.arrow.vector.Float8Vector;
+import org.apache.arrow.vector.VectorSchemaRoot;
+import org.apache.arrow.vector.ipc.ArrowFileWriter;
+import org.apache.arrow.vector.types.FloatingPointPrecision;
+import org.apache.arrow.vector.types.pojo.ArrowType;
+import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.types.pojo.FieldType;
+import org.apache.parquet.example.data.Group;
+import org.apache.parquet.example.data.simple.SimpleGroupFactory;
+import org.apache.parquet.hadoop.ParquetWriter;
+import org.apache.parquet.hadoop.example.ExampleParquetWriter;
+import org.apache.parquet.hadoop.metadata.CompressionCodecName;
+import org.apache.parquet.io.LocalOutputFile;
+import org.apache.parquet.schema.MessageType;
+import org.apache.parquet.schema.PrimitiveType;
+import org.apache.parquet.schema.Types;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+public class TsFileToolCliTest {
+ private final String testDir = "target" + File.separator + "cliTest";
+ private final String outputDir = testDir + File.separator + "output";
+ private final String failedDir = testDir + File.separator + "failed";
+
+ @Before
+ public void setUp() {
+ new File(testDir).mkdirs();
+ new File(outputDir).mkdirs();
+ }
+
+ @After
+ public void tearDown() throws Exception {
+ FileUtils.deleteDirectory(new File(testDir));
+ }
+
+ private String createCsvFile(String name, String[] headers, String[][] rows) throws IOException {
+ String path = testDir + File.separator + name;
+ try (BufferedWriter w = new BufferedWriter(new FileWriter(path))) {
+ w.write(String.join(",", headers) + "\n");
+ for (String[] row : rows) {
+ w.write(String.join(",", row) + "\n");
+ }
+ }
+ return new File(path).getAbsolutePath();
+ }
+
+ private String createSchemaFile(String name, String content) throws IOException {
+ String path = testDir + File.separator + name;
+ try (BufferedWriter w = new BufferedWriter(new FileWriter(path))) {
+ w.write(content);
+ }
+ return new File(path).getAbsolutePath();
+ }
+
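+ // Reads the generated TsFile back and counts the rows visible for a table.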
+ private int queryRowCount(String tsfilePath, String tableName, List<String> columns)
+ throws Exception {
+ try (TsFileSequenceReader sequenceReader = new TsFileSequenceReader(tsfilePath)) {
+ TableQueryExecutor tableQueryExecutor =
+ new TableQueryExecutor(
+ new MetadataQuerierByFileImpl(sequenceReader),
+ new CachedChunkLoaderImpl(sequenceReader),
+ TableQueryExecutor.TableQueryOrdering.DEVICE);
+ TsBlockReader reader = tableQueryExecutor.query(tableName, columns, null, null, null);
+ int cnt = 0;
+ while (reader.hasNext()) {
+ cnt += reader.next().getPositionCount();
+ }
+ return cnt;
+ }
+ }
+
+ @Test
+ public void testCsvAutoModeViaCli() throws Exception {
+ String csvPath =
+ createCsvFile(
+ "auto.csv",
+ new String[] {"time", "temp", "humidity"},
+ new String[][] {
+ {"1000", "25.5", "60.0"},
+ {"1001", "26.0", "61.0"},
+ {"1002", "26.5", "62.0"}
+ });
+
+ String target = new File(outputDir).getAbsolutePath();
+ TsFileTool.main(new String[] {"-s" + csvPath, "-t" + target});
+
+ String tsfile = outputDir + File.separator + "auto.tsfile";
+ assertTrue(new File(tsfile).exists());
+ assertEquals(3, queryRowCount(tsfile, "auto", Arrays.asList("temp", "humidity")));
+ }
+
+ @Test
+ public void testCsvSchemaModeViaCli() throws Exception {
+ String csvPath =
+ createCsvFile(
+ "schema.csv",
+ new String[] {"time", "tag1", "val"},
+ new String[][] {
+ {"1000", "s1", "10.0"},
+ {"1001", "s1", "11.0"}
+ });
+
+ String schemaPath =
+ createSchemaFile(
+ "schema.txt",
+ "table_name=root.test\n"
+ + "time_precision=ms\n"
+ + "has_header=true\n"
+ + "separator=,\n"
+ + "null_format=\\N\n\n"
+ + "id_columns\n"
+ + "tag1\n"
+ + "time_column=time\n"
+ + "csv_columns\n"
+ + "time INT64,\n"
+ + "tag1 TEXT,\n"
+ + "val DOUBLE\n");
+
+ String target = new File(outputDir).getAbsolutePath();
+ TsFileTool.main(new String[] {"-s" + csvPath, "-schema" + schemaPath, "-t" + target});
+
+ String tsfile = outputDir + File.separator + "schema.tsfile";
+ assertTrue(new File(tsfile).exists());
+ assertEquals(2, queryRowCount(tsfile, "root.test", Arrays.asList("val")));
+ }
+
+ @Test
+ public void testTableNameOverrideViaCli() throws Exception {
+ String csvPath =
+ createCsvFile(
+ "data.csv",
+ new String[] {"time", "val"},
+ new String[][] {{"1000", "1.0"}, {"1001", "2.0"}});
+
+ String target = new File(outputDir).getAbsolutePath();
+ TsFileTool.main(new String[] {"-s" + csvPath, "-t" + target, "--table_name", "custom_table"});
+
+ String tsfile = outputDir + File.separator + "data.tsfile";
+ assertTrue(new File(tsfile).exists());
+ assertEquals(2, queryRowCount(tsfile, "custom_table", Arrays.asList("val")));
+ }
+
+ @Test
+ public void testSeparatorTabViaCli() throws Exception {
+ String csvPath = testDir + File.separator + "tab.csv";
+ try (BufferedWriter w = new BufferedWriter(new FileWriter(csvPath))) {
+ w.write("time\tval\n");
+ w.write("1000\t10.0\n");
+ w.write("1001\t20.0\n");
+ }
+
+ String target = new File(outputDir).getAbsolutePath();
+ TsFileTool.main(
+ new String[] {
+ "-s" + new File(csvPath).getAbsolutePath(), "-t" + target, "--separator", "tab"
+ });
+
+ String tsfile = outputDir + File.separator + "tab.tsfile";
+ assertTrue(new File(tsfile).exists());
+ assertEquals(2, queryRowCount(tsfile, "tab", Arrays.asList("val")));
+ }
+
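+ // --format selects the source reader; parquet and arrow inputs are exercised below.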
+ @Test
+ public void testFormatParquetViaCli() throws Exception {
+ String pqFile = testDir + File.separator + "data.parquet";
+ MessageType pqSchema =
+ Types.buildMessage()
+ .required(PrimitiveType.PrimitiveTypeName.INT64)
+ .named("time")
+ .required(PrimitiveType.PrimitiveTypeName.DOUBLE)
+ .named("value")
+ .named("test");
+ SimpleGroupFactory factory = new SimpleGroupFactory(pqSchema);
+ try (ParquetWriter<Group> writer =
+ ExampleParquetWriter.builder(new LocalOutputFile(new File(pqFile).toPath()))
+ .withType(pqSchema)
+ .withCompressionCodec(CompressionCodecName.UNCOMPRESSED)
+ .build()) {
+ for (int i = 0; i < 5; i++) {
+ writer.write(factory.newGroup().append("time", 1000L + i).append("value", i * 1.1));
+ }
+ }
+
+ String target = new File(outputDir).getAbsolutePath();
+ TsFileTool.main(
+ new String[] {
+ "-s" + new File(pqFile).getAbsolutePath(), "-t" + target, "--format", "parquet"
+ });
+
+ String tsfile = outputDir + File.separator + "data.tsfile";
+ assertTrue(new File(tsfile).exists());
+ assertEquals(5, queryRowCount(tsfile, "data", Arrays.asList("value")));
+ }
+
+ @Test
+ public void testFormatArrowViaCli() throws Exception {
+ String arFile = testDir + File.separator + "data.arrow";
+ List<Field> fields = new ArrayList<>();
+ fields.add(new Field("time", FieldType.notNullable(new ArrowType.Int(64, true)), null));
+ fields.add(
+ new Field(
+ "value",
+ FieldType.notNullable(new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)),
+ null));
+ org.apache.arrow.vector.types.pojo.Schema arrowSchema =
+ new org.apache.arrow.vector.types.pojo.Schema(fields);
+
+ try (BufferAllocator alloc = new RootAllocator();
+ VectorSchemaRoot root = VectorSchemaRoot.create(arrowSchema, alloc);
+ FileOutputStream fos = new FileOutputStream(arFile);
+ ArrowFileWriter writer = new ArrowFileWriter(root, null, fos.getChannel())) {
+ writer.start();
+ BigIntVector tv = (BigIntVector) root.getVector("time");
+ Float8Vector vv = (Float8Vector) root.getVector("value");
+ tv.allocateNew(4);
+ vv.allocateNew(4);
+ for (int i = 0; i < 4; i++) {
+ tv.set(i, 1000L + i);
+ vv.set(i, i * 2.2);
+ }
+ root.setRowCount(4);
+ writer.writeBatch();
+ writer.end();
+ }
+
+ String target = new File(outputDir).getAbsolutePath();
+ TsFileTool.main(
+ new String[] {
+ "-s" + new File(arFile).getAbsolutePath(), "-t" + target, "--format", "arrow"
+ });
+
+ String tsfile = outputDir + File.separator + "data.tsfile";
+ assertTrue(new File(tsfile).exists());
+ assertEquals(4, queryRowCount(tsfile, "data", Arrays.asList("value")));
+ }
+
+ @Test
+ public void testMissingSourceParam() {
+ String target = new File(outputDir).getAbsolutePath();
+ TsFileTool.main(new String[] {"-t" + target});
+ File[] files = new File(outputDir).listFiles((d, n) -> n.endsWith(".tsfile"));
+ assertTrue("No tsfile should be generated", files == null || files.length == 0);
+ }
+
+ @Test
+ public void testMissingTargetParam() throws Exception {
+ String csvPath =
+ createCsvFile(
+ "missing_target.csv", new String[] {"time", "val"}, new String[][] {{"1000", "1.0"}});
+
+ TsFileTool.main(new String[] {"-s" + csvPath});
+ File[] files = new File(outputDir).listFiles((d, n) -> n.endsWith(".tsfile"));
+ assertTrue("No tsfile should be generated", files == null || files.length == 0);
+ }
+
+ @Test
+ public void testDirectoryModeViaCli() throws Exception {
+ String subDir = testDir + File.separator + "multi";
+ new File(subDir).mkdirs();
+
+ for (String name : new String[] {"a.csv", "b.csv"}) {
+ try (BufferedWriter w = new BufferedWriter(new FileWriter(subDir + File.separator + name))) {
+ w.write("time,val\n");
+ w.write("1000,10.0\n");
+ w.write("1001,20.0\n");
+ }
+ }
+
+ String target = new File(outputDir).getAbsolutePath();
+ TsFileTool.main(new String[] {"-s" + new File(subDir).getAbsolutePath(), "-t" + target});
+
+ assertTrue(new File(outputDir, "a.tsfile").exists());
+ assertTrue(new File(outputDir, "b.tsfile").exists());
+ assertEquals(
+ 2, queryRowCount(outputDir + File.separator + "a.tsfile", "a", Arrays.asList("val")));
+ assertEquals(
+ 2, queryRowCount(outputDir + File.separator + "b.tsfile", "b", Arrays.asList("val")));
+ }
+
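+ // Directory mode with --table_name: each input still produces its own TsFile,
+ // but all of them share the same table name.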
+ @Test
+ public void testDirectoryModeWithTableNameOverride() throws Exception {
+ String subDir = testDir + File.separator + "multi_tn";
+ new File(subDir).mkdirs();
+
+ for (String name : new String[] {"x.csv", "y.csv"}) {
+ try (BufferedWriter w = new BufferedWriter(new FileWriter(subDir + File.separator + name))) {
+ w.write("time,val\n");
+ w.write("2000,5.0\n");
+ }
+ }
+
+ String target = new File(outputDir).getAbsolutePath();
+ TsFileTool.main(
+ new String[] {
+ "-s" + new File(subDir).getAbsolutePath(), "-t" + target, "--table_name", "shared"
+ });
+
+ assertTrue(new File(outputDir, "x.tsfile").exists());
+ assertTrue(new File(outputDir, "y.tsfile").exists());
+ assertEquals(
+ 1, queryRowCount(outputDir + File.separator + "x.tsfile", "shared", Arrays.asList("val")));
+ assertEquals(
+ 1, queryRowCount(outputDir + File.separator + "y.tsfile", "shared", Arrays.asList("val")));
+ }
+
+ @Test
+ public void testParquetSchemaModeWithTagViaCli() throws Exception {
+ String pqFile = testDir + File.separator + "tagged.parquet";
+ MessageType pqSchema =
+ Types.buildMessage()
+ .required(PrimitiveType.PrimitiveTypeName.INT64)
+ .named("time")
+ .required(PrimitiveType.PrimitiveTypeName.BINARY)
+ .as(org.apache.parquet.schema.LogicalTypeAnnotation.stringType())
+ .named("device")
+ .required(PrimitiveType.PrimitiveTypeName.DOUBLE)
+ .named("temp")
+ .named("sensor");
+ SimpleGroupFactory factory = new SimpleGroupFactory(pqSchema);
+ try (ParquetWriter<Group> writer =
+ ExampleParquetWriter.builder(new LocalOutputFile(new File(pqFile).toPath()))
+ .withType(pqSchema)
+ .withCompressionCodec(CompressionCodecName.UNCOMPRESSED)
+ .build()) {
+ for (int i = 0; i < 5; i++) {
+ writer.write(
+ factory
+ .newGroup()
+ .append("time", 1000L + i)
+ .append("device", "d1")
+ .append("temp", 20.0 + i));
+ }
+ }
+
+ String schemaPath =
+ createSchemaFile(
+ "pq_schema.txt",
+ "table_name=root.pq\n"
+ + "time_precision=ms\n"
+ + "has_header=true\n"
+ + "separator=,\n\n"
+ + "tag_columns\n"
+ + "device\n"
+ + "time_column=time\n"
+ + "source_columns\n"
+ + "time INT64,\n"
+ + "device TEXT,\n"
+ + "temp DOUBLE\n");
+
+ String target = new File(outputDir).getAbsolutePath();
+ TsFileTool.main(
+ new String[] {
+ "-s" + new File(pqFile).getAbsolutePath(),
+ "-schema" + schemaPath,
+ "-t" + target,
+ "--format",
+ "parquet"
+ });
+
+ String tsfile = outputDir + File.separator + "tagged.tsfile";
+ assertTrue("TsFile should exist", new File(tsfile).exists());
+ assertEquals(5, queryRowCount(tsfile, "root.pq", Arrays.asList("temp")));
+ }
+
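+ // Same schema-mode flow as the Parquet test above, with an Arrow IPC source.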
+ @Test
+ public void testArrowSchemaModeWithTagViaCli() throws Exception {
+ String arFile = testDir + File.separator + "tagged.arrow";
+ List<Field> arrowFields = new ArrayList<>();
+ arrowFields.add(new Field("time", FieldType.notNullable(new ArrowType.Int(64, true)), null));
+ arrowFields.add(new Field("device", FieldType.notNullable(new ArrowType.Utf8()), null));
+ arrowFields.add(
+ new Field(
+ "temp",
+ FieldType.notNullable(new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)),
+ null));
+ org.apache.arrow.vector.types.pojo.Schema arrowSchema =
+ new org.apache.arrow.vector.types.pojo.Schema(arrowFields);
+
+ try (BufferAllocator alloc = new RootAllocator();
+ VectorSchemaRoot root = VectorSchemaRoot.create(arrowSchema, alloc);
+ FileOutputStream fos = new FileOutputStream(arFile);
+ ArrowFileWriter writer = new ArrowFileWriter(root, null, fos.getChannel())) {
+ writer.start();
+ BigIntVector tv = (BigIntVector) root.getVector("time");
+ org.apache.arrow.vector.VarCharVector dv =
+ (org.apache.arrow.vector.VarCharVector) root.getVector("device");
+ Float8Vector tp = (Float8Vector) root.getVector("temp");
+ tv.allocateNew(4);
+ dv.allocateNew();
+ tp.allocateNew(4);
+ for (int i = 0; i < 4; i++) {
+ tv.set(i, 2000L + i);
+ dv.set(i, "sensor1".getBytes(StandardCharsets.UTF_8));
+ tp.set(i, 30.0 + i);
+ }
+ root.setRowCount(4);
+ writer.writeBatch();
+ writer.end();
+ }
+
+ String schemaPath =
+ createSchemaFile(
+ "ar_schema.txt",
+ "table_name=root.ar\n"
+ + "time_precision=ms\n"
+ + "has_header=true\n"
+ + "separator=,\n\n"
+ + "tag_columns\n"
+ + "device\n"
+ + "time_column=time\n"
+ + "source_columns\n"
+ + "time INT64,\n"
+ + "device TEXT,\n"
+ + "temp DOUBLE\n");
+
+ String target = new File(outputDir).getAbsolutePath();
+ TsFileTool.main(
+ new String[] {
+ "-s" + new File(arFile).getAbsolutePath(),
+ "-schema" + schemaPath,
+ "-t" + target,
+ "--format",
+ "arrow"
+ });
+
+ String tsfile = outputDir + File.separator + "tagged.tsfile";
+ assertTrue("TsFile should exist", new File(tsfile).exists());
+ assertEquals(4, queryRowCount(tsfile, "root.ar", Arrays.asList("temp")));
+ }
+
+ @Test
+ public void testParseBlockSize() {
+ assertEquals(1024L, TsFileTool.parseBlockSize("1K"));
+ assertEquals(256L * 1024 * 1024, TsFileTool.parseBlockSize("256M"));
+ assertEquals(1024L * 1024 * 1024, TsFileTool.parseBlockSize("1G"));
+ assertEquals(12345L, TsFileTool.parseBlockSize("12345"));
+ }
+
+ @Test
+ public void testCsvFailDirViaCli() throws Exception {
+ String csvPath = testDir + File.separator + "bad.csv";
+ try (BufferedWriter w = new BufferedWriter(new FileWriter(csvPath))) {
+ w.write("time,tag1,val\n");
+ w.write("1000,s1,10.0\n");
+ w.write("badtime,s1,20.0\n");
+ w.write("1002,s1,30.0\n");
+ }
+
+ String schemaPath =
+ createSchemaFile(
+ "fail_schema.txt",
+ "table_name=root.fail\n"
+ + "time_precision=ms\n"
+ + "has_header=true\n"
+ + "separator=,\n"
+ + "null_format=\\N\n\n"
+ + "id_columns\n"
+ + "tag1\n"
+ + "time_column=time\n"
+ + "csv_columns\n"
+ + "time INT64,\n"
+ + "tag1 TEXT,\n"
+ + "val DOUBLE\n");
+
+ String target = new File(outputDir).getAbsolutePath();
+ String fd = new File(failedDir).getAbsolutePath();
+ TsFileTool.main(
+ new String[] {
+ "-s" + new File(csvPath).getAbsolutePath(),
+ "-schema" + schemaPath,
+ "-t" + target,
+ "-fail_dir" + fd
+ });
+
+ assertTrue("Failed file should exist in fail_dir", new File(failedDir, "bad.csv").exists());
+ }
+
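+ // Auto mode infers one data type per field from the observed values.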
"types.tsfile"; + assertTrue(new File(tsfile).exists()); + assertEquals( + 3, queryRowCount(tsfile, "types", Arrays.asList("bool_col", "int_col", "double_col"))); + } + } + + @Test + public void testCsvAutoModeTypePromotion() throws Exception { + String csvPath = + createCsvFile( + "promo.csv", + new String[] {"time", "mixed"}, + new String[][] { + {"1000", "42"}, + {"1001", "3.14"}, + {"1002", "100"} + }); + + File csvFile = new File(csvPath); + try (CsvSourceReader reader = new CsvSourceReader(csvFile, ",")) { + ImportSchema schema = reader.inferSchema(); + ImportSchema.SourceColumn mixedField = findField(schema.fieldColumns(), "mixed"); + assertEquals(org.apache.tsfile.enums.TSDataType.DOUBLE, mixedField.getDataType()); + } + } + + private ImportSchema.SourceColumn findField(List fields, String name) { + for (ImportSchema.SourceColumn f : fields) { + if (f.getName().equals(name)) { + return f; + } + } + throw new AssertionError("Field not found: " + name); + } +} diff --git a/java/tools/src/test/java/org/apache/tsfile/tools/TsfiletoolsTest.java b/java/tools/src/test/java/org/apache/tsfile/tools/TsfiletoolsTest.java index 5fb09c757..176bc687c 100644 --- a/java/tools/src/test/java/org/apache/tsfile/tools/TsfiletoolsTest.java +++ b/java/tools/src/test/java/org/apache/tsfile/tools/TsfiletoolsTest.java @@ -27,6 +27,26 @@ import org.apache.tsfile.read.query.executor.TableQueryExecutor; import org.apache.tsfile.read.reader.block.TsBlockReader; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.BigIntVector; +import org.apache.arrow.vector.Float4Vector; +import org.apache.arrow.vector.Float8Vector; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.ipc.ArrowFileWriter; +import org.apache.arrow.vector.types.FloatingPointPrecision; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.parquet.example.data.Group; +import org.apache.parquet.example.data.simple.SimpleGroupFactory; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.example.ExampleParquetWriter; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.parquet.io.LocalOutputFile; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Types; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -34,14 +54,17 @@ import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; +import java.io.FileOutputStream; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.Random; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; public class TsfiletoolsTest { @@ -162,6 +185,23 @@ public void tearDown() throws Exception { FileUtils.deleteDirectory(new File(testDir)); } + private int queryTsFile(String tsfilePath, String tableName, List columns) + throws Exception { + try (TsFileSequenceReader sequenceReader = new TsFileSequenceReader(tsfilePath)) { + TableQueryExecutor tableQueryExecutor = + new TableQueryExecutor( + new MetadataQuerierByFileImpl(sequenceReader), + new CachedChunkLoaderImpl(sequenceReader), + 
+ @Test
+ public void testCsvAutoModeEndToEnd() throws Exception {
+ String autoDir = testDir + File.separator + "auto";
+ new File(autoDir).mkdirs();
+
+ String autoCsvFile = autoDir + File.separator + "autotest.csv";
+ try (BufferedWriter w = new BufferedWriter(new FileWriter(autoCsvFile))) {
+ w.write("time,temperature,humidity\n");
+ long ts = System.currentTimeMillis();
+ for (int i = 0; i < 10; i++) {
+ w.write((ts + i) + "," + (20.0 + i) + "," + (50.0 + i) + "\n");
+ }
+ }
+
+ String autoOutput = autoDir + File.separator + "output";
+ File csvFile = new File(autoCsvFile);
+
+ try (CsvSourceReader reader = new CsvSourceReader(csvFile, ",")) {
+ ImportSchema schema = reader.inferSchema();
+ assertEquals("autotest", schema.getTableName());
+ assertEquals("time", schema.getTimeColumnName());
+ assertTrue(schema.getTagColumns().isEmpty());
+ assertEquals(2, schema.fieldColumns().size());
+
+ ImportExecutor executor = new ImportExecutor(schema);
+ boolean ok = executor.execute(reader, autoOutput, "autotest");
+ assertTrue(ok);
+ }
+
+ String tsfilePath = autoOutput + File.separator + "autotest.tsfile";
+ assertTrue("TsFile should exist", new File(tsfilePath).exists());
+
+ List<String> columns = new ArrayList<>();
+ columns.add("temperature");
+ columns.add("humidity");
+ try (TsFileSequenceReader sequenceReader = new TsFileSequenceReader(tsfilePath)) {
+ TableQueryExecutor tableQueryExecutor =
+ new TableQueryExecutor(
+ new MetadataQuerierByFileImpl(sequenceReader),
+ new CachedChunkLoaderImpl(sequenceReader),
+ TableQueryExecutor.TableQueryOrdering.DEVICE);
+ final TsBlockReader reader = tableQueryExecutor.query("autotest", columns, null, null, null);
+ assertTrue(reader.hasNext());
+ int cnt = 0;
+ while (reader.hasNext()) {
+ final TsBlock result = reader.next();
+ cnt += result.getPositionCount();
+ }
+ assertEquals(10, cnt);
+ }
+ }
+
+ @Test
+ public void testCsvAutoModeWithTableNameOverride() throws Exception {
+ String autoDir = testDir + File.separator + "auto_override";
+ new File(autoDir).mkdirs();
+
+ String autoCsvFile = autoDir + File.separator + "raw.csv";
+ try (BufferedWriter w = new BufferedWriter(new FileWriter(autoCsvFile))) {
+ w.write("time,val\n");
+ w.write(System.currentTimeMillis() + ",1.23\n");
+ }
+
+ String autoOutput = autoDir + File.separator + "output";
+ File csvFile = new File(autoCsvFile);
+
+ try (CsvSourceReader reader = new CsvSourceReader(csvFile, ",")) {
+ reader.setOverrideTableName("my_custom_table");
+ ImportSchema schema = reader.inferSchema();
+ assertEquals("my_custom_table", schema.getTableName());
+
+ ImportExecutor executor = new ImportExecutor(schema);
+ boolean ok = executor.execute(reader, autoOutput, "raw");
+ assertTrue(ok);
+ }
+
+ assertTrue(new File(autoOutput, "raw.tsfile").exists());
+ }
+
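+ // Parquet auto mode end to end, driven by the reader's inferred schema.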
+    MessageType pqSchema =
+        Types.buildMessage()
+            .required(PrimitiveType.PrimitiveTypeName.INT64)
+            .named("time")
+            .required(PrimitiveType.PrimitiveTypeName.FLOAT)
+            .named("temperature")
+            .required(PrimitiveType.PrimitiveTypeName.DOUBLE)
+            .named("humidity")
+            .named("sensor");
+    SimpleGroupFactory factory = new SimpleGroupFactory(pqSchema);
+
+    String pqFile = pqDir + File.separator + "sensor.parquet";
+    try (ParquetWriter<Group> writer =
+        ExampleParquetWriter.builder(new LocalOutputFile(new File(pqFile).toPath()))
+            .withType(pqSchema)
+            .withCompressionCodec(CompressionCodecName.UNCOMPRESSED)
+            .build()) {
+      long ts = System.currentTimeMillis();
+      for (int i = 0; i < 10; i++) {
+        writer.write(
+            factory
+                .newGroup()
+                .append("time", ts + i)
+                .append("temperature", 20.0f + i)
+                .append("humidity", 50.0 + i));
+      }
+    }
+
+    String outputDir = pqDir + File.separator + "output";
+    File inputFile = new File(pqFile);
+
+    try (ParquetSourceReader reader = new ParquetSourceReader(inputFile)) {
+      ImportSchema schema = reader.inferSchema();
+      assertEquals("sensor", schema.getTableName());
+
+      ImportExecutor executor = new ImportExecutor(schema);
+      boolean ok = executor.execute(reader, outputDir, "sensor");
+      assertTrue(ok);
+    }
+
+    String tsfilePath = outputDir + File.separator + "sensor.tsfile";
+    assertTrue("TsFile should exist", new File(tsfilePath).exists());
+
+    List<String> columns = new ArrayList<>();
+    columns.add("temperature");
+    columns.add("humidity");
+    try (TsFileSequenceReader sequenceReader = new TsFileSequenceReader(tsfilePath)) {
+      TableQueryExecutor tableQueryExecutor =
+          new TableQueryExecutor(
+              new MetadataQuerierByFileImpl(sequenceReader),
+              new CachedChunkLoaderImpl(sequenceReader),
+              TableQueryExecutor.TableQueryOrdering.DEVICE);
+      final TsBlockReader reader = tableQueryExecutor.query("sensor", columns, null, null, null);
+      assertTrue(reader.hasNext());
+      int cnt = 0;
+      while (reader.hasNext()) {
+        final TsBlock result = reader.next();
+        cnt += result.getPositionCount();
+      }
+      assertEquals(10, cnt);
+    }
+  }
+
+  @Test
+  public void testArrowAutoModeEndToEnd() throws Exception {
+    String arDir = testDir + File.separator + "arrow_auto";
+    new File(arDir).mkdirs();
+
+    List<Field> arrowFields = new ArrayList<>();
+    arrowFields.add(new Field("time", FieldType.notNullable(new ArrowType.Int(64, true)), null));
+    arrowFields.add(
+        new Field(
+            "temperature",
+            FieldType.notNullable(new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)),
+            null));
+    arrowFields.add(
+        new Field(
+            "humidity",
+            FieldType.notNullable(new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)),
+            null));
+    org.apache.arrow.vector.types.pojo.Schema arrowSchema =
+        new org.apache.arrow.vector.types.pojo.Schema(arrowFields);
+
+    String arrowFile = arDir + File.separator + "telemetry.arrow";
+    try (BufferAllocator alloc = new RootAllocator();
+        VectorSchemaRoot root = VectorSchemaRoot.create(arrowSchema, alloc);
+        FileOutputStream fos = new FileOutputStream(arrowFile);
+        ArrowFileWriter writer = new ArrowFileWriter(root, null, fos.getChannel())) {
+      writer.start();
+      BigIntVector tv = (BigIntVector) root.getVector("time");
+      Float4Vector tp = (Float4Vector) root.getVector("temperature");
+      Float8Vector hm = (Float8Vector) root.getVector("humidity");
+      long ts = System.currentTimeMillis();
+      tv.allocateNew(10);
+      tp.allocateNew(10);
+      hm.allocateNew(10);
+      for (int i = 0; i < 10; i++) {
+        tv.set(i, ts + i);
+        tp.set(i, 20.0f + i);
+        hm.set(i, 50.0 + i);
+      }
+      root.setRowCount(10);
+      writer.writeBatch();
+      writer.end();
+    }
+
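+    // The file written above is Arrow IPC "file" format: the schema plus one
+    // record batch of 10 rows, which ArrowSourceReader consumes batch by batch.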
+    String outputDir = arDir + File.separator + "output";
+    File inputFile = new File(arrowFile);
+
+    try (ArrowSourceReader reader = new ArrowSourceReader(inputFile)) {
+      ImportSchema schema = reader.inferSchema();
+      assertEquals("telemetry", schema.getTableName());
+
+      ImportExecutor executor = new ImportExecutor(schema);
+      boolean ok = executor.execute(reader, outputDir, "telemetry");
+      assertTrue(ok);
+    }
+
+    String tsfilePath = outputDir + File.separator + "telemetry.tsfile";
+    assertTrue("TsFile should exist", new File(tsfilePath).exists());
+
+    List<String> columns = new ArrayList<>();
+    columns.add("temperature");
+    columns.add("humidity");
+    try (TsFileSequenceReader sequenceReader = new TsFileSequenceReader(tsfilePath)) {
+      TableQueryExecutor tableQueryExecutor =
+          new TableQueryExecutor(
+              new MetadataQuerierByFileImpl(sequenceReader),
+              new CachedChunkLoaderImpl(sequenceReader),
+              TableQueryExecutor.TableQueryOrdering.DEVICE);
+      final TsBlockReader reader = tableQueryExecutor.query("telemetry", columns, null, null, null);
+      assertTrue(reader.hasNext());
+      int cnt = 0;
+      while (reader.hasNext()) {
+        final TsBlock result = reader.next();
+        cnt += result.getPositionCount();
+      }
+      assertEquals(10, cnt);
+    }
+  }
+
+  @Test
+  public void testCsvNewSchemaEndToEnd() throws Exception {
+    String newDir = testDir + File.separator + "new_schema";
+    new File(newDir).mkdirs();
+
+    String newCsvFile = newDir + File.separator + "data.csv";
+    try (BufferedWriter w = new BufferedWriter(new FileWriter(newCsvFile))) {
+      w.write("time,tmp1,tmp2,tmp3,tmp4,tmp5\n");
+      long timestamp = System.currentTimeMillis();
+      for (int i = 0; i < 10; i++) {
+        w.write(
+            (timestamp + i)
+                + ",s1,"
+                + (i * 1.1f)
+                + ","
+                + (i * 2.2f)
+                + ","
+                + (i * 3.3f)
+                + ","
+                + (i * 4.4f)
+                + "\n");
+      }
+    }
+
+    String newSchemaFile = newDir + File.separator + "schema.txt";
+    try (BufferedWriter w = new BufferedWriter(new FileWriter(newSchemaFile))) {
+      w.write("table_name=root.newdb\n");
+      w.write("time_precision=ms\n");
+      w.write("has_header=true\n");
+      w.write("separator=,\n");
+      w.write("null_format=\\N\n\n");
+      w.write("tag_columns\n");
+      w.write("tmp1\n");
+      w.write("time_column=time\n");
+      w.write("source_columns\n");
+      w.write("time INT64,\n");
+      w.write("tmp1 TEXT,\n");
+      w.write("tmp2 FLOAT,\n");
+      w.write("tmp3 FLOAT,\n");
+      w.write("SKIP,\n");
+      w.write("tmp5 FLOAT\n");
+    }
+
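+    // Derived layout for the schema above: tag = tmp1, time column = time, FIELD
+    // columns = tmp2 / tmp3 / tmp5; tmp4 is dropped by SKIP (FIELD = source_columns
+    // minus time, tags, and SKIP entries).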
+    String scFilePath = new File(newSchemaFile).getAbsolutePath();
+    String csvFilePath = new File(newCsvFile).getAbsolutePath();
+    String targetPath = new File(newDir + File.separator + "output").getAbsolutePath();
+    String[] args = new String[] {"-s" + csvFilePath, "-schema" + scFilePath, "-t" + targetPath};
+    TsFileTool.main(args);
+
+    String tsfilePath = targetPath + File.separator + "data.tsfile";
+    assertTrue("TsFile should exist", new File(tsfilePath).exists());
+
+    List<String> columns = new ArrayList<>();
+    columns.add("tmp2");
+    columns.add("tmp3");
+    columns.add("tmp5");
+    try (TsFileSequenceReader sequenceReader = new TsFileSequenceReader(tsfilePath)) {
+      TableQueryExecutor tableQueryExecutor =
+          new TableQueryExecutor(
+              new MetadataQuerierByFileImpl(sequenceReader),
+              new CachedChunkLoaderImpl(sequenceReader),
+              TableQueryExecutor.TableQueryOrdering.DEVICE);
+      final TsBlockReader reader =
+          tableQueryExecutor.query("root.newdb", columns, null, null, null);
+      assertTrue(reader.hasNext());
+      int cnt = 0;
+      while (reader.hasNext()) {
+        cnt += reader.next().getPositionCount();
+      }
+      assertEquals(10, cnt);
+    }
+  }
+
+  @Test
+  public void testParquetSchemaModeEndToEnd() throws Exception {
+    String pqDir = testDir + File.separator + "parquet_schema";
+    new File(pqDir).mkdirs();
+
+    MessageType pqSchema =
+        Types.buildMessage()
+            .required(PrimitiveType.PrimitiveTypeName.INT64)
+            .named("time")
+            .required(PrimitiveType.PrimitiveTypeName.BINARY)
+            .as(org.apache.parquet.schema.LogicalTypeAnnotation.stringType())
+            .named("region")
+            .required(PrimitiveType.PrimitiveTypeName.DOUBLE)
+            .named("value")
+            .named("test");
+    SimpleGroupFactory factory = new SimpleGroupFactory(pqSchema);
+
+    String pqFile = pqDir + File.separator + "sensor.parquet";
+    try (ParquetWriter<Group> writer =
+        ExampleParquetWriter.builder(new LocalOutputFile(new File(pqFile).toPath()))
+            .withType(pqSchema)
+            .withCompressionCodec(CompressionCodecName.UNCOMPRESSED)
+            .build()) {
+      for (int i = 0; i < 5; i++) {
+        writer.write(
+            factory
+                .newGroup()
+                .append("time", 1000L + i)
+                .append("region", "east")
+                .append("value", i * 1.1));
+      }
+    }
+
+    String schemaFile2 = pqDir + File.separator + "schema.txt";
+    try (BufferedWriter w = new BufferedWriter(new FileWriter(schemaFile2))) {
+      w.write("table_name=root.pq\n");
+      w.write("time_precision=ms\n");
+      w.write("has_header=true\n");
+      w.write("separator=,\n\n");
+      w.write("tag_columns\n");
+      w.write("region\n");
+      w.write("time_column=time\n");
+      w.write("source_columns\n");
+      w.write("time INT64,\n");
+      w.write("region TEXT,\n");
+      w.write("value DOUBLE\n");
+    }
+
+    ImportSchema schema = ImportSchemaParser.parse(schemaFile2);
+    String outputDir = pqDir + File.separator + "output";
+
+    try (ParquetSourceReader reader = new ParquetSourceReader(new File(pqFile), schema)) {
+      ImportExecutor executor = new ImportExecutor(schema);
+      assertTrue(executor.execute(reader, outputDir, "sensor"));
+    }
+
+    String tsfilePath = outputDir + File.separator + "sensor.tsfile";
+    assertTrue("TsFile should exist", new File(tsfilePath).exists());
+    assertEquals(5, queryTsFile(tsfilePath, "root.pq", Arrays.asList("value")));
+  }
+
+  @Test
+  public void testArrowSchemaModeEndToEnd() throws Exception {
+    String arDir = testDir + File.separator + "arrow_schema";
+    new File(arDir).mkdirs();
+
+    List<Field> arrowFields = new ArrayList<>();
+    arrowFields.add(new Field("time", FieldType.notNullable(new ArrowType.Int(64, true)), null));
+    arrowFields.add(new Field("region", FieldType.notNullable(new ArrowType.Utf8()), null));
+    arrowFields.add(
+        new Field(
+            "value",
+            FieldType.notNullable(new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)),
+            null));
+    org.apache.arrow.vector.types.pojo.Schema arrowSchema =
+        new org.apache.arrow.vector.types.pojo.Schema(arrowFields);
+
+    String arrowFile = arDir + File.separator + "device.arrow";
+    try (BufferAllocator alloc = new RootAllocator();
+        VectorSchemaRoot root = VectorSchemaRoot.create(arrowSchema, alloc);
+        FileOutputStream fos = new FileOutputStream(arrowFile);
+        ArrowFileWriter writer = new ArrowFileWriter(root, null, fos.getChannel())) {
+      writer.start();
+      BigIntVector tv = (BigIntVector) root.getVector("time");
+      org.apache.arrow.vector.VarCharVector rv =
+          (org.apache.arrow.vector.VarCharVector) root.getVector("region");
+      Float8Vector vv = (Float8Vector) root.getVector("value");
+      tv.allocateNew(5);
+      rv.allocateNew();
+      vv.allocateNew(5);
+      for (int i = 0; i < 5; i++) {
+        tv.set(i, 1000L + i);
+        rv.set(i, "west".getBytes(java.nio.charset.StandardCharsets.UTF_8));
+        vv.set(i, i * 2.2);
+      }
+      root.setRowCount(5);
+      writer.writeBatch();
+      writer.end();
+    }
+
+    String schemaFile2 = arDir + File.separator + "schema.txt";
+    try (BufferedWriter w = new BufferedWriter(new FileWriter(schemaFile2))) {
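+      // Mirrors the Parquet schema-mode test: region is the tag column and
+      // value is the only FIELD.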
w.write("table_name=root.ar\n"); + w.write("time_precision=ms\n"); + w.write("has_header=true\n"); + w.write("separator=,\n\n"); + w.write("tag_columns\n"); + w.write("region\n"); + w.write("time_column=time\n"); + w.write("source_columns\n"); + w.write("time INT64,\n"); + w.write("region TEXT,\n"); + w.write("value DOUBLE\n"); + } + + ImportSchema schema = ImportSchemaParser.parse(schemaFile2); + String outputDir = arDir + File.separator + "output"; + + try (ArrowSourceReader reader = new ArrowSourceReader(new File(arrowFile), schema)) { + ImportExecutor executor = new ImportExecutor(schema); + assertTrue(executor.execute(reader, outputDir, "device")); + } + + String tsfilePath = outputDir + File.separator + "device.tsfile"; + assertTrue("TsFile should exist", new File(tsfilePath).exists()); + assertEquals(5, queryTsFile(tsfilePath, "root.ar", Arrays.asList("value"))); + } + + @Test + public void testCsvLargeFileMultiBatch() throws Exception { + String bigDir = testDir + File.separator + "big_csv"; + new File(bigDir).mkdirs(); + + String bigCsvFile = bigDir + File.separator + "big.csv"; + int rowCount = 5000; + try (BufferedWriter w = new BufferedWriter(new FileWriter(bigCsvFile))) { + w.write("time,value1,value2\n"); + for (int i = 0; i < rowCount; i++) { + w.write((1000L + i) + "," + (i * 1.1) + "," + (i * 2.2) + "\n"); + } + } + + String outputDir = bigDir + File.separator + "output"; + File csvFile = new File(bigCsvFile); + long smallChunkSize = 1024; + + try (CsvSourceReader reader = new CsvSourceReader(csvFile, ",", smallChunkSize)) { + ImportSchema schema = reader.inferSchema(); + assertEquals("big", schema.getTableName()); + assertEquals("time", schema.getTimeColumnName()); + assertEquals(2, schema.fieldColumns().size()); + + ImportExecutor executor = new ImportExecutor(schema); + assertTrue(executor.execute(reader, outputDir, "big")); + } + + File outputDirFile = new File(outputDir); + File[] tsfiles = outputDirFile.listFiles((dir, name) -> name.endsWith(".tsfile")); + assertNotNull(tsfiles); + assertTrue("Should have multiple output files for chunked CSV", tsfiles.length > 1); + + int totalRows = 0; + for (File tsfile : tsfiles) { + try (TsFileSequenceReader sequenceReader = + new TsFileSequenceReader(tsfile.getAbsolutePath())) { + TableQueryExecutor tableQueryExecutor = + new TableQueryExecutor( + new MetadataQuerierByFileImpl(sequenceReader), + new CachedChunkLoaderImpl(sequenceReader), + TableQueryExecutor.TableQueryOrdering.DEVICE); + final TsBlockReader reader = + tableQueryExecutor.query("big", Arrays.asList("value1"), null, null, null); + while (reader.hasNext()) { + totalRows += reader.next().getPositionCount(); + } + } + } + assertEquals(rowCount, totalRows); + + for (File tsfile : tsfiles) { + String name = tsfile.getName(); + assertTrue( + "File should follow naming convention: " + name, name.matches("big_\\d+\\.tsfile")); + } + } + + @Test + public void testCsvDirectoryMultipleFiles() throws Exception { + String multiDir = testDir + File.separator + "multi_csv"; + new File(multiDir).mkdirs(); + + String[] fileNames = {"alpha.csv", "beta.csv", "gamma.csv"}; + for (String fn : fileNames) { + try (BufferedWriter w = new BufferedWriter(new FileWriter(multiDir + File.separator + fn))) { + w.write("time,measurement\n"); + for (int i = 0; i < 5; i++) { + w.write((1000L + i) + "," + (fn.hashCode() + i * 1.0) + "\n"); + } + } + } + + String outputDir = multiDir + File.separator + "output"; + new File(outputDir).mkdirs(); + + for (String fn : fileNames) { + File csvFile = new 
File(multiDir + File.separator + fn); + String baseName = fn.substring(0, fn.lastIndexOf('.')); + try (CsvSourceReader reader = new CsvSourceReader(csvFile, ",")) { + ImportSchema schema = reader.inferSchema(); + assertEquals(baseName, schema.getTableName()); + + ImportExecutor executor = new ImportExecutor(schema); + assertTrue(executor.execute(reader, outputDir, baseName)); + } + } + + for (String fn : fileNames) { + String baseName = fn.substring(0, fn.lastIndexOf('.')); + File tsfile = new File(outputDir, baseName + ".tsfile"); + assertTrue("Missing output: " + tsfile.getName(), tsfile.exists()); + + try (TsFileSequenceReader sequenceReader = + new TsFileSequenceReader(tsfile.getAbsolutePath())) { + TableQueryExecutor tableQueryExecutor = + new TableQueryExecutor( + new MetadataQuerierByFileImpl(sequenceReader), + new CachedChunkLoaderImpl(sequenceReader), + TableQueryExecutor.TableQueryOrdering.DEVICE); + final TsBlockReader reader = + tableQueryExecutor.query(baseName, Arrays.asList("measurement"), null, null, null); + assertTrue(reader.hasNext()); + int cnt = 0; + while (reader.hasNext()) { + cnt += reader.next().getPositionCount(); + } + assertEquals(5, cnt); + } + } + } + @Test public void testCsvToTsfileFailed() { String scFilePath = new File(schemaFile).getAbsolutePath(); diff --git a/java/tools/src/test/java/org/apache/tsfile/tools/ValueConverterTest.java b/java/tools/src/test/java/org/apache/tsfile/tools/ValueConverterTest.java new file mode 100644 index 000000000..0c87e1717 --- /dev/null +++ b/java/tools/src/test/java/org/apache/tsfile/tools/ValueConverterTest.java @@ -0,0 +1,187 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.tsfile.tools; + +import org.apache.tsfile.enums.TSDataType; +import org.apache.tsfile.utils.Binary; + +import org.junit.Test; + +import java.nio.charset.StandardCharsets; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; + +public class ValueConverterTest { + + // --- null --- + + @Test + public void testNullReturnsNull() { + assertNull(ValueConverter.convert(null, TSDataType.INT32, false)); + assertNull(ValueConverter.convert(null, TSDataType.FLOAT, true)); + } + + // --- String → types --- + + @Test + public void testStringToInt32() { + Object result = ValueConverter.convert("42", TSDataType.INT32, false); + assertEquals(42, result); + } + + @Test + public void testStringToInt64() { + Object result = ValueConverter.convert("123456789012", TSDataType.INT64, false); + assertEquals(123456789012L, result); + } + + @Test + public void testStringToFloat() { + Object result = ValueConverter.convert("3.14", TSDataType.FLOAT, true); + assertEquals(3.14f, (float) result, 0.001f); + } + + @Test + public void testStringToDouble() { + Object result = ValueConverter.convert("3.14159", TSDataType.DOUBLE, true); + assertEquals(3.14159, (double) result, 0.00001); + } + + @Test + public void testStringToBoolean() { + assertEquals(true, ValueConverter.convert("true", TSDataType.BOOLEAN, false)); + assertEquals(false, ValueConverter.convert("false", TSDataType.BOOLEAN, false)); + } + + @Test + public void testStringToTextAsMeasurement() { + Object result = ValueConverter.convert("hello", TSDataType.TEXT, true); + assertTrue(result instanceof Binary); + assertEquals("hello", ((Binary) result).getStringValue(StandardCharsets.UTF_8)); + } + + @Test + public void testStringToTextAsTag() { + Object result = ValueConverter.convert("hello", TSDataType.TEXT, false); + assertTrue(result instanceof String); + assertEquals("hello", result); + } + + @Test + public void testStringToStringAsMeasurement() { + Object result = ValueConverter.convert("hello", TSDataType.STRING, true); + assertTrue(result instanceof Binary); + } + + @Test + public void testStringToStringAsTag() { + Object result = ValueConverter.convert("hello", TSDataType.STRING, false); + assertEquals("hello", result); + } + + // --- Native type passthrough --- + + @Test + public void testIntegerPassthroughInt32() { + Object result = ValueConverter.convert(42, TSDataType.INT32, false); + assertEquals(42, result); + } + + @Test + public void testLongPassthroughInt64() { + Object result = ValueConverter.convert(100L, TSDataType.INT64, false); + assertEquals(100L, result); + } + + @Test + public void testFloatPassthroughFloat() { + Object result = ValueConverter.convert(1.5f, TSDataType.FLOAT, true); + assertEquals(1.5f, result); + } + + @Test + public void testDoublePassthroughDouble() { + Object result = ValueConverter.convert(2.5, TSDataType.DOUBLE, true); + assertEquals(2.5, result); + } + + @Test + public void testBooleanPassthrough() { + assertEquals(true, ValueConverter.convert(true, TSDataType.BOOLEAN, false)); + } + + // --- Type promotion --- + + @Test + public void testIntegerToInt64() { + Object result = ValueConverter.convert(42, TSDataType.INT64, false); + assertEquals(42L, result); + } + + @Test + public void testIntegerToDouble() { + Object result = ValueConverter.convert(42, TSDataType.DOUBLE, true); + assertEquals(42.0, result); + } + + @Test + public void testFloatToDouble() { + Object result = ValueConverter.convert(1.5f, 
TSDataType.DOUBLE, true);
+    assertEquals(1.5, (double) result, 0.001);
+  }
+
+  @Test
+  public void testLongToInt32() {
+    Object result = ValueConverter.convert(42L, TSDataType.INT32, false);
+    assertEquals(42, result);
+  }
+
+  // --- BLOB ---
+
+  @Test
+  public void testStringToBlob() {
+    Object result = ValueConverter.convert("data", TSDataType.BLOB, true);
+    assertTrue(result instanceof Binary);
+  }
+
+  @Test
+  public void testBytesToBlob() {
+    byte[] bytes = new byte[] {1, 2, 3};
+    Object result = ValueConverter.convert(bytes, TSDataType.BLOB, true);
+    assertTrue(result instanceof Binary);
+  }
+
+  @Test
+  public void testBinaryPassthrough() {
+    Binary binary = new Binary(new byte[] {1, 2});
+    Object result = ValueConverter.convert(binary, TSDataType.BLOB, true);
+    assertEquals(binary, result);
+  }
+
+  // --- Object toString fallback ---
+
+  @Test
+  public void testObjectToStringForText() {
+    Object result = ValueConverter.convert(12345, TSDataType.TEXT, false);
+    assertEquals("12345", result);
+  }
+}

From 882755f0c8f7fcdf4dc64ac20698181949a6d098 Mon Sep 17 00:00:00 2001
From: ziyangeng
Date: Fri, 24 Apr 2026 13:29:39 +0800
Subject: [PATCH 2/5] fix(tools): fix dependency-plugin analysis errors in
 pom.xml

---
 java/tools/pom.xml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/java/tools/pom.xml b/java/tools/pom.xml
index 74c014b29..a03a584b1 100644
--- a/java/tools/pom.xml
+++ b/java/tools/pom.xml
@@ -145,7 +145,14 @@
             <ignoredUnusedDeclaredDependency>ch.qos.logback:logback-classic</ignoredUnusedDeclaredDependency>
             <ignoredUnusedDeclaredDependency>org.apache.hadoop:hadoop-common</ignoredUnusedDeclaredDependency>
             <ignoredUnusedDeclaredDependency>org.apache.arrow:arrow-memory-unsafe</ignoredUnusedDeclaredDependency>
+            <ignoredUnusedDeclaredDependency>org.apache.hadoop:hadoop-mapreduce-client-core</ignoredUnusedDeclaredDependency>
+          </ignoredUnusedDeclaredDependencies>
+          <ignoredUsedUndeclaredDependencies>
+            <ignoredUsedUndeclaredDependency>org.apache.parquet:parquet-common</ignoredUsedUndeclaredDependency>
+            <ignoredUsedUndeclaredDependency>org.apache.parquet:parquet-column</ignoredUsedUndeclaredDependency>
+            <ignoredUsedUndeclaredDependency>org.apache.arrow:arrow-memory-core</ignoredUsedUndeclaredDependency>
+          </ignoredUsedUndeclaredDependencies>

From 911fd81462a74cc11d621b8caf98330478a8e525 Mon Sep 17 00:00:00 2001
From: ziyangeng
Date: Fri, 24 Apr 2026 17:02:07 +0800
Subject: [PATCH 3/5] fix(tools): preserve column positions for SKIP columns to
 avoid index mismatch

---
 .../java/org/apache/tsfile/tools/ArrowSourceReader.java   | 8 ++++----
 .../java/org/apache/tsfile/tools/ParquetSourceReader.java | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/java/tools/src/main/java/org/apache/tsfile/tools/ArrowSourceReader.java b/java/tools/src/main/java/org/apache/tsfile/tools/ArrowSourceReader.java
index 9a9bc4441..0d7081202 100644
--- a/java/tools/src/main/java/org/apache/tsfile/tools/ArrowSourceReader.java
+++ b/java/tools/src/main/java/org/apache/tsfile/tools/ArrowSourceReader.java
@@ -221,10 +221,10 @@ private void ensureReaderOpen() throws IOException {
 
   private List<String> getSchemaColumnNames() {
     List<String> names = new ArrayList<>();
-    for (ImportSchema.SourceColumn col : schema.getSourceColumns()) {
-      if (!col.isSkip()) {
-        names.add(col.getName());
-      }
+    List<ImportSchema.SourceColumn> srcCols = schema.getSourceColumns();
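+    // Emit one name per source column: SKIP positions get a synthetic
+    // "_skip_<index>" placeholder so column indexes stay aligned with the file.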
"_skip_" + i : col.getName()); } return names; } diff --git a/java/tools/src/main/java/org/apache/tsfile/tools/ParquetSourceReader.java b/java/tools/src/main/java/org/apache/tsfile/tools/ParquetSourceReader.java index 5cc547d2b..fcbf3f468 100644 --- a/java/tools/src/main/java/org/apache/tsfile/tools/ParquetSourceReader.java +++ b/java/tools/src/main/java/org/apache/tsfile/tools/ParquetSourceReader.java @@ -211,10 +211,10 @@ private void ensureReaderOpen() throws IOException { private List getSchemaColumnNames() { List names = new ArrayList<>(); - for (ImportSchema.SourceColumn col : schema.getSourceColumns()) { - if (!col.isSkip()) { - names.add(col.getName()); - } + List srcCols = schema.getSourceColumns(); + for (int i = 0; i < srcCols.size(); i++) { + ImportSchema.SourceColumn col = srcCols.get(i); + names.add(col.isSkip() ? "_skip_" + i : col.getName()); } return names; } From 7fd33edb3f2017d8cc2d3b26f945011dc2e8d316 Mon Sep 17 00:00:00 2001 From: ziyangeng Date: Fri, 24 Apr 2026 18:12:21 +0800 Subject: [PATCH 4/5] refactor(tools): exclude unnecessary hadoop-yarn-client and hadoop-hdfs-client transitive dependencies --- java/tools/pom.xml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/java/tools/pom.xml b/java/tools/pom.xml index a03a584b1..68181feb8 100644 --- a/java/tools/pom.xml +++ b/java/tools/pom.xml @@ -69,6 +69,14 @@ org.apache.hadoop hadoop-yarn-common + + org.apache.hadoop + hadoop-yarn-client + + + org.apache.hadoop + hadoop-hdfs-client + From d84e697499edb91829dd8af04182696d51a70836 Mon Sep 17 00:00:00 2001 From: ziyangeng Date: Thu, 30 Apr 2026 15:35:25 +0800 Subject: [PATCH 5/5] fix(java-tools): address PR review comments for import tools --- java/pom.xml | 77 +++++++++++++++++++ java/tools/README-zh.md | 2 +- java/tools/README.md | 2 +- java/tools/pom.xml | 9 --- .../tsfile/tools/ArrowSourceReader.java | 12 ++- .../apache/tsfile/tools/CsvSourceReader.java | 1 + .../apache/tsfile/tools/ImportExecutor.java | 5 -- .../org/apache/tsfile/tools/ImportSchema.java | 6 +- .../tsfile/tools/ImportSchemaParser.java | 6 +- .../org/apache/tsfile/tools/TsFileTool.java | 4 +- .../tsfile/tools/ImportSchemaParserTest.java | 8 +- 11 files changed, 103 insertions(+), 29 deletions(-) diff --git a/java/pom.xml b/java/pom.xml index 7587a67e5..b09f6a015 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -86,6 +86,83 @@ 1.3.16 + + commons-cli + commons-cli + 1.9.0 + + + org.apache.parquet + parquet-hadoop + + 1.14.4 + + + org.apache.hadoop + hadoop-common + 3.3.6 + + + org.apache.hadoop + hadoop-mapreduce-client-core + 3.3.6 + + + org.apache.arrow + arrow-vector + 15.0.2 + + + org.apache.arrow + arrow-memory-unsafe + 15.0.2 + + + + org.codehaus.woodstox + stax2-api + 4.2.1 + + + commons-codec + commons-codec + 1.15 + + + com.google.guava + guava + 27.0-jre + + + commons-logging + commons-logging + 1.2 + + + org.apache.commons + commons-compress + 1.21 + + + com.fasterxml.jackson.core + jackson-databind + 2.17.0 + + + com.fasterxml.jackson.core + jackson-core + 2.17.0 + + + com.fasterxml.jackson.core + jackson-annotations + 2.17.0 + + + org.apache.commons + commons-text + 1.10.0 + diff --git a/java/tools/README-zh.md b/java/tools/README-zh.md index e25e5ca0a..0efeba2eb 100644 --- a/java/tools/README-zh.md +++ b/java/tools/README-zh.md @@ -163,7 +163,7 @@ arrow2tsfile.bat --source .\data\arrow --target .\output --fail_dir .\failed --s **Auto 模式规则:** - 时间列:必须严格命名为 `time` 或 `TIME`(区分大小写) - 其余所有列自动成为 FIELD(不自动推断标签列) -- CSV 类型推断基于前 100 行采样,提升链为:`BOOLEAN → INT64 → DOUBLE → STRING` +- CSV 
 - Parquet / Arrow 直接使用原生 schema 类型
 - 默认表名:从源文件名推导(如 `sensor.csv` → 表名 `sensor`)
 - 默认 null 识别(仅 CSV):空单元格和 `\N`
diff --git a/java/tools/README.md b/java/tools/README.md
index 3de5332bd..8da7af0d9 100644
--- a/java/tools/README.md
+++ b/java/tools/README.md
@@ -162,7 +162,7 @@ Omit `--schema` to automatically infer column types and detect the time column.
 **Auto mode rules:**
 - Time column: must be named exactly `time` or `TIME` (case-sensitive, strict match)
 - All other columns become FIELD (no tag inference)
-- CSV type inference uses a 100-row sampling window with promotion chain: `BOOLEAN → INT64 → DOUBLE → STRING`
+- CSV type inference uses a 100-row sampling window. Promotion rules: INT64 and DOUBLE promote to DOUBLE; any other mixed pair (including BOOLEAN with numeric) promotes to STRING.
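+  - For example, a column that samples 42 and 3.14 is inferred as DOUBLE; one that samples 42 and true falls back to STRING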
 - Parquet / Arrow use native schema types directly
 - Default table name: derived from source filename (e.g. `sensor.csv` → table `sensor`)
 - Default null tokens (CSV only): empty cell and `\N`
diff --git a/java/tools/pom.xml b/java/tools/pom.xml
index 68181feb8..79afd24e7 100644
--- a/java/tools/pom.xml
+++ b/java/tools/pom.xml
@@ -28,9 +28,6 @@
   <artifactId>tools</artifactId>
   <name>TsFile: Java: Tools</name>
-  <properties>
-    <maven.deploy.skip>true</maven.deploy.skip>
-  </properties>
   <dependencies>
     <dependency>
       <groupId>org.apache.tsfile</groupId>
@@ -40,7 +37,6 @@
     <dependency>
       <groupId>commons-cli</groupId>
       <artifactId>commons-cli</artifactId>
-      <version>1.9.0</version>
     </dependency>
     <dependency>
       <groupId>org.apache.tsfile</groupId>
@@ -58,12 +54,10 @@
     <dependency>
      <groupId>org.apache.parquet</groupId>
       <artifactId>parquet-hadoop</artifactId>
-      <version>1.14.4</version>
     </dependency>
     <dependency>
       <groupId>org.apache.hadoop</groupId>
       <artifactId>hadoop-mapreduce-client-core</artifactId>
-      <version>3.3.6</version>
     </dependency>
     <dependency>
       <groupId>org.apache.hadoop</groupId>
@@ -82,7 +76,6 @@
     <dependency>
       <groupId>org.apache.hadoop</groupId>
       <artifactId>hadoop-common</artifactId>
-      <version>3.3.6</version>
     </dependency>
     <dependency>
       <groupId>org.apache.hadoop</groupId>
@@ -129,12 +122,10 @@
     <dependency>
       <groupId>org.apache.arrow</groupId>
       <artifactId>arrow-vector</artifactId>
-      <version>15.0.2</version>
     </dependency>
     <dependency>
       <groupId>org.apache.arrow</groupId>
       <artifactId>arrow-memory-unsafe</artifactId>
-      <version>15.0.2</version>
     </dependency>
     <dependency>
       <groupId>junit</groupId>
diff --git a/java/tools/src/main/java/org/apache/tsfile/tools/ArrowSourceReader.java b/java/tools/src/main/java/org/apache/tsfile/tools/ArrowSourceReader.java
index 0d7081202..683396783 100644
--- a/java/tools/src/main/java/org/apache/tsfile/tools/ArrowSourceReader.java
+++ b/java/tools/src/main/java/org/apache/tsfile/tools/ArrowSourceReader.java
@@ -55,6 +55,7 @@ public class ArrowSourceReader implements SourceReader {
   private final File sourceFile;
   private ImportSchema schema;
   private BufferAllocator allocator;
+  private FileInputStream fileInputStream;
   private ArrowFileReader arrowReader;
   private Schema arrowSchema;
   private List<ArrowBlock> recordBatches;
@@ -204,6 +205,14 @@ public void close() {
       }
       arrowReader = null;
     }
+    if (fileInputStream != null) {
+      try {
+        fileInputStream.close();
+      } catch (IOException e) {
+        LOGGER.error("Error closing FileInputStream", e);
+      }
+      fileInputStream = null;
+    }
     if (allocator != null) {
       allocator.close();
       allocator = null;
@@ -213,7 +222,8 @@ public void close() {
 
   private void ensureReaderOpen() throws IOException {
     if (arrowReader == null) {
       allocator = new RootAllocator();
-      arrowReader = new ArrowFileReader(new FileInputStream(sourceFile).getChannel(), allocator);
+      fileInputStream = new FileInputStream(sourceFile);
+      arrowReader = new ArrowFileReader(fileInputStream.getChannel(), allocator);
       arrowSchema = arrowReader.getVectorSchemaRoot().getSchema();
       recordBatches = arrowReader.getRecordBlocks();
     }
diff --git a/java/tools/src/main/java/org/apache/tsfile/tools/CsvSourceReader.java b/java/tools/src/main/java/org/apache/tsfile/tools/CsvSourceReader.java
index 4db24bba6..7c4dcf1ff 100644
--- a/java/tools/src/main/java/org/apache/tsfile/tools/CsvSourceReader.java
+++ b/java/tools/src/main/java/org/apache/tsfile/tools/CsvSourceReader.java
@@ -182,6 +182,7 @@ public SourceBatch readBatch() {
       if (currentSize > 0 && currentSize + lineSize > chunkSizeBytes) {
         rows.add(parseLine(line));
+        currentSize += lineSize;
         return buildBatch(rows);
       }
diff --git a/java/tools/src/main/java/org/apache/tsfile/tools/ImportExecutor.java b/java/tools/src/main/java/org/apache/tsfile/tools/ImportExecutor.java
index 80961019e..3eb7399f5 100644
--- a/java/tools/src/main/java/org/apache/tsfile/tools/ImportExecutor.java
+++ b/java/tools/src/main/java/org/apache/tsfile/tools/ImportExecutor.java
@@ -44,11 +44,6 @@ public ImportExecutor(ImportSchema importSchema) {
   }
 
   public boolean execute(SourceReader reader, String outputDir, String sourceBaseName) {
-    return execute(reader, outputDir, sourceBaseName, null);
-  }
-
-  public boolean execute(
-      SourceReader reader, String outputDir, String sourceBaseName, String failDir) {
     try {
       Files.createDirectories(Paths.get(outputDir));
     } catch (IOException e) {
diff --git a/java/tools/src/main/java/org/apache/tsfile/tools/ImportSchema.java b/java/tools/src/main/java/org/apache/tsfile/tools/ImportSchema.java
index 234cd6a20..9ceb0c419 100644
--- a/java/tools/src/main/java/org/apache/tsfile/tools/ImportSchema.java
+++ b/java/tools/src/main/java/org/apache/tsfile/tools/ImportSchema.java
@@ -66,8 +66,8 @@ public String getDefaultValue() {
       return defaultValue;
     }
 
-    public boolean existsInSource() {
-      return !hasDefault;
+    public boolean isVirtual() {
+      return hasDefault;
     }
 
     @Override
@@ -134,7 +134,7 @@ public String toString() {
   public List<SourceColumn> fieldColumns() {
     Set<String> tagNames = new HashSet<>();
     for (TagColumn tag : tagColumns) {
-      if (tag.existsInSource()) {
+      if (!tag.isVirtual()) {
         tagNames.add(tag.getName());
       }
     }
diff --git a/java/tools/src/main/java/org/apache/tsfile/tools/ImportSchemaParser.java b/java/tools/src/main/java/org/apache/tsfile/tools/ImportSchemaParser.java
index 28410b421..69a92cbfc 100644
--- a/java/tools/src/main/java/org/apache/tsfile/tools/ImportSchemaParser.java
+++ b/java/tools/src/main/java/org/apache/tsfile/tools/ImportSchemaParser.java
@@ -165,8 +165,8 @@ private static TSDataType resolveDataType(String typeStr) {
 
   private static void validate(ImportSchema schema) {
     String tp = schema.getTimePrecision();
-    if (!"ms".equals(tp) && !"us".equals(tp) && !"ns".equals(tp)) {
-      throw new IllegalArgumentException("time_precision must be ms, us, or ns");
+    if (!"ms".equals(tp) && !"us".equals(tp) && !"ns".equals(tp) && !"s".equals(tp)) {
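+      // "s" (seconds) is accepted to match the documented time_precision values: ms / us / ns / s.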
+      throw new IllegalArgumentException("time_precision must be ms, us, ns, or s");
     }
 
     String sep = schema.getSeparator();
@@ -205,7 +205,7 @@ private static void validate(ImportSchema schema) {
       }
     }
     for (ImportSchema.TagColumn tag : schema.getTagColumns()) {
-      if (tag.existsInSource() && !sourceNames.contains(tag.getName())) {
+      if (!tag.isVirtual() && !sourceNames.contains(tag.getName())) {
         throw new IllegalArgumentException(
             "tag_columns '" + tag.getName() + "' not found in source_columns");
       }
diff --git a/java/tools/src/main/java/org/apache/tsfile/tools/TsFileTool.java b/java/tools/src/main/java/org/apache/tsfile/tools/TsFileTool.java
index fb3251112..bb68b99e1 100644
--- a/java/tools/src/main/java/org/apache/tsfile/tools/TsFileTool.java
+++ b/java/tools/src/main/java/org/apache/tsfile/tools/TsFileTool.java
@@ -165,7 +165,7 @@ private static void processSchemaMode(
       File inputFile, String baseName, String outputDir, String format) {
     try (SourceReader reader = createSchemaReader(inputFile, format)) {
       ImportExecutor importExecutor = new ImportExecutor(importSchema);
-      boolean success = importExecutor.execute(reader, outputDir, baseName, failedDirectoryStr);
+      boolean success = importExecutor.execute(reader, outputDir, baseName);
       if (success) {
         LOGGER.info(baseName + ".tsfile successfully generated");
       } else {
@@ -182,7 +182,7 @@ private static void processAutoMode(
     try (SourceReader reader = createAutoReader(inputFile, format)) {
       ImportSchema autoSchema = reader.inferSchema();
       ImportExecutor importExecutor = new ImportExecutor(autoSchema);
-      boolean success = importExecutor.execute(reader, outputDir, baseName, failedDirectoryStr);
+      boolean success = importExecutor.execute(reader, outputDir, baseName);
       if (success) {
         LOGGER.info(baseName + ".tsfile successfully generated");
       } else {
diff --git a/java/tools/src/test/java/org/apache/tsfile/tools/ImportSchemaParserTest.java b/java/tools/src/test/java/org/apache/tsfile/tools/ImportSchemaParserTest.java
index 1efa66c2d..473bdae37 100644
--- a/java/tools/src/test/java/org/apache/tsfile/tools/ImportSchemaParserTest.java
+++ b/java/tools/src/test/java/org/apache/tsfile/tools/ImportSchemaParserTest.java
@@ -105,10 +105,10 @@ public void testNewSchemaFormat() throws Exception {
     assertEquals("Group", tags.get(0).getName());
     assertTrue(tags.get(0).hasDefault());
     assertEquals("Datang", tags.get(0).getDefaultValue());
-    assertFalse(tags.get(0).existsInSource());
+    assertTrue(tags.get(0).isVirtual());
     assertEquals("Region", tags.get(1).getName());
     assertFalse(tags.get(1).hasDefault());
-    assertTrue(tags.get(1).existsInSource());
+    assertFalse(tags.get(1).isVirtual());
     assertEquals("FactoryNumber", tags.get(2).getName());
 
     List<ImportSchema.SourceColumn> srcCols = schema.getSourceColumns();
@@ -247,10 +247,10 @@ public void testDefaultTagColumn() throws Exception {
 
     assertTrue(tags.get(0).hasDefault());
     assertEquals("beijing", tags.get(0).getDefaultValue());
-    assertFalse(tags.get(0).existsInSource());
+    assertTrue(tags.get(0).isVirtual());
 
     assertFalse(tags.get(1).hasDefault());
-    assertTrue(tags.get(1).existsInSource());
+    assertFalse(tags.get(1).isVirtual());
 
     List<ImportSchema.SourceColumn> fields = schema.fieldColumns();
     assertEquals(1, fields.size());