Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 84 additions & 0 deletions include/paimon/table/source/data_split.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#pragma once

#include <cstddef>
#include <cstdint>
#include <memory>
#include <optional>
#include <string>
#include <vector>

#include "paimon/data/timestamp.h"
#include "paimon/memory/memory_pool.h"
#include "paimon/result.h"
#include "paimon/table/source/split.h"
#include "paimon/visibility.h"

namespace paimon {
class MemoryPool;

/// Input data split for reading operation. Needed by most batch computation engines.
class PAIMON_EXPORT DataSplit : public Split {
public:
/// Metadata structure for simple data files.
///
/// Contains essential information about a data file including its location,
/// size, row count, sequence numbers, schema information, and timestamps.
/// This structure is used to track file metadata without loading the actual file content.
struct SimpleDataFileMeta {
SimpleDataFileMeta(const std::string& _file_path, int64_t _file_size, int64_t _row_count,
int64_t _min_sequence_number, int64_t _max_sequence_number,
int64_t _schema_id, int32_t _level, const Timestamp& _creation_time,
const std::optional<int64_t>& _delete_row_count)
: file_path(_file_path),
file_size(_file_size),
row_count(_row_count),
min_sequence_number(_min_sequence_number),
max_sequence_number(_max_sequence_number),
schema_id(_schema_id),
level(_level),
creation_time(_creation_time),
delete_row_count(_delete_row_count) {}

/// Absolute path of the data file.
///
/// If external path is enabled, `file_path` indicates the actual location in the external
/// storage system.
std::string file_path;
int64_t file_size;
int64_t row_count;
int64_t min_sequence_number;
int64_t max_sequence_number;
int64_t schema_id;
int32_t level;
Timestamp creation_time;
std::optional<int64_t> delete_row_count;

bool operator==(const SimpleDataFileMeta& other) const;

std::string ToString() const;
};

/// Get the list of metadata for all data files in this split.
/// @note This method will be removed in future versions and is only used for append tables.
virtual std::vector<SimpleDataFileMeta> GetFileList() const = 0;
};
} // namespace paimon
38 changes: 38 additions & 0 deletions include/paimon/table/source/plan.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#pragma once

#include <memory>
#include <optional>
#include <vector>

#include "paimon/table/source/split.h"

namespace paimon {
/// %Result plan of this `TableScan`.
class PAIMON_EXPORT Plan {
public:
virtual ~Plan() = default;
/// %Result splits.
virtual const std::vector<std::shared_ptr<Split>>& Splits() const = 0;
/// Snapshot id of this plan, return `std::nullopt` if the table is empty.
virtual std::optional<int64_t> SnapshotId() const = 0;
};
} // namespace paimon
69 changes: 69 additions & 0 deletions include/paimon/table/source/split.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#pragma once

#include <cstddef>
#include <cstdint>
#include <memory>
#include <optional>
#include <string>
#include <vector>

#include "paimon/memory/memory_pool.h"
#include "paimon/result.h"
#include "paimon/visibility.h"

namespace paimon {
class MemoryPool;

/// An input split for reading operation. Needed by most batch computation engines. Support
/// Serialize and Deserialize, compatible with java version.
/// This split can be either a `DataSplit` (for direct data file reads) or an `IndexedSplit`
/// (for reads leveraging global indexes).
class PAIMON_EXPORT Split {
public:
virtual ~Split() = default;

/// Deserialize a `Split` from a binary buffer.
///
/// Creates a `Split` instance from its serialized binary representation.
/// This is typically used in distributed computing scenarios where splits
/// are transmitted between different nodes or processes.
///
/// @param buffer Const pointer to the binary data containing the serialized `Split`.
/// @param length Size of the buffer in bytes.
/// @param pool Memory pool for allocating objects during deserialization.
/// @return Result containing the deserialized `Split` or an error status.
static Result<std::shared_ptr<Split>> Deserialize(const char* buffer, size_t length,
const std::shared_ptr<MemoryPool>& pool);

/// Serialize a `Split` to a binary string.
///
/// Converts a `Split` instance to its binary representation for storage
/// or transmission. The serialized data can later be deserialized using
/// the Deserialize method.
///
/// @param split The `Split` instance to serialize.
/// @param pool Memory pool for allocating temporary objects during serialization.
/// @return Result containing the serialized binary data as a string or an error status.
static Result<std::string> Serialize(const std::shared_ptr<Split>& split,
const std::shared_ptr<MemoryPool>& pool);
};
} // namespace paimon
71 changes: 71 additions & 0 deletions include/paimon/table/source/startup_mode.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#pragma once
#include <string>

#include "paimon/result.h"
#include "paimon/visibility.h"

namespace paimon {
/// Specifies the startup mode for log consumer.
class PAIMON_EXPORT StartupMode {
public:
/// Determines actual startup mode according to other table properties. If "scan.snapshot-id" is
/// set the actual startup mode will be "from-snapshot", otherwise the actual startup mode will
/// be "latest-full".
static const StartupMode Default();

/// For streaming sources, produces the latest snapshot on the table upon first startup, and
/// continue to read the latest changes. For batch sources, just produce the latest snapshot but
/// does not read new changes.
static const StartupMode LatestFull();

/// For streaming sources, continuously reads latest changes without producing a snapshot at the
/// beginning. For batch sources, behaves the same as the "latest-full" startup mode.
static const StartupMode Latest();

/// For streaming sources, continuously reads changes starting from snapshot specified by
/// "scan.snapshot-id", without producing a snapshot at the beginning. For batch sources,
/// produces a snapshot specified by "scan.snapshot-id" but does not read new changes.
static const StartupMode FromSnapshot();

/// For streaming sources, produces from snapshot specified by "scan.snapshot-id" on the table
/// upon first startup, and continuously reads changes. For batch sources, produces a snapshot
/// specified by "scan.snapshot-id" but does not read new changes
static const StartupMode FromSnapshotFull();

/// Starts from a timestamp specified by either "scan.timestamp-millis" or
/// "scan.timestamp". For batch sources, produces the latest snapshot whose
/// timestamp is <= the specified timestamp. For streaming sources, continuously
/// reads changes starting from the first snapshot at or after the timestamp.
static const StartupMode FromTimestamp();

public:
std::string ToString() const;
bool operator==(const StartupMode& other) const;
static Result<StartupMode> FromString(const std::string& str);

private:
explicit StartupMode(const std::string& value) : value_(value) {}

private:
std::string value_;
};
} // namespace paimon
79 changes: 79 additions & 0 deletions include/paimon/table/source/table_read.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#pragma once

#include <memory>
#include <vector>

#include "paimon/executor.h"
#include "paimon/memory/memory_pool.h"
#include "paimon/read_context.h"
#include "paimon/reader/batch_reader.h"
#include "paimon/result.h"
#include "paimon/table/source/split.h"
#include "paimon/visibility.h"

namespace paimon {
class MemoryPool;
class ReadContext;

/// Given a `Split` or a list of `Split`, generate a reader for batch reading.
class PAIMON_EXPORT TableRead {
public:
virtual ~TableRead() = default;

/// Create an instance of `TableRead`.
///
/// @param context A unique pointer to the `ReadContext` used for read operations.
/// @return A Result containing a unique pointer to the `TableRead` instance.
static Result<std::unique_ptr<TableRead>> Create(std::unique_ptr<ReadContext> context);

/// Creates a `BatchReader` instance for reading data.
///
/// This method creates a BatchReader that will be responsible for reading data from the
/// provided splits.
///
/// @param splits A vector of shared pointers to `Split` instances representing the
/// data to be read.
/// @return A Result containing a unique pointer to the `BatchReader` instance.
/// @note `BatchReader`s created by the same `TableRead` are not thread-safe for
/// concurrent reading.
virtual Result<std::unique_ptr<BatchReader>> CreateReader(
const std::vector<std::shared_ptr<Split>>& splits);

/// Creates a `BatchReader` instance for a single split.
///
/// @param split A shared pointer to the `Split` instance that defines the data to be
/// read.
/// @return A Result containing a unique pointer to the `BatchReader` instance.
virtual Result<std::unique_ptr<BatchReader>> CreateReader(
const std::shared_ptr<Split>& split) = 0;

protected:
explicit TableRead(const std::shared_ptr<MemoryPool>& memory_pool);

std::shared_ptr<MemoryPool> GetMemoryPool() const {
return pool_;
}

private:
std::shared_ptr<MemoryPool> pool_;
};
} // namespace paimon
48 changes: 48 additions & 0 deletions include/paimon/table/source/table_scan.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#pragma once

#include <memory>

#include "paimon/result.h"
#include "paimon/table/source/plan.h"
#include "paimon/type_fwd.h"
#include "paimon/visibility.h"

namespace paimon {
class ScanContext;

/// A scanner interface for reading table's meta and create a plan.
class PAIMON_EXPORT TableScan {
public:
/// Create an instance of `TableScan`.
///
/// @param context A unique pointer to the `ScanContext` used for scan operations.
/// @return A Result containing a unique pointer to the `TableScan` instance.
static Result<std::unique_ptr<TableScan>> Create(std::unique_ptr<ScanContext> context);

virtual ~TableScan() = default;

/// Create a scan plan.
///
/// @return A Result containing a shared pointer to the created `Plan` or an error status.
virtual Result<std::shared_ptr<Plan>> CreatePlan() = 0;
};
} // namespace paimon
Loading