Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
af6169a
feat(tantivy): Tantivy-fts global index integration via Rust FFI
spaces-X May 14, 2026
1809c55
test(tantivy): Java <-> C++ tantivy archive cross-read compatibility …
spaces-X May 14, 2026
a7346ad
chore: CI / dev container / sanitizer + cross-platform fixes
spaces-X May 14, 2026
35711b9
fix(tantivy): fix io_meta is null and jieba dir not be set
spaces-X May 21, 2026
56ae38d
chore(tantivy_ffi): install log bridge
spaces-X May 26, 2026
be16850
refactor(tantivy_ffi): Read row_id fast field inline via custom colle…
spaces-X Jun 1, 2026
c513145
feat(tantivy_ffi): unscored LIMIT pushdown via LimitedDocSetCollector
spaces-X May 27, 2026
2432970
feat(tantivy): add min_score threshold filtering to FullTextSearch
wxl24life May 29, 2026
0b1b4f0
fix(tantivy): adapt to GlobalIndexWriter / GlobalIndexIOMeta API change
spaces-X May 29, 2026
397bead
fix(tantivy): preserve full-text pre-filter score semantics
spaces-X Jun 8, 2026
0fe910b
ci(tantivy): use Rust 1.88 and skip tantivy on gcc-8
spaces-X Jun 8, 2026
ed466a9
chore(tantivy): use the full Apache license header in tantivy sources
spaces-X Jun 8, 2026
5958931
style(tantivy): satisfy pre-commit (clang-format, cmake-format, cppli…
spaces-X Jun 8, 2026
d749ead
fix(build): fix clang error and sanitizer error
spaces-X Jun 8, 2026
9f38da6
fix(build): make test_utils objlib depend on googletest_ep
spaces-X Jun 9, 2026
3853929
chore(tantivy): translate Chinese code comments to English
spaces-X Jun 9, 2026
8aa3179
refactor(tantivy): share jieba dict env lookup and complete license h…
spaces-X Jun 10, 2026
bf4c752
fix(tantivy): address PR review feedback
spaces-X Jun 12, 2026
f7b18b7
Merge branch 'main' into baseline-tantivy
lxy-9602 Jun 13, 2026
37aa4a7
fix(tantivy): address second-round review feedback
spaces-X Jun 14, 2026
719597f
Merge branch 'main' into baseline-tantivy
spaces-X Jun 15, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 32 additions & 2 deletions .devcontainer/Dockerfile.template
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,32 @@

# Adapted from Apache Iceberg C++
# https://github.com/apache/iceberg-cpp/blob/main/.devcontainer/Dockerfile.template

#
# This Dockerfile is used to build a development container for Paimon C++.
# It is based on the Ubuntu image and installs necessary dependencies.
# Base: Ubuntu 24.04. Rust toolchain is installed via Dev Container
# Feature `ghcr.io/devcontainers/features/rust:1` (see devcontainer.json),
# so it does NOT appear in this Dockerfile.

FROM ubuntu:24.04

# Switch apt to Aliyun mirror for faster downloads (covers both

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hardcoded Aliyun/USTC mirrors will break or slow down builds for contributors outside mainland China. Consider parameterizing via ARG so the mirror URL can be overridden at build time without editing the Dockerfile:

ARG APT_MIRROR=http://archive.ubuntu.com/ubuntu
RUN sed -i "s|http://archive.ubuntu.com/ubuntu|${APT_MIRROR}|g" ...

Alternatively, move the mirror setup into a separate optional script.

# x86_64 archive.ubuntu.com and aarch64 ports.ubuntu.com paths).
# If you are outside mainland China or your network has its own internal
# mirror, edit or remove this block.
# Mirror base URLs are build args so they can be overridden at build time
# without editing this file, e.g. to go back to the upstream servers:
#   --build-arg APT_MIRROR=http://archive.ubuntu.com/ubuntu
#   --build-arg APT_PORTS_MIRROR=http://ports.ubuntu.com/ubuntu-ports
# The defaults keep the previous behaviour (Aliyun).
ARG APT_MIRROR=http://mirrors.aliyun.com/ubuntu
ARG APT_PORTS_MIRROR=http://mirrors.aliyun.com/ubuntu-ports
# Double quotes so the shell expands the build args inside the sed scripts.
# When an arg equals the upstream URL the matching substitution is a no-op.
RUN sed -i \
    -e "s|http://archive.ubuntu.com/ubuntu|${APT_MIRROR}|g" \
    -e "s|http://security.ubuntu.com/ubuntu|${APT_MIRROR}|g" \
    -e "s|http://ports.ubuntu.com/ubuntu-ports|${APT_PORTS_MIRROR}|g" \
    /etc/apt/sources.list.d/ubuntu.sources

# Point rustup at USTC mirror so the Dev Container Feature
# `ghcr.io/devcontainers/features/rust:1` (and any later `rustup` calls)
# download the Rust toolchain from a China-friendly CDN instead of
# the default static.rust-lang.org. Set as ENV so it is inherited by
# every subsequent layer (including features installed after this image).
# The mirror root is a build arg so it can be overridden at build time, e.g.
#   --build-arg RUSTUP_MIRROR=https://static.rust-lang.org
# The default keeps the previous behaviour (USTC).
ARG RUSTUP_MIRROR=https://mirrors.ustc.edu.cn/rust-static
ENV RUSTUP_DIST_SERVER=${RUSTUP_MIRROR} \
    RUSTUP_UPDATE_ROOT=${RUSTUP_MIRROR}/rustup

# Install necessary packages
RUN apt update && \
apt install -y \
Expand All @@ -48,6 +68,16 @@ RUN apt update && \
vim \
wget \
sudo \
# ---- additions for tantivy-fts migration (Rust + Sanitizer + LLVM) ----
clang \
clang-format \
clang-tidy \
lld \
llvm \
libclang-rt-dev \
gdb \
lldb \
valgrind \
&& rm -rf /var/lib/apt/lists/*

# Add a user for development
Expand Down
239 changes: 239 additions & 0 deletions .devcontainer/centos7/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,239 @@
# Copyright 2026-present Alibaba Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# CentOS 7 cross-build verification image for paimon-cpp + tantivy-fts.
#
# Purpose:
# Prove the tantivy-fts stack builds on the OLDEST reasonable Linux target
# (glibc 2.17, EOL 2024-06-30). The default Ubuntu 24.04 dev container
# proves nothing about glibc compatibility; this image does.
#
# Build:
# docker build -t paimon-cpp-centos7:latest -f .devcontainer/centos7/Dockerfile .
#
# Run:
# docker run -d --name paimon-centos7 \
# --privileged \
# -v "$(pwd):/workspaces/paimon-cpp" \
# paimon-cpp-centos7:latest sleep infinity
# docker exec -it paimon-centos7 bash -l
#
# Inside the container:
# scl enable devtoolset-11 rh-python38 -- bash # activate modern gcc + python
# source /opt/paimon-env.sh # PATH for rust, cmake
# cd /workspaces/paimon-cpp
# git lfs install --local && git lfs pull # critical: boost & friends are LFS
# ./scripts/tantivy_smoke.sh

# ---------- Base ----------
# CentOS 7 reached EOL 2024-06-30 and its default mirrorlist.centos.org is down,
# so the yum repos shipped in the stock image no longer resolve. They are
# repointed at a CentOS vault mirror below to avoid retired-mirror failures on `yum install`.
#
# Base image: we pull from quay.io (CentOS community's canonical registry post
# Docker Hub deprecation). Override with CENTOS7_IMAGE build arg when behind a
# firewall that can't reach quay.io (e.g. registry.aliyuncs.com/library/centos:7).
# Base image reference; replace it at build time with
#   --build-arg CENTOS7_IMAGE=<image>
ARG CENTOS7_IMAGE=quay.io/centos/centos:centos7
FROM $CENTOS7_IMAGE

# Repoint yum at aliyun's CentOS 7 vault mirror — vault.centos.org itself
# works but is slow/blocked from many CN networks; the aliyun mirror is a
# complete rsync and reliably fast. We overwrite CentOS-Base.repo rather
# than sed-patch it so the result is deterministic regardless of what the
# upstream image ships. fastestmirror plugin is disabled because its ping
# probes against the retired mirror list add ~60s to every `yum install`.
# Package signatures are verified (gpgcheck=1) against the CentOS 7 signing
# key, which is expected at /etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-7 in a stock
# CentOS 7 image. The lines of the quoted repo file must stay at column 0:
# they are joined by Dockerfile line continuation, so leading whitespace
# would end up inside the generated .repo file.
RUN echo -e '[base]\n\
name=CentOS-7 - Base - aliyun vault\n\
baseurl=https://mirrors.aliyun.com/centos-vault/7.9.2009/os/$basearch/\n\
gpgcheck=1\n\
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-7\n\
enabled=1\n\
\n\
[updates]\n\
name=CentOS-7 - Updates - aliyun vault\n\
baseurl=https://mirrors.aliyun.com/centos-vault/7.9.2009/updates/$basearch/\n\
gpgcheck=1\n\
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-7\n\
enabled=1\n\
\n\
[extras]\n\
name=CentOS-7 - Extras - aliyun vault\n\
baseurl=https://mirrors.aliyun.com/centos-vault/7.9.2009/extras/$basearch/\n\
gpgcheck=1\n\
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-7\n\
enabled=1\n\
\n\
[centosplus]\n\
name=CentOS-7 - Plus - aliyun vault\n\
baseurl=https://mirrors.aliyun.com/centos-vault/7.9.2009/centosplus/$basearch/\n\
gpgcheck=1\n\
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-7\n\
enabled=0\n' > /etc/yum.repos.d/CentOS-Base.repo \
 && rm -f /etc/yum.repos.d/CentOS-CR.repo \
    /etc/yum.repos.d/CentOS-Debuginfo.repo \
    /etc/yum.repos.d/CentOS-Media.repo \
    /etc/yum.repos.d/CentOS-Sources.repo \
    /etc/yum.repos.d/CentOS-Vault.repo \
    /etc/yum.repos.d/CentOS-fasttrack.repo \
    /etc/yum.repos.d/CentOS-x86_64-kernel.repo \
 && if [ -f /etc/yum/pluginconf.d/fastestmirror.conf ]; then \
      sed -i 's/^enabled=1/enabled=0/' /etc/yum/pluginconf.d/fastestmirror.conf; \
    fi \
 && yum clean all \
 && yum makecache

# ---------- Base toolchain ----------
# EPEL provides git-lfs, ninja-build, a newer python3 than the base 3.6.
# SCL (Software Collections) provides devtoolset-11 (gcc 11) and rh-python38
# without overriding the system gcc/python. CentOS 7's default gcc 4.8 is
# too old for C++17/20 used by lucene++ and our tantivy wrapper.
#
# Same story as CentOS-Base.repo: both epel + SCL default to mirrorlist
# endpoints that are effectively dead; overwrite with aliyun URLs that we
# know respond.
# Steps, in order:
#   1. install the epel-release / centos-release-scl packages (they also
#      bring the repositories' GPG keys under /etc/pki/rpm-gpg),
#   2. overwrite the repo files they ship with aliyun URLs,
#   3. rebuild the yum cache and install the toolchain packages.
# Signatures are verified (gpgcheck=1). NOTE(review): the key file names below
# are the ones epel-release and centos-release-scl-rh are expected to install;
# confirm against the image. The lines of the quoted repo files must stay at
# column 0 because they are joined by Dockerfile line continuation.
RUN yum install -y epel-release centos-release-scl \
 && echo -e '[epel]\n\
name=Extra Packages for Enterprise Linux 7 - aliyun\n\
baseurl=https://mirrors.aliyun.com/epel/7/$basearch\n\
gpgcheck=1\n\
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-7\n\
enabled=1\n' > /etc/yum.repos.d/epel.repo \
 && rm -f /etc/yum.repos.d/epel-testing.repo /etc/yum.repos.d/epel.repo.rpmnew \
 && rm -f /etc/yum.repos.d/CentOS-SCLo-*.repo \
    /etc/yum.repos.d/CentOS-SCLo-*.repo.rpmnew \
 && echo -e '[centos-sclo-rh]\n\
name=CentOS-7 - SCLo rh - aliyun vault\n\
baseurl=https://mirrors.aliyun.com/centos-vault/7.9.2009/sclo/$basearch/rh/\n\
gpgcheck=1\n\
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-SIG-SCLo\n\
enabled=1\n\
\n\
[centos-sclo-sclo]\n\
name=CentOS-7 - SCLo sclo - aliyun vault\n\
baseurl=https://mirrors.aliyun.com/centos-vault/7.9.2009/sclo/$basearch/sclo/\n\
gpgcheck=1\n\
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-SIG-SCLo\n\
enabled=1\n' > /etc/yum.repos.d/CentOS-SCLo-scl.repo \
 && yum clean all && yum makecache \
 && yum install -y \
    devtoolset-11-gcc \
    devtoolset-11-gcc-c++ \
    devtoolset-11-binutils \
    devtoolset-11-libasan-devel \
    devtoolset-11-libubsan-devel \
    rh-python38 \
    rh-python38-python-pip \
    git \
    git-lfs \
    ninja-build \
    make \
    patch \
    curl \
    wget \
    unzip \
    which \
    file \
    sudo \
    openssl-devel \
    zlib-devel \
    libffi-devel \
    bzip2-devel \
    xz-devel \
    perl-IPC-Cmd \
 && yum clean all

# Enable the SCL collections for all subsequent shells (including RUN).
# RUN steps from here on execute through `bash -c`, and BASH_ENV names the
# file those non-interactive bash shells read at startup. The file itself is
# written by the RUN below and enables both SCL collections.
SHELL ["/bin/bash", "-c"]
ENV BASH_ENV=/etc/profile.d/scl-enable.sh
RUN { \
      echo 'source scl_source enable devtoolset-11'; \
      echo 'source scl_source enable rh-python38'; \
    } > /etc/profile.d/scl-enable.sh \
 && chmod +x /etc/profile.d/scl-enable.sh

# ---------- CMake (must be >= 3.22 for Corrosion) ----------
# CentOS 7's cmake package is 2.8.12; EPEL cmake3 is 3.17 — still too old.
# Install via pip in the rh-python38 SCL so we get a modern CMake without
# touching the system /usr/bin. Point pip at aliyun's pypi mirror: default
# pypi.org is 10-30s per request from CN, aliyun responds in <1s.
ENV PIP_INDEX_URL=https://mirrors.aliyun.com/pypi/simple/ \
    PIP_TRUSTED_HOST=mirrors.aliyun.com
# --no-cache-dir keeps pip's download/wheel cache out of the image layer.
# The explicit `source` activates the SCL collections so that `python3`
# resolves to the rh-python38 interpreter.
RUN source /etc/profile.d/scl-enable.sh \
 && python3 -m pip install --no-cache-dir --upgrade pip \
 && python3 -m pip install --no-cache-dir 'cmake==3.28.*' ninja

# ---------- Rust toolchain ----------
# Install rustup as root into /opt/rust so all users share the same toolchain.
# Use the USTC mirror to keep downloads fast in CN; the CI runner version of
# this is mirrored in ci/scripts/setup_rust.sh.
# Install locations for rustup and cargo, both under /opt/rust.
ENV RUSTUP_HOME=/opt/rust/rustup
ENV CARGO_HOME=/opt/rust/cargo
# Download endpoints rustup uses for toolchains and for its own updates.
ENV RUSTUP_DIST_SERVER=https://mirrors.ustc.edu.cn/rust-static
ENV RUSTUP_UPDATE_ROOT=https://mirrors.ustc.edu.cn/rust-static/rustup
# In-container network for Docker Desktop builds is unreliable through many
# CN mirrors (observed: curl 7.29 on CentOS 7 + rsproxy.cn HTTP/2 path ⇒
# partial-read truncations; USTC ⇒ 5xx; rustup sh installer ⇒ 403 from
# legacy cipher). The most reliable fix is to sidestep the issue entirely:
# pre-download rustup-init on the host (where network is solid) and COPY it
# into the image. See .devcontainer/centos7/run.sh for the prefetch step.
COPY .devcontainer/centos7/rustup-init.bin /tmp/rustup-init
# Toolchain handed to rustup-init. Defaults to `stable` (the previous
# behaviour); pin it for reproducible images, e.g.
#   --build-arg RUST_TOOLCHAIN=1.88.0
ARG RUST_TOOLCHAIN=stable
# Steps: run the prefetched installer, write a cargo config that redirects
# crates.io to rsproxy, install cbindgen, drop the registry cache that
# `cargo install` populated (cargo recreates it on demand), open /opt/rust
# to all users, then print the tool versions as a build-time sanity check.
# The lines of the quoted config.toml must stay at column 0 because they are
# joined by Dockerfile line continuation.
RUN chmod +x /tmp/rustup-init \
 && /tmp/rustup-init -y --default-toolchain "${RUST_TOOLCHAIN}" --profile minimal --no-modify-path \
 && rm -f /tmp/rustup-init \
 && mkdir -p "$CARGO_HOME" \
 && echo -e '[source.crates-io]\n\
replace-with = "rsproxy-sparse"\n\
\n\
[source.rsproxy]\n\
registry = "https://rsproxy.cn/crates.io-index"\n\
\n\
[source.rsproxy-sparse]\n\
registry = "sparse+https://rsproxy.cn/index/"\n\
\n\
[registries.rsproxy]\n\
index = "https://rsproxy.cn/crates.io-index"\n\
\n\
[net]\n\
git-fetch-with-cli = true\n' > "$CARGO_HOME/config.toml" \
 && "$CARGO_HOME/bin/cargo" install cbindgen --version 0.29.2 --locked \
 && rm -rf "$CARGO_HOME/registry" \
 && chmod -R a+rwx /opt/rust \
 && "$CARGO_HOME/bin/rustc" --version \
 && "$CARGO_HOME/bin/cargo" --version \
 && "$CARGO_HOME/bin/cbindgen" --version

# ---------- Environment file consumed by every shell ----------
# /opt/paimon-env.sh prepends the cargo bin directory to PATH. It is sourced
# from /etc/profile.d/scl-enable.sh, so login shells such as
# `docker exec paimon-centos7 bash -l` have the Rust tools on $PATH.
RUN { \
      echo 'export PATH=/opt/rust/cargo/bin:$PATH'; \
      echo '# cmake + ninja live under the rh-python38 SCL; path prefix differs by arch.'; \
      echo '# `command -v cmake` confirms which one is in use.'; \
    } > /opt/paimon-env.sh \
 && chmod +x /opt/paimon-env.sh \
 && echo 'source /opt/paimon-env.sh' >> /etc/profile.d/scl-enable.sh

# ---------- Non-root user ----------
# Build as `paimon` (uid 1000) so LFS objects under the mount stay owned by
# your host user, matching the main Ubuntu dev container.
# Create the `paimon` account (uid 1000, bash login shell, home directory)
# and grant it passwordless sudo. The sudoers drop-in is set to mode 0440,
# the conventional permission for sudoers files, instead of being left at
# the umask default.
RUN useradd -m -u 1000 -s /bin/bash paimon \
 && echo 'paimon ALL=(ALL) NOPASSWD:ALL' > /etc/sudoers.d/paimon \
 && chmod 0440 /etc/sudoers.d/paimon

# The default container process runs as the unprivileged `paimon` user
# created above.
USER paimon
# Default working directory; the `docker run` example in the file header
# bind-mounts the repository at this path.
WORKDIR /workspaces/paimon-cpp

# Default command: a sanity check that prints the OS release and the versions
# of the tools in the image. Run it with no extra arguments, e.g.
#   docker run --rm paimon-cpp-centos7:latest
# Arguments placed after the image name replace this CMD; they are not passed to it.
# NOTE(review): `bash -l` presumably picks up /etc/profile.d/scl-enable.sh via
# /etc/profile so gcc/cmake/cargo resolve — confirm on the image.
CMD ["bash", "-lc", "\
echo '--- CentOS 7 cross-build image sanity check ---'; \
cat /etc/centos-release; \
echo '--- glibc ---'; ldd --version | head -1; \
echo '--- gcc ---'; gcc --version | head -1; \
echo '--- cmake ---'; cmake --version | head -1; \
echo '--- ninja ---'; ninja --version; \
echo '--- rust ---'; rustc --version; \
echo '--- cargo ---'; cargo --version; \
echo '--- cbindgen ---'; cbindgen --version; \
echo 'Ready. Mount paimon-cpp at /workspaces/paimon-cpp and run ./scripts/tantivy_smoke.sh'"]
Loading
Loading