Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,20 @@ run-e2e-aio: protogen-go
@echo 'Running e2e AIO tests'
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e-aio

# vLLM multi-node DP smoke (CPU). Builds local-ai:tests and the
# cpu-vllm backend from the current working tree, then drives a
# head + headless follower via testcontainers-go and asserts a chat
# completion. BuildKit caches both images, so re-runs only rebuild
# what changed. The test lives under tests/e2e/distributed and is
# selected by the VLLMMultinode label so it doesn't run alongside
# the other distributed-suite tests by default.
#
# Declared .PHONY: this target never produces a file named after
# itself, so without the declaration a stray file called
# `test-e2e-vllm-multinode` would silently make it a no-op.
.PHONY: test-e2e-vllm-multinode
test-e2e-vllm-multinode: docker-build-e2e extract-backend-vllm protogen-go
	@echo 'Running e2e vLLM multi-node DP test'
	LOCALAI_IMAGE=local-ai \
	LOCALAI_IMAGE_TAG=tests \
	LOCALAI_VLLM_BACKEND_DIR=$(abspath ./local-backends/vllm) \
	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter='VLLMMultinode' -v -r ./tests/e2e/distributed

########################################################
## E2E tests
########################################################
Expand Down Expand Up @@ -319,7 +333,7 @@ local-backends:

extract-backend-%: docker-build-% local-backends
@echo "Extracting backend $*..."
@CID=$$(docker create local-ai-backend:$*) && \
@CID=$$(docker create --entrypoint=/run.sh local-ai-backend:$*) && \
rm -rf local-backends/$* && mkdir -p local-backends/$* && \
docker cp $$CID:/ - | tar -xf - -C local-backends/$* && \
docker rm $$CID > /dev/null
Expand Down Expand Up @@ -594,6 +608,14 @@ test-extra-backend-vllm: docker-build-vllm
BACKEND_TEST_OPTIONS=tool_parser:hermes \
$(MAKE) test-extra-backend

## vllm multi-node data-parallel smoke test. Runs LocalAI head + a
## `local-ai p2p-worker vllm` follower in docker compose against
## Qwen2.5-0.5B with data_parallel_size=2. Requires 2 NVIDIA GPUs and
## nvidia-container-runtime on the host — vLLM v1's DP coordinator is
## not viable on CPU so this cannot run in CI without GPU.
##
## Declared .PHONY: the target is a command, not a file — without the
## declaration a file named `test-extra-backend-vllm-multinode` in the
## working tree would make `make` consider it up to date and skip it.
.PHONY: test-extra-backend-vllm-multinode
test-extra-backend-vllm-multinode:
	./tests/e2e/vllm-multinode/smoke.sh

## tinygrad mirrors the vllm target (same model, same caps, same parser) so
## the two backends are directly comparable. The LLM path covers Predict,
## streaming and native tool-call extraction. Companion targets below cover
Expand Down
88 changes: 77 additions & 11 deletions backend/python/vllm/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,15 @@ else
source $backend_dir/../common/libbackend.sh
fi

# This is here because the Intel pip index is broken and returns 200 status codes for every package name, it just doesn't return any package links.
# This makes uv think that the package exists in the Intel pip index, and by default it stops looking at other pip indexes once it finds a match.
# We need uv to continue falling through to the pypi default index to find optimum[openvino] in the pypi index
# the --upgrade actually allows us to *downgrade* torch to the version provided in the Intel pip index
# Intel XPU: torch==2.11.0+xpu lives on the PyTorch XPU index, transitive
# deps on PyPI — unsafe-best-match lets uv mix both. vllm-xpu-kernels only
# ships a python3.12 wheel per upstream docs, so bump the portable Python
# before installRequirements (matches the l4t13 pattern below).
# https://github.com/vllm-project/vllm/blob/main/docs/getting_started/installation/gpu.xpu.inc.md
if [ "x${BUILD_PROFILE}" == "xintel" ]; then
EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
PYTHON_VERSION="3.12"
PYTHON_PATCH="11"
EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
fi

# CPU builds need unsafe-best-match to pull torch==2.10.0+cpu from the
Expand All @@ -42,27 +45,90 @@ fi

# JetPack 7 / L4T arm64 wheels (torch, vllm, flash-attn) live on
# pypi.jetson-ai-lab.io and are built for cp312, so bump the venv Python
# accordingly. JetPack 6 keeps cp310 + USE_PIP=true. unsafe-best-match
# is required because the jetson-ai-lab index lists transitive deps at
# limited versions — without it uv pins to the first matching index and
# fails to resolve a compatible wheel from PyPI.
# accordingly. JetPack 6 keeps cp310 + USE_PIP=true.
#
# l4t13 uses pyproject.toml (see the elif branch below) to pin only the
# L4T-specific wheels to the jetson-ai-lab index via [tool.uv.sources].
# That keeps PyPI as the resolution path for transitive deps like
# anthropic/openai/propcache, which the L4T mirror's proxy 503s on.
if [ "x${BUILD_PROFILE}" == "xl4t12" ]; then
USE_PIP=true
fi
if [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
PYTHON_VERSION="3.12"
PYTHON_PATCH="12"
PY_STANDALONE_TAG="20251120"
EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
fi

# Intel XPU has no upstream-published vllm wheels, so we always build vllm
# from source against torch-xpu and replace the default triton with
# triton-xpu (matching torch 2.11). Mirrors the upstream procedure:
# https://github.com/vllm-project/vllm/blob/main/docs/getting_started/installation/gpu.xpu.inc.md
if [ "x${BUILD_TYPE}" == "xintel" ]; then
# Hide requirements-intel-after.txt so installRequirements doesn't
# try `pip install vllm` (would either fail or grab a non-XPU wheel).
_intel_after="${backend_dir}/requirements-intel-after.txt"
_intel_after_bak=""
if [ -f "${_intel_after}" ]; then
_intel_after_bak="${_intel_after}.xpu.bak"
mv "${_intel_after}" "${_intel_after_bak}"
fi
installRequirements
if [ -n "${_intel_after_bak}" ]; then
mv "${_intel_after_bak}" "${_intel_after}"
fi

# vllm's CMake build needs the Intel oneAPI dpcpp/sycl compiler — the
# base image (intel/oneapi-basekit) has it but the env isn't sourced.
if [ -f /opt/intel/oneapi/setvars.sh ]; then
set +u
source /opt/intel/oneapi/setvars.sh --force
set -u
fi

_vllm_src=$(mktemp -d)
trap 'rm -rf "${_vllm_src}"' EXIT
git clone --depth 1 https://github.com/vllm-project/vllm "${_vllm_src}/vllm"
pushd "${_vllm_src}/vllm"
# Install vllm's own runtime deps (torch-xpu, vllm_xpu_kernels,
# pydantic, fastapi, …) from upstream's requirements/xpu.txt — the
# canonical source of truth. Avoids re-pinning everything ourselves.
uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} -r requirements/xpu.txt
# Stock triton (NVIDIA-only) may have come in transitively; replace
# with triton-xpu==3.7.0 which matches torch 2.11.
uv pip uninstall triton triton-xpu 2>/dev/null || true
uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} \
--extra-index-url https://download.pytorch.org/whl/xpu \
triton-xpu==3.7.0
export CMAKE_PREFIX_PATH="$(python -c 'import site; print(site.getsitepackages()[0])'):${CMAKE_PREFIX_PATH:-}"
VLLM_TARGET_DEVICE=xpu uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} --no-deps .
popd
# L4T arm64 (JetPack 7): drive the install through pyproject.toml so that
# [tool.uv.sources] can pin torch/vllm/flash-attn/torchvision/torchaudio
# to the jetson-ai-lab index, while everything else (transitive deps and
# PyPI-resolvable packages like transformers) comes from PyPI. Bypasses
# installRequirements because uv pip install -r requirements.txt does not
# honor sources — see backend/python/vllm/pyproject.toml for the rationale.
elif [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
ensureVenv
if [ "x${PORTABLE_PYTHON}" == "xtrue" ]; then
export C_INCLUDE_PATH="${C_INCLUDE_PATH:-}:$(_portable_dir)/include/python${PYTHON_VERSION}"
fi
pushd "${backend_dir}"
# Build deps first (matches installRequirements' requirements-install.txt
# pass — fastsafetensors and friends need pybind11 in the venv before
# their sdists can build under --no-build-isolation).
uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} -r requirements-install.txt
uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} --requirement pyproject.toml
popd
runProtogen
# FROM_SOURCE=true on a CPU build skips the prebuilt vllm wheel in
# requirements-cpu-after.txt and compiles vllm locally against the host's
# actual CPU. Not used by default because it takes ~30-40 minutes, but
# kept here for hosts where the prebuilt wheel SIGILLs (CPU without the
# required SIMD baseline, e.g. AVX-512 VNNI/BF16). Default CI uses a
# bigger-runner with compatible hardware instead.
if [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE:-}" == "xtrue" ]; then
elif [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE:-}" == "xtrue" ]; then
# Temporarily hide the prebuilt wheel so installRequirements doesn't
# pull it — the rest of the requirements files (base deps, torch,
# transformers) are still installed normally.
Expand Down
104 changes: 104 additions & 0 deletions backend/python/vllm/package.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,5 +45,109 @@ copy_with_symlinks() {
copy_with_symlinks libnuma.so.1
copy_with_symlinks libgomp.so.1

# CPU profile only: bundle a g++ toolchain so torch._inductor's
# ISA probe (always run at vllm engine startup, regardless of
# enforce_eager) finds a C++ compiler. The LocalAI runtime image
# is FROM ubuntu:24.04 with a minimal apt list that does not
# include build-essential, and the backend image itself is FROM
# scratch -- so without this, cpu-vllm crashes with
# torch._inductor.exc.InvalidCxxCompiler at first inference
# unless the operator manually sets TORCH_COMPILE_DISABLE=1.
#
# We snapshot every file owned by the toolchain packages, mirroring
# the /usr/... layout into ${BACKEND}/toolchain/ so g++ can find
# cc1plus, headers, libs etc. via GCC_EXEC_PREFIX / CPATH /
# LIBRARY_PATH at runtime (libbackend.sh wires those up). Adds
# ~400 MB to the cpu-vllm image, which is tolerable -- cpu-vllm is
# already a niche profile.
if [ "${BUILD_TYPE:-}" = "" ] && command -v dpkg-query >/dev/null 2>&1; then
    TOOLCHAIN_DIR="${CURDIR}/toolchain"
    mkdir -p "${TOOLCHAIN_DIR}"
    # The unversioned g++/gcc packages on Debian/Ubuntu only ship
    # symlinks; the actual binaries live in g++-${VER}/gcc-${VER}.
    # Discover the active version so the symlink targets get bundled
    # along with their owners.
    GCC_VER=$(gcc -dumpversion 2>/dev/null | cut -d. -f1 || true)
    # `g++-${VER}` itself is just another symlink layer on Debian/
    # Ubuntu — the real binary `x86_64-linux-gnu-g++-${VER}` lives
    # in `g++-${VER}-x86-64-linux-gnu` (a separate package pulled in
    # as a dependency). Same story for gcc/cpp. Compute the dpkg
    # arch-triplet to find the right package name for both amd64 and
    # arm64 hosts.
    case "$(dpkg --print-architecture 2>/dev/null)" in
        amd64) HOST_TRIPLET="x86-64-linux-gnu" ;;
        arm64) HOST_TRIPLET="aarch64-linux-gnu" ;;
        *) HOST_TRIPLET="" ;;
    esac
    PKGS=(g++ gcc cpp libstdc++-${GCC_VER}-dev libgcc-${GCC_VER}-dev libc6 libc6-dev binutils binutils-common libbinutils libc-dev-bin linux-libc-dev libcrypt-dev libgomp1 libstdc++6 libgcc-s1 libisl23 libmpc3 libmpfr6 libjansson4 libctf0 libctf-nobfd0 libsframe1)
    if [ -n "${GCC_VER}" ]; then
        PKGS+=("g++-${GCC_VER}" "gcc-${GCC_VER}" "cpp-${GCC_VER}" "gcc-${GCC_VER}-base")
        if [ -n "${HOST_TRIPLET}" ]; then
            PKGS+=(
                "g++-${GCC_VER}-${HOST_TRIPLET}"
                "gcc-${GCC_VER}-${HOST_TRIPLET}"
                "cpp-${GCC_VER}-${HOST_TRIPLET}"
                "binutils-${HOST_TRIPLET}"
            )
        fi
    fi
    for pkg in "${PKGS[@]}"; do
        # Packages not present on this builder (e.g. the other
        # architecture's triplet-suffixed names) are skipped quietly.
        if ! dpkg-query -W "${pkg}" >/dev/null 2>&1; then
            continue
        fi
        # Copy each owned path, preserving symlinks and mode. We
        # tolerate dpkg listing directories alongside files.
        # NOTE(review): the `while` body runs in a pipeline subshell —
        # harmless here since it only copies files and sets no
        # variables read afterwards.
        dpkg -L "${pkg}" | while IFS= read -r path; do
            if [ -L "${path}" ] || [ -f "${path}" ]; then
                mkdir -p "${TOOLCHAIN_DIR}$(dirname "${path}")"
                cp -aP "${path}" "${TOOLCHAIN_DIR}${path}" 2>/dev/null || true
            fi
        done
    done
    # Ubuntu's filesystem layout has /lib -> /usr/lib (UsrMerge) and
    # /lib64 -> /usr/lib64. ld scripts (e.g. libm.so) hardcode
    # `/lib/x86_64-linux-gnu/libm.so.6`; with --sysroot the linker
    # looks for that path under the sysroot, which means we need
    # the same symlinks under TOOLCHAIN_DIR.
    [ -e "${TOOLCHAIN_DIR}/lib" ] || ln -s usr/lib "${TOOLCHAIN_DIR}/lib"
    [ -e "${TOOLCHAIN_DIR}/lib64" ] || ln -s usr/lib64 "${TOOLCHAIN_DIR}/lib64"

    # Replace the unversioned g++/gcc/cpp symlinks with wrapper
    # scripts that pass --sysroot=<toolchain> and -B <gcc-exec-prefix>.
    # Without these flags gcc would fall back to its compiled-in
    # /usr search and fail to find headers (the runtime image has no
    # libc6-dev) or fail to invoke `as`/`ld` (binutils not on PATH at
    # /usr/bin). Wrappers self-resolve their location at runtime so
    # they work from any BackendsPath.
    BIN_DIR="${TOOLCHAIN_DIR}/usr/bin"
    if [ -n "${GCC_VER}" ] && [ -n "${HOST_TRIPLET}" ]; then
        # HOST_TRIPLET in package names uses dashes ("x86-64-linux-gnu");
        # the binary suffix uses underscores in the arch part
        # ("x86_64-linux-gnu-g++-13"). Translate.
        BIN_TRIPLET=${HOST_TRIPLET//x86-64/x86_64}
        for tool in g++ gcc cpp; do
            real="${BIN_DIR}/${BIN_TRIPLET}-${tool}-${GCC_VER}"
            if [ -x "${real}" ]; then
                rm -f "${BIN_DIR}/${tool}" "${BIN_DIR}/${tool}-${GCC_VER}"
                # Unquoted EOF is deliberate: ${BIN_TRIPLET}/${GCC_VER}
                # expand NOW (bake the versioned names in), while the
                # escaped \$ forms expand when the wrapper runs.
                cat > "${BIN_DIR}/${tool}" <<EOF
#!/bin/bash
# Auto-generated by package.sh. Passes --sysroot and -B so the
# bundled toolchain works from any BackendsPath without depending
# on libc6-dev / binutils being installed at /usr in the runtime
# image. See backend/python/vllm/package.sh.
DIR="\$(dirname "\$(readlink -f "\$0")")" # …/toolchain/usr/bin
SYSROOT="\$(dirname "\$(dirname "\${DIR}")")" # …/toolchain
exec "\${DIR}/${BIN_TRIPLET}-${tool}-${GCC_VER}" \\
    -B "\${SYSROOT}/usr/lib/gcc/${BIN_TRIPLET}/${GCC_VER}/" \\
    --sysroot="\${SYSROOT}" \\
    "\$@"
EOF
                chmod +x "${BIN_DIR}/${tool}"
            fi
        done
    fi
    echo "Bundled g++ toolchain (gcc-${GCC_VER}) into ${TOOLCHAIN_DIR} ($(du -sh "${TOOLCHAIN_DIR}" | cut -f1))"
fi

echo "vllm packaging completed successfully"
ls -liah "${LIB_DIR}/"
61 changes: 61 additions & 0 deletions backend/python/vllm/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# L4T arm64 (JetPack 7 / sbsa cu130) install spec for the vllm backend.
#
# Why this file exists, and why only the l4t13 BUILD_PROFILE consumes it:
#
# pypi.jetson-ai-lab.io hosts the L4T-specific torch / vllm / flash-attn
# wheels we need on aarch64 + cuda13, but it ALSO transparently proxies the
# rest of PyPI through `/+f/<sha>/<filename>` URLs that 503 frequently. With
# `--extra-index-url` + `--index-strategy=unsafe-best-match` (the historical
# fix in install.sh) uv would pick those proxy URLs for ordinary PyPI
# packages — `anthropic`, `openai`, `propcache`, `annotated-types` — and
# trip on the 503s. See e.g. CI run 25212201349 (anthropic-0.97.0).
#
# `explicit = true` on the index makes uv consult the L4T mirror ONLY for
# packages mapped under [tool.uv.sources]. Everything else goes to PyPI.
# This breaks the historical 503 path without losing access to the L4T
# wheels we actually need from there.
#
# `uv pip install -r requirements.txt` does NOT honor [tool.uv.sources]
# (sources are project-mode only, not pip-compat mode), so install.sh's
# l4t13 branch invokes `uv pip install --requirement pyproject.toml`
# directly. Other BUILD_PROFILEs continue to use the requirements-*.txt
# pipeline through libbackend.sh's installRequirements and never read
# this file.
[project]
name = "localai-vllm-l4t13"
version = "0.0.0"
requires-python = ">=3.12,<3.13"
dependencies = [
# Mirror of requirements.txt — kept in sync manually for now since the
# l4t13 path bypasses installRequirements (see install.sh).
"grpcio==1.80.0",
"protobuf",
"certifi",
"setuptools",
"pillow",
"charset-normalizer>=3.4.0",
"chardet",
# L4T-specific accelerator stack (sourced from jetson-ai-lab below).
"torch",
"torchvision",
"torchaudio",
"flash-attn",
"vllm",
# PyPI-resolvable packages that complete the runtime — accelerate,
# transformers, bitsandbytes carry their own wheels for aarch64.
"accelerate",
"transformers",
"bitsandbytes",
]

[[tool.uv.index]]
name = "jetson-ai-lab"
url = "https://pypi.jetson-ai-lab.io/sbsa/cu130"
explicit = true

[tool.uv.sources]
torch = { index = "jetson-ai-lab" }
torchvision = { index = "jetson-ai-lab" }
torchaudio = { index = "jetson-ai-lab" }
flash-attn = { index = "jetson-ai-lab" }
vllm = { index = "jetson-ai-lab" }
4 changes: 3 additions & 1 deletion backend/python/vllm/requirements-intel-after.txt
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
vllm
# Intel XPU has no upstream-published vllm wheels — install.sh builds vllm
# from source with VLLM_TARGET_DEVICE=xpu and hides this file during
# installRequirements. Don't add a `vllm` line here.
7 changes: 4 additions & 3 deletions backend/python/vllm/requirements-intel.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
--extra-index-url https://download.pytorch.org/whl/xpu
# vllm's own deps (torch==2.11.0+xpu, vllm_xpu_kernels, pydantic, …) are
# installed from upstream's requirements/xpu.txt during the source build —
# see install.sh. Only list what LocalAI's vllm backend.py needs directly.
accelerate
torch
transformers
optimum[openvino]
bitsandbytes
setuptools
bitsandbytes
2 changes: 0 additions & 2 deletions backend/python/vllm/requirements-l4t13-after.txt

This file was deleted.

8 changes: 0 additions & 8 deletions backend/python/vllm/requirements-l4t13.txt

This file was deleted.

5 changes: 4 additions & 1 deletion backend/python/vllm/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
grpcio==1.80.0
protobuf
certifi
setuptools
setuptools
pillow
charset-normalizer>=3.4.0
chardet
Loading
Loading