diff --git a/Cargo.lock b/Cargo.lock index b26b7b137..df4b657d5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2151,6 +2151,25 @@ dependencies = [ "serde", ] +[[package]] +name = "encoded_video_ingest" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap", + "eframe", + "egui", + "egui-wgpu", + "env_logger 0.11.10", + "futures", + "libwebrtc", + "livekit", + "livekit-api", + "log", + "parking_lot", + "tokio", +] + [[package]] name = "encoding_rs" version = "0.8.35" diff --git a/Cargo.toml b/Cargo.toml index 82c214183..052028d59 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,6 +28,7 @@ members = [ "examples/local_video", "examples/mobile", "examples/play_from_disk", + "examples/encoded_video_ingest", "examples/rpc", "examples/save_to_disk", "examples/screensharing", diff --git a/examples/encoded_video_ingest/Cargo.toml b/examples/encoded_video_ingest/Cargo.toml new file mode 100644 index 000000000..414346499 --- /dev/null +++ b/examples/encoded_video_ingest/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "encoded_video_ingest" +version = "0.1.0" +edition.workspace = true +publish = false + +[[bin]] +name = "sender" +path = "src/sender.rs" + +[[bin]] +name = "receiver" +path = "src/receiver.rs" + +[dependencies] +anyhow = { workspace = true } +clap = { workspace = true, features = ["derive", "env"] } +eframe = { workspace = true, features = ["default_fonts", "wgpu"] } +egui = { workspace = true } +egui-wgpu = { workspace = true } +env_logger = { workspace = true } +futures = { workspace = true } +libwebrtc = { workspace = true, features = ["encoded-video"] } +livekit = { workspace = true, features = ["encoded-video", "rustls-tls-native-roots"] } +livekit-api = { workspace = true, features = ["rustls-tls-native-roots"] } +log = { workspace = true } +parking_lot = { workspace = true } +tokio = { workspace = true, features = ["full"] } diff --git a/examples/encoded_video_ingest/README.md b/examples/encoded_video_ingest/README.md new file mode 100644 index 
000000000..a027303b8 --- /dev/null +++ b/examples/encoded_video_ingest/README.md @@ -0,0 +1,594 @@ +# encoded_video_ingest + +End-to-end demo of the **encoded video ingest** feature of the Rust +SDK. Encoded H.264, H.265, VP8, or AV1 frames flow from a gstreamer +camera pipeline directly into `NativeEncodedVideoSource::capture_frame`, +get packetized by WebRTC (no software re-encode), and arrive at a +remote peer which decodes and renders them directly in a WGPU window. + +```text +┌────────────┐ encoded (TCP) ┌─────────────┐ RTP (WebRTC) ┌─────────────┐ +│ gstreamer │ ───────────► │ sender.rs │ ────────────────► │ receiver.rs │ +│ (camera) │ :5005 │ (encoded │ │ (decoded │ +│ tcpserver │ │ publish, │ │ WGPU │ +│ │ │ tcp client)│ │ display) │ +└────────────┘ └─────────────┘ └─────────────┘ +``` + +Gstreamer produces the encoded bytestream as a TCP server on :5005; the +Rust sender connects as a client and demuxes it into individual +frames. The sender supports two wire framings, picked by `--codec`: + +- **H.264 / H.265** — raw Annex-B; the sender splits on AUD NAL + boundaries. +- **VP8 / AV1** — IVF container (gstreamer's `ivfmux` or + `avmux_ivf`); the sender parses the 32-byte file header (when + present) and each 12-byte per-frame header. For AV1, each IVF + record is one Temporal Unit (TU) — a complete OBU sequence for + one frame. + +## What this exercises + +- `libwebrtc::video_source::NativeEncodedVideoSource` — the + Encoded video track source, for `VideoCodec::H264`, + `VideoCodec::H265`, `VideoCodec::Vp8`, and `VideoCodec::Av1`. +- Annex-B bytestream ingest (H.264/H.265), with automatic + parameter-set caching and keyframe prepending done by the source + (SPS/PPS for H.264, VPS/SPS/PPS for H.265) so the producer does not + need to inline parameter sets on every IDR. +- IVF-framed ingest (VP8 / AV1) — no NAL parameter sets, one + compressed frame per IVF record. 
Keyframe flag comes from bit 0 of + the VP8 frame tag (RFC 6386) for VP8, or the presence of an + `OBU_SEQUENCE_HEADER` (type 1) in the Temporal Unit for AV1 (AV1 + spec §5.3.2). +- `EncodedVideoSourceObserver` — keyframe-request and target-bitrate + callbacks from the WebRTC pipeline. +- `LocalParticipant::publish_track` normalization for encoded sources + (forces `simulcast=false` and remaps `video_codec` to match the + source codec). + +## Prerequisites + +- gstreamer 1.22+ with the `good`, `bad`, `ugly`, and `libav` plugin + sets: + - macOS: `brew install gstreamer gst-plugins-base gst-plugins-good + gst-plugins-bad gst-plugins-ugly gst-libav` + - Debian/Ubuntu: `sudo apt install gstreamer1.0-tools + gstreamer1.0-plugins-{base,good,bad,ugly} gstreamer1.0-libav` +- A LiveKit server (use `livekit-server --dev` locally or point at a + cloud deployment). + +# Validating Camera + +**Before bringing LiveKit into the picture**, confirm your camera +encode path and a basic H.264 decode preview work in pure GStreamer. +The **send** and **receive** commands below use the **same UDP port +(5005)** on purpose: `udpsink` sends RTP to `127.0.0.1:5005` and `udpsrc` +binds `port=5005` for a quick local check. + +That is only for this camera-validation hop. In the [full LiveKit +demo](#running-the-livekit-demo) below, **port 5005** is reserved for +**TCP** from the camera pipeline into `sender` (Annex-B bytestream). +The `receiver` renders the subscribed LiveKit track directly and does +not need a second GStreamer pipeline. + +### Send — camera → RTP/UDP 5005 + +macOS (`avfvideosrc`). Linux: replace the source with `v4l2src +device=/dev/video0`. Windows: `mfvideosrc device-index=0`. If the +camera cannot produce 640×480 natively, add `videoscale ! videorate !` +before `x264enc` and relax the first caps filter as needed. + +```bash +gst-launch-1.0 -v \ + avfvideosrc ! \ + video/x-raw,width=640,height=480,framerate=30/1 ! \ + videoconvert ! 
\ + x264enc tune=zerolatency bitrate=1000 speed-preset=ultrafast key-int-max=30 ! \ + video/x-h264,profile=baseline ! \ + rtph264pay pt=96 config-interval=1 ! \ + udpsink host=127.0.0.1 port=5005 +``` + +### Receive — RTP/UDP 5005 → display + +```bash +gst-launch-1.0 -v \ + udpsrc port=5005 caps="application/x-rtp,media=video,encoding-name=H264,payload=96" ! \ + rtph264depay ! \ + avdec_h264 ! \ + videoconvert ! \ + autovideosink +``` + +On macOS, if `autovideosink` hangs at `PREROLLING` (common with +`glimagesink` under `gst-launch`), replace it with `osxvideosink`. + +This path validates camera, encoder, and decoder. It is **not** the +same wire format as the Rust sender: the demo ingest uses **TCP** and +**Annex-B** with **AUD-delimited** access units (see the pipeline in +[Running the LiveKit demo](#running-the-livekit-demo)). For that path +you still want `x264enc … aud=true`, `h264parse`, and `tcpserversink` as +documented there. + +### Debugging a blank / green receive window + +Before blaming the network, collapse encode → decode into a single +local pipeline. A green square here means the encoder is being fed +buffers it cannot consume (wrong pixel format, GL memory, or no frames +at all): + +```bash +gst-launch-1.0 -v \ + avfvideosrc device-index=0 ! \ + video/x-raw,width=640,height=480,format=NV12,framerate=30/1 ! \ + videoconvert ! \ + x264enc tune=zerolatency speed-preset=ultrafast bitrate=1000 key-int-max=60 aud=true ! \ + h264parse config-interval=1 ! avdec_h264 ! videoconvert ! autovideosink sync=false +``` + +Common causes of a green (or all-black) preview: + +- **macOS camera permission.** Grant your terminal app Camera access + in *System Settings → Privacy & Security → Camera* and relaunch it. + Without permission, AVFoundation hands back solid green frames + rather than failing. +- **`memory:GLMemory` on the source pad.** `avfvideosrc` often + advertises GL-texture caps first; `x264enc` cannot consume them. 
+ Pinning `format=NV12` (or any other plain `video/x-raw` format) on + the first caps filter forces a CPU buffer. +- **Caps pinned to a mode the camera cannot produce.** Run + `gst-device-monitor-1.0 Video/Source` and pick a + `width`/`height`/`format`/`framerate` combo listed under + `video/x-raw` (not `video/x-raw(memory:GLMemory)`). + +### Why TCP for the Rust ingest path (and not raw H.264 over UDP)? + +The camera validation above uses **RTP** over UDP on localhost, where +packets stay small enough to avoid typical OS UDP limits. + +For **raw Annex-B H.264** pushed with `udpsink`, macOS in particular has +a low default `net.inet.udp.maxdgram` (~9 KB), which large keyframes +can exceed. Symptoms look like: + +``` +Error sending message: Message too long +``` + +and broken or blocky video when the kernel drops datagrams. The demo +therefore uses **TCP** from GStreamer into `sender`, which has no such +per-write datagram cap. + +## Running the LiveKit demo + +### 0. Environment + +```bash +export LIVEKIT_URL=ws://localhost:7880 +export LIVEKIT_API_KEY=devkey +export LIVEKIT_API_SECRET=secret +``` + +Both `sender` and `receiver` use `env_logger`, so they are silent +unless `RUST_LOG` is set. The step 2/3 invocations below already +prefix `RUST_LOG=info`; lower it to `warn` once the demo is running +clean, or raise it to `RUST_LOG=info,libwebrtc=debug` to see the +underlying C++ WebRTC log sink. + +### 1. Start the gstreamer camera pipeline (Terminal 1) + +**Annex-B over TCP** into the Rust sender (not the UDP/RTP validation +pipelines). `tcpserversink` listens on **TCP** port **5005**; stop any +other **TCP** listener on that port if you have one. + +> **macOS — avoid TCP port 5000.** On macOS 12+ the *AirPlay Receiver* +> feature (managed by `ControlCenter`) binds `*:5000` by default. 
+> `tcpserversink host=0.0.0.0 port=5000` will log +> `Error binding to address 0.0.0.0:5000: Address already in use`, +> fall back to `current-port = 0`, and produce no data — while any +> client still "connects" to :5000 (it's talking to AirPlay, not to +> gstreamer). This demo uses **5005** to sidestep that. Either keep +> 5005, disable AirPlay Receiver in *System Settings → General → +> AirDrop & Handoff → AirPlay Receiver*, or pick another free port. +> Verify with `lsof -nP -iTCP:5005 -sTCP:LISTEN` — you should see +> `gst-launc`, not `ControlCe`. + +macOS: + +```bash +gst-launch-1.0 -v \ + avfvideosrc device-index=0 ! \ + video/x-raw,width=640,height=480,format=NV12,framerate=30/1 ! \ + videoconvert ! \ + x264enc tune=zerolatency speed-preset=veryfast bitrate=2500 key-int-max=30 \ + bframes=0 rc-lookahead=0 aud=true ! \ + h264parse config-interval=-1 ! \ + video/x-h264,stream-format=byte-stream,alignment=au ! \ + tcpserversink host=0.0.0.0 port=5005 +``` + +Linux: replace `avfvideosrc device-index=0` with `v4l2src device=/dev/video0`. Windows: `mfvideosrc device-index=0`. + +Knobs that matter for `sender`: + +- **`aud=true`** — NAL-type-9 AUD at the start of every access unit; + the Rust sender splits the TCP byte stream on those boundaries. +- **`h264parse` … `stream-format=byte-stream,alignment=au`** — Annex-B + suitable for the ingest path. +- **`tcpserversink`** accepts one TCP client at a time. Another + process cannot listen on **TCP** :5005 at the same time. The RTP + validation pipelines in [Validating Camera](#validating-camera) use + **UDP** :5005 — the same port number but a different protocol, + so the two setups do not interfere. + +#### H.265 variant + +For H.265/HEVC, swap the encoder and parser. `x265enc`'s AUD output is +controlled via `option-string`, which is forwarded to libx265: + +```bash +gst-launch-1.0 -v \ + avfvideosrc device-index=0 ! \ + video/x-raw,width=640,height=480,format=NV12,framerate=30/1 ! \ + videoconvert ! 
\ + x265enc tune=zerolatency speed-preset=ultrafast bitrate=1000 key-int-max=60 \ + option-string="aud=1:repeat-headers=1" ! \ + h265parse config-interval=1 ! \ + video/x-h265,stream-format=byte-stream,alignment=au ! \ + tcpserversink host=0.0.0.0 port=5005 +``` + +- `aud=1` emits the HEVC AUD (NAL type 35) at every AU boundary; the + sender's splitter keys on those. +- `repeat-headers=1` makes libx265 inline VPS/SPS/PPS with every + keyframe — cheap insurance in case the parser doesn't. The SDK + source also caches and re-prepends parameter sets on its own, so + either producer behaviour works. + +You must pass `--codec h265` to `sender` as well (see step 2) so the +AU splitter uses the HEVC NAL-type layout. Mixing an H.265 pipeline +with a `--codec h264` sender will look like "no AUs ever flow" — +the 5-bit H.264 NAL-type mask won't find AUD=9 in an HEVC stream. + +HEVC caveat: the **other peer** (receiver, SFU, JS client, etc.) must +actually be able to decode H.265. If the SDP answer strips the `H265` +payload type, nothing will be published even though `sender` logs look +healthy. Point-to-point between two instances of this demo on macOS +works because `RTCDefaultVideoDecoderFactory` exposes VideoToolbox +HEVC; your SFU's behaviour may differ. + +#### VP8 variant + +VP8 has no start codes, no NAL units, and no parameter sets, so we +need external framing. The sender consumes the **IVF** container +produced by gstreamer. Use `avmux_ivf` (from `gst-libav`) — it's the +most portable option and ships in Homebrew's consolidated `gstreamer` +formula: + +```bash +gst-launch-1.0 -v \ + avfvideosrc device-index=0 ! \ + video/x-raw,width=640,height=480,format=NV12,framerate=30/1 ! \ + videoconvert ! \ + vp8enc deadline=1 cpu-used=5 threads=4 \ + target-bitrate=1000000 keyframe-max-dist=60 end-usage=cbr ! \ + avmux_ivf ! 
\ + tcpserversink host=0.0.0.0 port=5005 +``` + +If your install has the native `ivfmux` element (gst-plugins-bad, +relatively recent versions), it's a drop-in replacement — the +Rust-side IVF parser only cares about the on-wire bytes, which are +identical. Check with `gst-inspect-1.0 ivfmux` / `gst-inspect-1.0 +avmux_ivf`; `WARNING: erroneous pipeline: no element "ivfmux"` means +you have to use `avmux_ivf` (or reinstall gstreamer to pick up the +native muxer). + +- The muxer emits a 32-byte file header once, followed by a 12-byte + per-frame header + payload. The sender parses exactly that shape. +- `target-bitrate` is in **bps** (unlike `x264enc`/`x265enc` which use + kbps). The example above is 1 Mbps. +- `keyframe-max-dist=60` matches the 60-frame IDR interval used by the + H.26x pipelines, so time-to-first-frame behaves the same. +- `deadline=1` is realtime mode; `cpu-used=5` is the fastest preset. + +Keep `--codec vp8` on the sender (step 2). VP8 is the baseline +WebRTC codec, so SFU/peer compatibility is not a concern. + +> The `DKIF` file header is optional on the wire. The native +> `ivfmux` element emits it; `avmux_ivf` (libav-backed) swallows it +> on a non-seekable sink like `tcpserversink` and emits only +> per-frame records. The sender handles both: it consumes `DKIF` if +> the first four bytes match, otherwise it starts parsing 12-byte +> per-frame records directly. Gstreamer's one-buffer-per-packet +> semantics keep every `tcpserversink` client frame-aligned, so +> start-order between sender and gstreamer does not matter for VP8. +> If the reader ever parses an absurd `frame_size`, it drops the +> TCP connection and reconnects to re-align on the next buffer. + +#### AV1 variant + +AV1 rides the same IVF wire format as VP8 (FOURCC `AV01`). 
The +sender treats each IVF record as a complete Temporal Unit (TU) — the +OBU sequence for one frame — and detects keyframes by scanning the +TU's OBUs for an `OBU_SEQUENCE_HEADER` (type 1), which libaom, +SVT-AV1, and rav1e only emit at keyframes. + +Use `av1enc` (libaom, in `gst-plugins-bad`). You also want `av1parse` +between the encoder and the muxer so OBUs land in the Low Overhead +Bitstream Format with size fields populated and one TU per buffer: + +```bash +gst-launch-1.0 -v \ + avfvideosrc device-index=0 ! \ + video/x-raw,width=640,height=480,format=NV12,framerate=30/1 ! \ + videoconvert ! \ + av1enc usage-profile=realtime end-usage=cbr cpu-used=9 \ + target-bitrate=1000 keyframe-max-dist=60 threads=4 ! \ + av1parse ! \ + video/x-av1,stream-format=obu-stream,alignment=tu ! \ + avmux_ivf ! \ + tcpserversink host=0.0.0.0 port=5005 +``` + +Pass `--codec av1` to the sender (step 2). Notes on the AV1 encoder: + +- **`av1enc target-bitrate` is in kbps** (libaom convention), unlike + `vp8enc` which uses bps. The example above is 1 Mbps. +- `usage-profile=realtime` + `end-usage=cbr` picks libaom's realtime + rate-control path; without it the default is high-latency good- + quality mode and frames arrive in bursts. +- `cpu-used` for libaom AV1 realtime is 0..=10 (higher = faster, + lower quality). 9 is a reasonable live-capture default on a + laptop-class CPU; drop to 7 if your CPU is idle and you want + better quality at the same bitrate. If `ingest: X fps accepted` + lags your capture framerate, bump `cpu-used` or raise `threads` + (libaom AV1 is CPU-hungry). +- `keyframe-max-dist=60` mirrors the other pipelines for identical + time-to-first-frame. +- `av1parse` normalises the bitstream to OBU-stream framing aligned + on Temporal Units, which is what `avmux_ivf` expects and what the + Rust sender's keyframe probe assumes. Leaving it out usually still + works but is encoder-dependent — keep it in the pipeline. 
+ +Alternative encoders (same pipeline shape, only the encoder element +changes): + +- **SVT-AV1** (`svtav1enc`, `gst-plugins-bad`) — faster than libaom + at comparable quality; tuning knobs differ + (`preset=10 target-bitrate=1000 rate-control-mode=cbr`). +- **rav1e** (`rav1enc`, `gst-plugins-rs`) — pure-Rust AV1 encoder; + realtime-ish at low `speed-preset` values. + +Keep `--codec av1` on the sender regardless of which AV1 encoder you +pick — the Rust side only cares about the on-wire IVF/OBU bytes. + +> **AV1 peer compatibility.** Like H.265, the receiving peer must +> actually be able to decode AV1. All recent browsers ship a dav1d +> decoder and LiveKit's default C++ factory also enables dav1d via +> `RTC_DAV1D_IN_INTERNAL_DECODER_FACTORY`, so macOS-to-macOS (two +> instances of this demo) and browser subscribers work out of the +> box. Older SFUs may strip the AV1 payload type from the SDP +> answer; `sender` will log happy ingest stats while the peer sees +> a black frame. + +The IVF-header-optional notes apply here too: native `ivfmux` emits a +`DKIF` header with FOURCC `AV01`; `avmux_ivf` on `tcpserversink` +does not. The sender handles both. + +### 2. Start the sender (Terminal 2) + +```bash +RUST_LOG=info cargo run -p encoded_video_ingest --bin sender -- \ + --tcp-host 127.0.0.1 --tcp-port 5005 \ + --width 640 --height 480 \ + --max-bitrate-kbps 2500 --max-framerate 30 \ + --codec h264 \ + --room encoded-video-demo --identity encoded-sender +``` + +For the H.265 pipeline use `--codec h265`; for VP8 use `--codec vp8`; +for AV1 use `--codec av1`. + +Flags: + +- `--tcp-host/--tcp-port` where gstreamer's `tcpserversink` is + listening. +- `--width/--height` declared stream resolution; must match what + gstreamer is producing. +- `--max-bitrate-kbps/--max-framerate` set the single RTP encoding + envelope advertised to WebRTC. 
Keep these at or above the upstream + encoder's realtime output; the SDK's generic 640x480 default is too + conservative for this low-latency ingest demo. +- `--codec {h264,h265,vp8,av1}` selects the wire framing and keyframe + probe: Annex-B (AUD-split) for H.264/H.265, or IVF for VP8/AV1. + **Must match the gstreamer pipeline.** `publish_track` will + additionally remap the track's `video_codec` to match the source, + so the LiveKit publish options follow automatically. + +The sender logs one line every ~2 s with ingest stats and will print +warnings when the receiver requests keyframes or when the congestion +controller updates the target bitrate. If the gstreamer pipeline is +restarted, the sender reconnects automatically. + +### 3. Start the receiver (Terminal 3) + +```bash +RUST_LOG=info cargo run -p encoded_video_ingest --bin receiver -- \ + --room encoded-video-demo --identity encoded-receiver \ + --from encoded-sender +``` + +The receiver subscribes to the room and renders the first matching +remote video track directly in a native WGPU window. The receive side +uses `NativeVideoStream`, so the window displays decoded frames from +WebRTC's internal decoder rather than encoded packets. + +The receiver defaults to a low-latency display path (`vsync=false`, +WGPU `AutoNoVsync`, and swapchain frame latency 1). Pass `--vsync` if +you prefer smoother presentation over the lowest possible glass-to-glass +latency. + +### Low-latency tuning notes + +The first place to look is the sender's `ingest:` line. If `dropped` +frames climb or the logged encoded bitrate is much higher than the +logged WebRTC target, either lower the upstream encoder output or raise +`--max-bitrate-kbps` to match it. For 640x480@30 H.264, the default +demo command uses 2.5 Mbps to avoid the SDK's conservative generic +640x480 preset becoming the bottleneck. 
+ +## Troubleshooting + +**Sender connects to the room but never logs ingest stats.** +Most often the Rust sender is connected to something that is not +gstreamer. Quick checks, in order: + +1. Confirm the gstreamer pipeline from step 1 is actually running and + logging `PLAYING`, not blocked on `Address already in use`. +2. Sniff the TCP stream directly — you should see NAL-unit bytes + flowing: + + ```bash + nc 127.0.0.1 5005 | pv -b > /dev/null + ``` + + If `pv` stays at `0 B`, the other end is not gstreamer (on macOS, + most commonly AirPlay Receiver on :5000; see the macOS callout in + step 1). +3. Confirm you picked the TCP Annex-B pipeline from step 1 and not the + UDP/RTP validation pipeline from [Validating Camera](#validating-camera) — + the latter won't feed `tcpserversink`. + +**gstreamer says `WARNING: erroneous pipeline: no element "ivfmux"`.** +Your gstreamer install doesn't bundle the native IVF muxer. Swap +`ivfmux` for `avmux_ivf` (from `gst-libav`), which produces an +identical IVF byte stream and is in Homebrew's consolidated +`gstreamer` formula. Confirm with `gst-inspect-1.0 avmux_ivf`. If +neither is present, `brew reinstall gstreamer` (or on Debian/Ubuntu, +`sudo apt install gstreamer1.0-libav gstreamer1.0-plugins-bad`) will +pull both in. + +**gstreamer reports `Error binding to address 0.0.0.0:5000`.** +Another process is listening on that port. On macOS that is usually +AirPlay Receiver; use port 5005 (as this README does) or disable +AirPlay Receiver. Check with: + +```bash +lsof -nP -iTCP:5000 -sTCP:LISTEN +``` + +**Receiver window opens but never shows video.** +Confirm `--from` matches the publisher identity exactly, or omit it to +render the first subscribed video track. The receiver logs +`Subscribed to ...` once it accepts a track, then `recv: ... fps` as +decoded frames arrive. + +**Nothing logs at all from the Rust binaries.** +`sender`/`receiver` use `env_logger`; set `RUST_LOG=info` (as in the +commands above). 
Without it, both processes are silent even when they +are working correctly. + +**Sender connects to gstreamer, TCP bytes flow, but `ingest:` still +reads 0 fps accepted.** +Almost always a codec / framing mismatch between the gstreamer +pipeline and the sender: + +- **H.26x**: the demuxer looks for the AUD NAL type of whichever codec + you passed via `--codec` (9 for H.264, 35 for H.265), and the two + use different bit layouts for the NAL-type field. An H.265 stream + fed to `--codec h264` (or vice versa) will scan end-to-end without + ever recognising an AUD boundary, so no AU is ever pushed to + `capture_frame`. +- **VP8 / AV1**: the demuxer accepts IVF with or without the `DKIF` + file header (native `ivfmux` emits it; `avmux_ivf` on + `tcpserversink` doesn't). It assumes the first byte starts an IVF + per-frame record, which is what gstreamer's one-buffer-per-packet + delivery guarantees. If you see `IVF: implausible frame_size=N + bytes`, gstreamer produced a byte stream where the first byte of a + new client's delivery is mid-packet (very rare in practice). The + sender logs the warning, drops the TCP connection, and reconnects — + which usually re-anchors on the next buffer boundary. If it keeps + happening, your muxer is producing non-record-aligned buffers; swap + `avmux_ivf` for the native `ivfmux` if it's available. If you pass + a `--codec` that doesn't match the pipeline's FOURCC (e.g. + `--codec av1` on a VP8 stream), you'll get a one-shot warning from + the IVF reader but bytes will keep flowing — the FOURCC check is + advisory; what actually differs between the IVF-framed codecs is + the keyframe probe (RFC 6386 frame-tag bit for VP8, OBU sequence- + header scan for AV1). +- **AV1-specific**: if ingest accepts frames but the receiver never + decodes them, check that your pipeline includes + `av1parse ! video/x-av1,stream-format=obu-stream,alignment=tu` + before `avmux_ivf`. 
Some encoders emit OBUs without size fields + when fed directly to the muxer; the sender's keyframe probe can't + skip those reliably and will mark every frame as a delta, causing + the jitter buffer to wait forever for a keyframe. +- **Mixed**: `--codec vp8` pointed at an Annex-B H.264 pipeline (or + `--codec h264` at an IVF VP8 pipeline) will either trip the IVF + magic check or silently scan forever — re-check `--codec` matches + your pipeline. + +**H.265 track publishes, but the remote peer shows a black frame.** +The other peer cannot decode HEVC — check the SDP answer for an +`H265` payload type. LiveKit SFUs that support H.265 will forward; +ones that don't will either drop the subscription or fall through to +a fallback codec. Point two instances of this demo at the same room +on macOS to isolate whether the problem is the SDK or the SFU: +VideoToolbox HEVC is available in `RTCDefaultVideoDecoderFactory`, so +macOS-to-macOS should decode cleanly. + +## Known limitations + +### VP9 is not documented as a supported codec for this example + +`CodecArg::Vp9` still exists in `sender.rs` (and +`NativeEncodedVideoSource` accepts `VideoCodec::Vp9`), but VP9 ingest +is not exercised by this demo and has rough edges that make it a poor +fit for a "Encoded bytes straight to RTP" path: + +- libvpx-vp9 emits **superframes** in IVF (a per-frame record can + bundle several coded frames — e.g. a show_existing_frame reshow + plus a hidden alt-ref). WebRTC's VP9 RTP packetizer expects one + *coded* frame per input, so feeding a superframe as one + `capture_frame` call misreports keyframe-ness and confuses the + depacketizer on the peer. +- Keyframe detection from just the VP9 uncompressed header misses + show_existing_frame / alt-ref semantics that determine whether a + picture actually refreshes the reference buffers. 
+- SVC (spatial / temporal layering) — the main reason to pick VP9 + over VP8 — needs the VP9 RTP descriptor plumbed through the encoded + source, which this branch does not expose. + +For single-layer VP9 with patched-up superframe handling this could +be revisited, but today **use VP8 or AV1** for IVF-framed ingest. +`--codec vp9` is left in the CLI so existing scripts don't break; it +is intentionally undocumented here. + +### Receive-side encoded frames are not exposed + +The feature added in this branch covers the **send** side: the producer +hands encoded bytes in, WebRTC packetizes them out. On the **receive** +side the SDK currently only exposes decoded frames via +`NativeVideoStream`. That's why the receiver round-trips through +WebRTC's internal decoder and renders decoded frames, rather than +forwarding encoded H.264. + +Exposing encoded frames on receive would require a +`RemoteEncodedVideoStream` analogue (likely backed by a WebRTC +`FrameTransformer`) and is a natural follow-up. + +### AUD-delimited bytestreams only + +The sender relies on `x264enc aud=true` emitting a NAL-type-9 AUD at +the start of every AU so it can find frame boundaries over the TCP +byte stream. Producers that don't emit AUDs would need a richer +splitter (e.g. detecting "new primary coded picture" via the slice +header's `first_mb_in_slice`). + +### Keyframe intervals dominate startup latency + +WebRTC's jitter buffer drops delta frames until it sees a keyframe, so +time-to-first-frame on the receiver is bounded by the x264enc +`key-int-max`. Lower `key-int-max` for faster startup at the cost of +bitrate overhead. diff --git a/examples/encoded_video_ingest/src/receiver.rs b/examples/encoded_video_ingest/src/receiver.rs new file mode 100644 index 000000000..99362df2a --- /dev/null +++ b/examples/encoded_video_ingest/src/receiver.rs @@ -0,0 +1,613 @@ +// Copyright 2026 LiveKit, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Encoded ingest receiver with an in-process WGPU visualizer. +//! +//! Subscribes to a LiveKit room and renders the first incoming video track +//! directly in an `egui`/`wgpu` window. +//! +//! NOTE: the current SDK only exposes *decoded* frames on the receive side +//! (via `NativeVideoStream`). WebRTC's internal decoder runs in-process +//! before we hand the frame to the application. Encoded-frame receive is +//! a future enhancement — see README.md. 
+ +use std::{ + sync::Arc, + time::{Duration, Instant}, +}; + +use anyhow::{anyhow, Result}; +use clap::Parser; +use eframe::Renderer; +use futures::StreamExt; +use livekit::{ + prelude::*, + webrtc::{ + native::yuv_helper, + prelude::{RtcVideoTrack, VideoBuffer}, + video_stream::native::{NativeVideoStream, NativeVideoStreamOptions}, + }, +}; +use livekit_api::access_token; +use log::{info, warn}; +use parking_lot::Mutex; +use tokio::sync::mpsc; + +#[derive(Parser, Debug, Clone)] +#[command(author, version, about, long_about = None)] +struct Args { + /// LiveKit server URL (or set LIVEKIT_URL env var) + #[arg(long, env = "LIVEKIT_URL")] + url: String, + + /// LiveKit API key (or set LIVEKIT_API_KEY env var) + #[arg(long, env = "LIVEKIT_API_KEY")] + api_key: String, + + /// LiveKit API secret (or set LIVEKIT_API_SECRET env var) + #[arg(long, env = "LIVEKIT_API_SECRET")] + api_secret: String, + + /// Room name to join + #[arg(long, default_value = "encoded-video-demo")] + room: String, + + /// Participant identity + #[arg(long, default_value = "encoded-receiver")] + identity: String, + + /// Only subscribe to the track from this participant identity + #[arg(long)] + from: Option, + + /// Enable vsync for smoother display at the cost of extra render latency + #[arg(long, default_value_t = false)] + vsync: bool, +} + +fn main() -> Result<()> { + env_logger::init(); + let args = Args::parse(); + let present_mode = if args.vsync { + eframe::wgpu::PresentMode::AutoVsync + } else { + eframe::wgpu::PresentMode::AutoNoVsync + }; + + eframe::run_native( + "LiveKit Encoded Video Receiver", + eframe::NativeOptions { + centered: true, + renderer: Renderer::Wgpu, + vsync: args.vsync, + wgpu_options: egui_wgpu::WgpuConfiguration { + present_mode, + desired_maximum_frame_latency: Some(1), + ..Default::default() + }, + ..Default::default() + }, + Box::new(|cc| Ok(Box::new(ReceiverApp::new(cc, args)))), + ) + .map_err(|err| anyhow!("receiver UI failed: {err}"))?; + + Ok(()) +} + 
+enum UiEvent { + Connected { room: Arc, sid: RoomSid }, + ConnectFailed { error: String }, + RoomEvent { event: RoomEvent }, +} + +struct ReceiverApp { + async_runtime: tokio::runtime::Runtime, + ui_rx: mpsc::UnboundedReceiver, + room: Option>, + render_state: egui_wgpu::RenderState, + renderer: Option, + active_sid: Option, + active_label: Option, + from: Option, + status: String, +} + +impl ReceiverApp { + fn new(cc: &eframe::CreationContext<'_>, args: Args) -> Self { + let async_runtime = + tokio::runtime::Builder::new_multi_thread().enable_all().build().unwrap(); + let (ui_tx, ui_rx) = mpsc::unbounded_channel(); + async_runtime.spawn(connect_task(args.clone(), ui_tx)); + + Self { + async_runtime, + ui_rx, + room: None, + render_state: cc.wgpu_render_state.clone().unwrap(), + renderer: None, + active_sid: None, + active_label: None, + from: args.from, + status: format!("Connecting to room '{}' as '{}'...", args.room, args.identity), + } + } + + fn event(&mut self, event: UiEvent) { + match event { + UiEvent::Connected { room, sid } => { + self.status = format!("Connected to room '{}' (sid {})", room.name(), sid); + self.room = Some(room); + } + UiEvent::ConnectFailed { error } => { + self.status = format!("Connection failed: {error}"); + } + UiEvent::RoomEvent { event } => self.room_event(event), + } + } + + fn room_event(&mut self, event: RoomEvent) { + match event { + RoomEvent::TrackSubscribed { track, publication, participant } => { + if let Some(from) = &self.from { + if participant.identity().as_str() != from { + return; + } + } + + let RemoteTrack::Video(video) = track else { + return; + }; + + if self.active_sid.is_some() { + info!( + "Ignoring extra video track {} (already have one active)", + publication.sid() + ); + return; + } + + let sid = publication.sid(); + let label = format!( + "{} from '{}': codec={}, {}x{}", + sid, + participant.identity(), + publication.mime_type(), + publication.dimension().0, + publication.dimension().1, + ); + + 
info!("Subscribed to {label}"); + self.renderer = Some(VideoRenderer::new( + self.async_runtime.handle(), + self.render_state.clone(), + video.rtc_track(), + )); + self.active_sid = Some(sid); + self.active_label = Some(label.clone()); + self.status = format!("Rendering {label}"); + } + RoomEvent::TrackUnsubscribed { publication, .. } + | RoomEvent::TrackUnpublished { publication, .. } => { + if self.active_sid.as_ref() == Some(&publication.sid()) { + info!("Track {} ended", publication.sid()); + self.renderer = None; + self.active_sid = None; + self.active_label = None; + self.status = "Waiting for a video track...".to_string(); + } + } + RoomEvent::Disconnected { reason } => { + self.renderer = None; + self.active_sid = None; + self.active_label = None; + self.room = None; + self.status = format!("Disconnected: {reason:?}"); + } + _ => {} + } + } + + fn draw_video(&self, ui: &mut egui::Ui) { + let rect = ui.available_rect_before_wrap(); + let response = ui.allocate_rect(rect, egui::Sense::hover()); + let rect = response.rect; + + ui.painter().rect_filled(rect, egui::CornerRadius::default(), egui::Color32::BLACK); + + let Some(renderer) = &self.renderer else { + ui.painter().text( + rect.center(), + egui::Align2::CENTER_CENTER, + &self.status, + egui::FontId::proportional(18.0), + egui::Color32::WHITE, + ); + return; + }; + + let resolution = renderer.resolution(); + if let Some(texture_id) = renderer.texture_id() { + let image_rect = fit_rect(rect, resolution.0, resolution.1); + ui.painter().image( + texture_id, + image_rect, + egui::Rect::from_min_max(egui::pos2(0.0, 0.0), egui::pos2(1.0, 1.0)), + egui::Color32::WHITE, + ); + } + + ui.painter().text( + egui::pos2(rect.min.x + 8.0, rect.max.y - 8.0), + egui::Align2::LEFT_BOTTOM, + format!( + "{}x{} {}", + resolution.0, + resolution.1, + self.active_label.as_deref().unwrap_or("") + ), + egui::FontId::default(), + egui::Color32::WHITE, + ); + } +} + +impl eframe::App for ReceiverApp { + fn update(&mut self, ctx: 
&egui::Context, _frame: &mut eframe::Frame) { + while let Ok(event) = self.ui_rx.try_recv() { + self.event(event); + } + + egui::TopBottomPanel::top("status_panel").show(ctx, |ui| { + ui.horizontal(|ui| { + ui.label(&self.status); + }); + }); + + egui::CentralPanel::default().show(ctx, |ui| { + self.draw_video(ui); + }); + + ctx.request_repaint(); + } +} + +impl Drop for ReceiverApp { + fn drop(&mut self) { + if let Some(room) = self.room.take() { + if let Err(err) = self.async_runtime.block_on(room.close()) { + warn!("room.close: {err}"); + } + } + } +} + +async fn connect_task(args: Args, ui_tx: mpsc::UnboundedSender) { + let token = match access_token::AccessToken::with_api_key(&args.api_key, &args.api_secret) + .with_identity(&args.identity) + .with_name(&args.identity) + .with_grants(access_token::VideoGrants { + room_join: true, + room: args.room.clone(), + can_subscribe: true, + ..Default::default() + }) + .to_jwt() + { + Ok(token) => token, + Err(err) => { + let _ = ui_tx.send(UiEvent::ConnectFailed { error: err.to_string() }); + return; + } + }; + + info!("Connecting to LiveKit room '{}' as '{}'...", args.room, args.identity); + let mut room_options = RoomOptions::default(); + room_options.auto_subscribe = true; + room_options.adaptive_stream = false; + + match Room::connect(&args.url, &token, room_options).await { + Ok((room, events)) => { + let sid = room.sid().await; + let room = Arc::new(room); + info!("Connected: {} (sid {})", room.name(), sid); + let _ = ui_tx.send(UiEvent::Connected { room, sid }); + tokio::spawn(room_event_task(events, ui_tx)); + } + Err(err) => { + let _ = ui_tx.send(UiEvent::ConnectFailed { error: err.to_string() }); + } + } +} + +async fn room_event_task( + mut events: mpsc::UnboundedReceiver, + ui_tx: mpsc::UnboundedSender, +) { + while let Some(event) = events.recv().await { + let _ = ui_tx.send(UiEvent::RoomEvent { event }); + } +} + +fn fit_rect(container: egui::Rect, width: u32, height: u32) -> egui::Rect { + if width == 0 
|| height == 0 { + return container; + } + + let source_aspect = width as f32 / height as f32; + let container_aspect = container.width() / container.height(); + let size = if container_aspect > source_aspect { + egui::vec2(container.height() * source_aspect, container.height()) + } else { + egui::vec2(container.width(), container.width() / source_aspect) + }; + + egui::Rect::from_center_size(container.center(), size) +} + +struct VideoRenderer { + internal: Arc>, + + #[allow(dead_code)] + rtc_track: RtcVideoTrack, +} + +struct RendererInternal { + render_state: egui_wgpu::RenderState, + width: u32, + height: u32, + rgba_data: Vec, + texture: Option, + texture_view: Option, + egui_texture: Option, +} + +impl VideoRenderer { + fn new( + async_handle: &tokio::runtime::Handle, + render_state: egui_wgpu::RenderState, + rtc_track: RtcVideoTrack, + ) -> Self { + let internal = Arc::new(Mutex::new(RendererInternal { + render_state, + width: 0, + height: 0, + rgba_data: Vec::default(), + texture: None, + texture_view: None, + egui_texture: None, + })); + + let mut video_sink = NativeVideoStream::with_options( + rtc_track.clone(), + NativeVideoStreamOptions { queue_size_frames: Some(1) }, + ); + std::thread::spawn({ + let async_handle = async_handle.clone(); + let internal = internal.clone(); + move || { + let mut frames: u64 = 0; + let mut last_log = Instant::now(); + while let Some(frame) = async_handle.block_on(video_sink.next()) { + let mut internal = internal.lock(); + let buffer = frame.buffer.as_ref(); + let width = buffer.width(); + let height = buffer.height(); + + internal.ensure_texture_size(width, height); + convert_to_abgr(buffer, &mut internal.rgba_data); + + internal.render_state.queue.write_texture( + eframe::wgpu::TexelCopyTextureInfo { + texture: internal.texture.as_ref().unwrap(), + mip_level: 0, + origin: eframe::wgpu::Origin3d::default(), + aspect: eframe::wgpu::TextureAspect::default(), + }, + &internal.rgba_data, + eframe::wgpu::TexelCopyBufferLayout 
{ + bytes_per_row: Some(width * 4), + ..Default::default() + }, + eframe::wgpu::Extent3d { width, height, ..Default::default() }, + ); + + frames += 1; + if last_log.elapsed() >= Duration::from_secs(2) { + info!( + "recv: {}x{}, ~{:.1} fps", + width, + height, + frames as f64 / last_log.elapsed().as_secs_f64() + ); + frames = 0; + last_log = Instant::now(); + } + } + info!("frame renderer ended"); + } + }); + + Self { rtc_track, internal } + } + + fn resolution(&self) -> (u32, u32) { + let internal = self.internal.lock(); + (internal.width, internal.height) + } + + fn texture_id(&self) -> Option { + self.internal.lock().egui_texture + } +} + +fn convert_to_abgr(buffer: &dyn VideoBuffer, dst: &mut [u8]) { + let width = buffer.width(); + let height = buffer.height(); + let stride = width * 4; + + if let Some(buffer) = buffer.as_i420() { + let (stride_y, stride_u, stride_v) = buffer.strides(); + let (data_y, data_u, data_v) = buffer.data(); + yuv_helper::i420_to_abgr( + data_y, + stride_y, + data_u, + stride_u, + data_v, + stride_v, + dst, + stride, + width as i32, + height as i32, + ); + return; + } + + if let Some(buffer) = buffer.as_nv12() { + let (stride_y, stride_uv) = buffer.strides(); + let (data_y, data_uv) = buffer.data(); + yuv_helper::nv12_to_abgr( + data_y, + stride_y, + data_uv, + stride_uv, + dst, + stride, + width as i32, + height as i32, + ); + return; + } + + if let Some(buffer) = buffer.as_i422() { + let (stride_y, stride_u, stride_v) = buffer.strides(); + let (data_y, data_u, data_v) = buffer.data(); + yuv_helper::i422_to_abgr( + data_y, + stride_y, + data_u, + stride_u, + data_v, + stride_v, + dst, + stride, + width as i32, + height as i32, + ); + return; + } + + if let Some(buffer) = buffer.as_i444() { + let (stride_y, stride_u, stride_v) = buffer.strides(); + let (data_y, data_u, data_v) = buffer.data(); + yuv_helper::i444_to_abgr( + data_y, + stride_y, + data_u, + stride_u, + data_v, + stride_v, + dst, + stride, + width as i32, + height as i32, 
+ ); + return; + } + + if let Some(buffer) = buffer.as_i010() { + let (stride_y, stride_u, stride_v) = buffer.strides(); + let (data_y, data_u, data_v) = buffer.data(); + yuv_helper::i010_to_abgr( + data_y, + stride_y, + data_u, + stride_u, + data_v, + stride_v, + dst, + stride, + width as i32, + height as i32, + ); + return; + } + + let buffer = buffer.to_i420(); + let (stride_y, stride_u, stride_v) = buffer.strides(); + let (data_y, data_u, data_v) = buffer.data(); + yuv_helper::i420_to_abgr( + data_y, + stride_y, + data_u, + stride_u, + data_v, + stride_v, + dst, + stride, + width as i32, + height as i32, + ); +} + +impl RendererInternal { + fn ensure_texture_size(&mut self, width: u32, height: u32) { + if self.width == width && self.height == height { + return; + } + + self.width = width; + self.height = height; + self.rgba_data.resize((width * height * 4) as usize, 0); + + self.texture = + Some(self.render_state.device.create_texture(&eframe::wgpu::TextureDescriptor { + label: Some("lk-receiver-texture"), + usage: eframe::wgpu::TextureUsages::TEXTURE_BINDING + | eframe::wgpu::TextureUsages::COPY_DST, + dimension: eframe::wgpu::TextureDimension::D2, + size: eframe::wgpu::Extent3d { width, height, ..Default::default() }, + sample_count: 1, + mip_level_count: 1, + format: eframe::wgpu::TextureFormat::Rgba8Unorm, + view_formats: &[eframe::wgpu::TextureFormat::Rgba8Unorm], + })); + + self.texture_view = Some(self.texture.as_mut().unwrap().create_view( + &eframe::wgpu::TextureViewDescriptor { + label: Some("lk-receiver-texture-view"), + format: Some(eframe::wgpu::TextureFormat::Rgba8Unorm), + dimension: Some(eframe::wgpu::TextureViewDimension::D2), + mip_level_count: Some(1), + array_layer_count: Some(1), + ..Default::default() + }, + )); + + if let Some(texture_id) = self.egui_texture { + self.render_state.renderer.write().update_egui_texture_from_wgpu_texture( + &self.render_state.device, + self.texture_view.as_ref().unwrap(), + eframe::wgpu::FilterMode::Linear, + 
texture_id, + ); + } else { + self.egui_texture = Some(self.render_state.renderer.write().register_native_texture( + &self.render_state.device, + self.texture_view.as_ref().unwrap(), + eframe::wgpu::FilterMode::Linear, + )); + } + } +} diff --git a/examples/encoded_video_ingest/src/sender.rs b/examples/encoded_video_ingest/src/sender.rs new file mode 100644 index 000000000..5d784364d --- /dev/null +++ b/examples/encoded_video_ingest/src/sender.rs @@ -0,0 +1,800 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Encoded H.264 / H.265 / VP8 / VP9 / AV1 ingest sender. +//! +//! Connects to a gstreamer pipeline as a TCP client and pushes each +//! decoded access unit / frame straight through +//! `NativeEncodedVideoSource::capture_frame`. No software encoding +//! happens on the Rust side — the bytes on the wire are the bytes that +//! get packetized into RTP. +//! +//! Two framings are supported, picked by `--codec`: +//! +//! * **H.264 / H.265**: raw Annex-B bytestream. The sender splits on +//! AUD NAL boundaries (NAL type 9 for H.264, type 35 for H.265) and +//! delivers each access unit. +//! * **VP8 / VP9 / AV1**: IVF container (gstreamer's `ivfmux` or +//! `avmux_ivf`). The sender parses the 32-byte IVF file header once +//! (when present), then each 12-byte frame header + payload, and +//! delivers each raw VPx frame (for AV1, each IVF record is one +//! Temporal Unit — a complete OBU sequence for one frame). +//! 
+//! TCP is used instead of UDP because macOS caps per-datagram UDP +//! payloads well below 64 KB by default, which is easy to exceed with +//! keyframes. The matching gstreamer pipelines are documented in +//! README.md. + +use std::{ + sync::{ + atomic::{AtomicBool, AtomicU64, Ordering}, + Arc, Mutex, + }, + time::{Duration, Instant}, +}; + +use anyhow::{Context, Result}; +use clap::Parser; +use libwebrtc::video_source::{EncodedFrameInfo, RtcVideoSource, VideoCodec, VideoResolution}; +use livekit::{ + options::{TrackPublishOptions, VideoCodec as LkVideoCodec, VideoEncoding}, + prelude::*, + webrtc::video_source::native::{EncodedVideoSourceObserver, NativeEncodedVideoSource}, +}; +use livekit_api::access_token; +use log::{info, warn}; +use tokio::{io::AsyncReadExt, net::TcpStream, time::sleep}; + +#[derive(Parser, Debug)] +#[command(author, version, about, long_about = None)] +struct Args { + /// LiveKit server URL (or set LIVEKIT_URL env var) + #[arg(long, env = "LIVEKIT_URL")] + url: String, + + /// LiveKit API key (or set LIVEKIT_API_KEY env var) + #[arg(long, env = "LIVEKIT_API_KEY")] + api_key: String, + + /// LiveKit API secret (or set LIVEKIT_API_SECRET env var) + #[arg(long, env = "LIVEKIT_API_SECRET")] + api_secret: String, + + /// Room name to join + #[arg(long, default_value = "encoded-video-demo")] + room: String, + + /// Participant identity + #[arg(long, default_value = "encoded-sender")] + identity: String, + + /// Host of the gstreamer `tcpserversink` producing the Annex-B bytestream + #[arg(long, default_value = "127.0.0.1")] + tcp_host: String, + + /// Port of the gstreamer `tcpserversink` producing the Annex-B bytestream + #[arg(long, default_value_t = 5000)] + tcp_port: u16, + + /// Declared stream width (px) + #[arg(long, default_value_t = 640)] + width: u32, + + /// Declared stream height (px) + #[arg(long, default_value_t = 480)] + height: u32, + + /// RTP sender max bitrate advertised to WebRTC, in kbps + #[arg(long, default_value_t = 
2_500)] + max_bitrate_kbps: u64, + + /// RTP sender max framerate advertised to WebRTC + #[arg(long, default_value_t = 30.0)] + max_framerate: f64, + + /// Encoded codec on the wire. Must match the gstreamer pipeline. + #[arg(long, value_enum, default_value_t = CodecArg::H264)] + codec: CodecArg, +} + +/// Codec selector for the CLI. Drives both framing (Annex-B vs. IVF) +/// and keyframe detection. +#[derive(Debug, Copy, Clone, PartialEq, Eq, clap::ValueEnum)] +enum CodecArg { + H264, + H265, + Vp8, + Vp9, + Av1, +} + +impl CodecArg { + fn webrtc_codec(self) -> VideoCodec { + match self { + CodecArg::H264 => VideoCodec::H264, + CodecArg::H265 => VideoCodec::H265, + CodecArg::Vp8 => VideoCodec::Vp8, + CodecArg::Vp9 => VideoCodec::Vp9, + CodecArg::Av1 => VideoCodec::Av1, + } + } + + fn livekit_codec(self) -> LkVideoCodec { + match self { + CodecArg::H264 => LkVideoCodec::H264, + CodecArg::H265 => LkVideoCodec::H265, + CodecArg::Vp8 => LkVideoCodec::VP8, + CodecArg::Vp9 => LkVideoCodec::VP9, + CodecArg::Av1 => LkVideoCodec::AV1, + } + } + + /// NAL unit type from the first byte after a start code. + /// H.264: lower 5 bits. H.265: bits 1..7. + fn nal_type(self, first_byte: u8) -> u8 { + match self { + CodecArg::H264 => first_byte & 0x1F, + CodecArg::H265 => (first_byte >> 1) & 0x3F, + // VPx/AV1 have no NAL units; callers should not reach this. + CodecArg::Vp8 | CodecArg::Vp9 | CodecArg::Av1 => 0, + } + } + + /// Access-unit delimiter NAL type. 9 (AUD) for H.264, 35 (AUD_NUT) + /// for H.265. Undefined for IVF-framed codecs. + fn aud_nal_type(self) -> u8 { + match self { + CodecArg::H264 => 9, + CodecArg::H265 => 35, + CodecArg::Vp8 | CodecArg::Vp9 | CodecArg::Av1 => u8::MAX, + } + } + + /// Whether a given NAL type is a keyframe NAL. + /// H.264: IDR slice (5). H.265: any IRAP (BLA/IDR/CRA, 16..=23). + /// IVF-framed codecs use [`is_keyframe`] directly; this never runs. 
+ fn is_keyframe_nal(self, nal_type: u8) -> bool { + match self { + CodecArg::H264 => nal_type == 5, + CodecArg::H265 => (16..=23).contains(&nal_type), + CodecArg::Vp8 | CodecArg::Vp9 | CodecArg::Av1 => false, + } + } + + fn name(self) -> &'static str { + match self { + CodecArg::H264 => "H.264", + CodecArg::H265 => "H.265", + CodecArg::Vp8 => "VP8", + CodecArg::Vp9 => "VP9", + CodecArg::Av1 => "AV1", + } + } + + /// IVF FOURCC expected on the wire. Only meaningful for codecs + /// delivered via `ivfmux` / `avmux_ivf`. + fn ivf_fourcc(self) -> Option<&'static [u8; 4]> { + match self { + CodecArg::Vp8 => Some(b"VP80"), + CodecArg::Vp9 => Some(b"VP90"), + CodecArg::Av1 => Some(b"AV01"), + _ => None, + } + } +} + +/// Simple observer that logs feedback from the encoder pipeline. Real +/// producers should react here — e.g. nudge their hardware encoder to +/// emit an IDR on `on_keyframe_requested`, or clamp bitrate on +/// `on_target_bitrate`. +struct LoggingObserver { + last_bitrate_log: Mutex>, + target_bitrate_bps: Arc, +} + +impl LoggingObserver { + fn new(target_bitrate_bps: Arc) -> Self { + Self { last_bitrate_log: Mutex::new(None), target_bitrate_bps } + } +} + +impl EncodedVideoSourceObserver for LoggingObserver { + fn on_keyframe_requested(&self) { + warn!( + "keyframe requested by receiver — producer should emit a keyframe on the next frame \ + (in this demo the next keyframe comes when the gstreamer encoder hits its \ + keyframe-interval knob, e.g. x264enc/x265enc key-int-max or vp8enc keyframe-max-dist)" + ); + } + + fn on_target_bitrate(&self, bitrate_bps: u32, framerate_fps: f64) { + self.target_bitrate_bps.store(bitrate_bps as u64, Ordering::Relaxed); + + // Rate-limit logging to 1 Hz. 
+ let mut last = self.last_bitrate_log.lock().unwrap(); + let now = Instant::now(); + if last.is_none_or(|t| now.duration_since(t) >= Duration::from_secs(1)) { + *last = Some(now); + info!("target bitrate update: {} kbps @ {:.1} fps", bitrate_bps / 1000, framerate_fps); + } + } +} + +/// Higher-level demuxer: hides whether the wire is Annex-B or IVF. +enum Demuxer { + AnnexB(AuSplitter), + Ivf(IvfReader), +} + +impl Demuxer { + fn new(codec: CodecArg) -> Self { + match codec { + CodecArg::H264 | CodecArg::H265 => Demuxer::AnnexB(AuSplitter::new(codec)), + CodecArg::Vp8 | CodecArg::Vp9 | CodecArg::Av1 => Demuxer::Ivf(IvfReader::new(codec)), + } + } + + fn feed(&mut self, chunk: &[u8], out: &mut Vec>) { + match self { + Demuxer::AnnexB(s) => s.feed(chunk, out), + Demuxer::Ivf(r) => r.feed(chunk, out), + } + } + + /// True if the demuxer has detected a byte misalignment it can't + /// recover from without a fresh TCP connection. Only meaningful + /// for IVF today. + fn desynced(&self) -> bool { + match self { + Demuxer::AnnexB(_) => false, + Demuxer::Ivf(r) => r.desynced, + } + } +} + +/// Reads IVF-framed video off the wire and emits one compressed video +/// frame per call to `feed` per available frame. Format per libvpx: +/// +/// File header (32 bytes, optional): "DKIF", u16 version, u16 +/// header_len, 4-byte FOURCC, u16 width, u16 height, u32 tb_num, +/// u32 tb_den, u32 frame_count, u32 unused. +/// +/// Frame header (12 bytes each): u32 frame_size, u64 pts. +/// +/// Frame payload: `frame_size` bytes. All integers little-endian. +/// +/// The file header is *optional* in our parser: gstreamer's +/// `avmux_ivf` on a non-seekable `tcpserversink` emits only per-frame +/// records (libavformat writes `DKIF` at `write_header` time, but the +/// ffmpeg AVIO wrapper in gst-libav appears to swallow it when the +/// output is non-seekable). 
We still accept `ivfmux` (native +/// gst-plugins-bad element), which does emit `DKIF`, by parsing the +/// file header if it's the first 4 bytes. Either way, gstreamer's +/// one-buffer-per-packet semantics mean new `tcpserversink` clients +/// land on an IVF record boundary. +/// +/// If we ever parse a `frame_size` that exceeds [`MAX_FRAME_BYTES`], +/// we're byte-misaligned (should be rare in practice); the reader +/// flips `desynced=true`, which the main loop reads to force a TCP +/// reconnect and a fresh alignment from the next gstreamer buffer. +const MAX_FRAME_BYTES: usize = 8 * 1024 * 1024; + +struct IvfReader { + codec: CodecArg, + buf: Vec, + /// Set once we've either consumed a 32-byte DKIF header or + /// decided there isn't one. After this, `buf` is interpreted as + /// back-to-back 12-byte-header + payload records. + header_phase_done: bool, + /// True if a frame_size field was absurd; main loop should + /// disconnect and reconnect to re-align. + desynced: bool, +} + +impl IvfReader { + fn new(codec: CodecArg) -> Self { + Self { + codec, + buf: Vec::with_capacity(256 * 1024), + header_phase_done: false, + desynced: false, + } + } + + fn feed(&mut self, chunk: &[u8], out: &mut Vec>) { + self.buf.extend_from_slice(chunk); + + if !self.header_phase_done { + // Decide whether the stream starts with a DKIF file header. + // We need at least 4 bytes to check the magic, and 32 to + // consume the full header if present. 
+ if self.buf.len() < 4 { + return; + } + if &self.buf[0..4] == b"DKIF" { + if self.buf.len() < 32 { + return; + } + let fourcc = &self.buf[8..12]; + if let Some(expected) = self.codec.ivf_fourcc() { + if fourcc != expected { + warn!( + "IVF: expected FOURCC {:?} for {}, got {:?}", + std::str::from_utf8(expected).unwrap_or("?"), + self.codec.name(), + std::str::from_utf8(fourcc).unwrap_or("?"), + ); + } + } + info!( + "IVF: file header OK (codec fourcc={})", + std::str::from_utf8(fourcc).unwrap_or("?") + ); + self.buf.drain(..32); + } else { + // No file header — typical for gstreamer's `avmux_ivf` + // on tcpserversink. Gstreamer buffer boundaries keep + // us frame-aligned, so treat byte 0 as the start of a + // per-frame record. + info!( + "IVF: no DKIF file header on this stream (typical for gstreamer \ + avmux_ivf on tcpserversink); parsing per-frame records directly" + ); + } + self.header_phase_done = true; + } + + // Emit as many whole frames as we have. + loop { + if self.buf.len() < 12 { + return; + } + let size = + u32::from_le_bytes([self.buf[0], self.buf[1], self.buf[2], self.buf[3]]) as usize; + if size == 0 || size > MAX_FRAME_BYTES { + warn!( + "IVF: implausible frame_size={size} bytes — byte stream is misaligned. \ + Dropping connection so the main loop can reconnect and re-anchor on the \ + next gstreamer buffer boundary." + ); + self.desynced = true; + self.buf.clear(); + return; + } + if self.buf.len() < 12 + size { + return; + } + let frame = self.buf[12..12 + size].to_vec(); + self.buf.drain(..12 + size); + out.push(frame); + } + } +} + +/// Splits an incoming Annex-B bytestream into access units on AUD +/// boundaries. The AUD NAL type and NAL-type extraction are codec +/// specific — pass the right `CodecArg`. +/// +/// Relies on the upstream parser emitting an AUD at the start of every +/// AU (`x264enc aud=true` for H.264, `x265enc option-string="aud=1"` +/// plumbed through `h265parse` for H.265). 
Bytes before the first AUD +/// are discarded; each subsequent AU is emitted when the *next* AU's +/// AUD arrives (so there's always one AU of buffering lag, bounded by +/// the frame interval). +struct AuSplitter { + codec: CodecArg, + buf: Vec, + /// Offset (into `buf`) of the start code of the AU currently being + /// accumulated. `None` before the first AUD has been observed. + au_start: Option, + /// Position up to which `buf` has already been scanned for start codes. + scan_pos: usize, +} + +impl AuSplitter { + fn new(codec: CodecArg) -> Self { + Self { codec, buf: Vec::with_capacity(256 * 1024), au_start: None, scan_pos: 0 } + } + + fn feed(&mut self, chunk: &[u8], out: &mut Vec>) { + self.buf.extend_from_slice(chunk); + + // Scan for start codes. We need 4 more bytes to decide (3-byte + // start code + 1 NAL header byte). A 4-byte start code is detected + // one byte earlier and handled naturally as "zero byte, then + // 3-byte start code" collapsing into a 4-byte pattern. + let aud = self.codec.aud_nal_type(); + while self.scan_pos + 3 < self.buf.len() { + let i = self.scan_pos; + let (sc_start, sc_len) = if i + 4 <= self.buf.len() + && self.buf[i] == 0 + && self.buf[i + 1] == 0 + && self.buf[i + 2] == 0 + && self.buf[i + 3] == 1 + { + // 4-byte start code at i. We still need the NAL header byte after it. + if i + 5 > self.buf.len() { + break; + } + (i, 4) + } else if self.buf[i] == 0 && self.buf[i + 1] == 0 && self.buf[i + 2] == 1 { + (i, 3) + } else { + self.scan_pos += 1; + continue; + }; + + let nal_off = sc_start + sc_len; + if self.codec.nal_type(self.buf[nal_off]) == aud { + // AUD — boundary between AUs. + if let Some(start) = self.au_start.take() { + out.push(self.buf[start..sc_start].to_vec()); + } + self.au_start = Some(sc_start); + } + self.scan_pos = nal_off + 1; + } + + // Compact: drop bytes before the current AU start (or before the + // last 3 bytes, in case a start code straddles the next feed). 
+ let drain_before = self.au_start.unwrap_or_else(|| self.buf.len().saturating_sub(3)); + if drain_before > 0 { + self.buf.drain(..drain_before); + self.scan_pos = self.scan_pos.saturating_sub(drain_before); + if self.au_start.is_some() { + self.au_start = Some(0); + } + } + } +} + +/// Minimal keyframe probe. For H.264/H.265 it scans for a keyframe +/// NAL (IDR slice / IRAP); for VP8 it reads bit 0 of the frame tag +/// (RFC 6386 §9.1: 0 = keyframe, 1 = interframe); for VP9 it decodes +/// the leading bits of the uncompressed header (VP9 bitstream spec +/// §6.2); for AV1 it scans the OBUs in the Temporal Unit for an +/// OBU_SEQUENCE_HEADER (which libaom/SVT-AV1/rav1e only emit at +/// keyframes — this is the same heuristic WebRTC's own AV1 RTP +/// packetizer uses). +fn is_keyframe(codec: CodecArg, data: &[u8]) -> bool { + match codec { + CodecArg::H264 | CodecArg::H265 => is_keyframe_annex_b(codec, data), + CodecArg::Vp8 => !data.is_empty() && (data[0] & 0x01) == 0, + CodecArg::Vp9 => is_keyframe_vp9(data), + CodecArg::Av1 => is_keyframe_av1(data), + } +} + +/// AV1 keyframe probe. Walks the OBUs in a Temporal Unit and returns +/// true if any OBU has type `OBU_SEQUENCE_HEADER` (1). AV1 spec §5.3.2 +/// (OBU header) + §5.3.1 (leb128): +/// +/// * byte 0 bits 6..=3: `obu_type`. +/// * byte 0 bit 2: `obu_extension_flag`; if set, one extension byte +/// follows. +/// * byte 0 bit 1: `obu_has_size_field`; if set, a leb128-encoded +/// `obu_size` follows and gives the payload length. If clear, the +/// OBU runs to the end of the input (legacy AV1) — so we stop +/// scanning because we can't skip it. +/// +/// Assumes the Low Overhead Bitstream Format produced by gstreamer's +/// `av1parse stream-format=obu-stream,alignment=tu` + `avmux_ivf`: +/// one Temporal Unit per IVF record, each OBU carries its own size. 
+fn is_keyframe_av1(mut data: &[u8]) -> bool { + const OBU_SEQUENCE_HEADER: u8 = 1; + while !data.is_empty() { + let header = data[0]; + let obu_type = (header >> 3) & 0x0F; + let ext = (header & 0x04) != 0; + let has_size = (header & 0x02) != 0; + + let mut off = 1; + if ext { + if off >= data.len() { + return false; + } + off += 1; + } + if !has_size { + // No size field means we can't skip to the next OBU; treat + // this OBU as the last one and decide based on what we've + // seen so far. + return obu_type == OBU_SEQUENCE_HEADER; + } + let (size, size_len) = match read_leb128(&data[off..]) { + Some(v) => v, + None => return false, + }; + off += size_len; + let payload_end = match off.checked_add(size as usize) { + Some(e) if e <= data.len() => e, + _ => return false, + }; + if obu_type == OBU_SEQUENCE_HEADER { + return true; + } + data = &data[payload_end..]; + } + false +} + +/// Decodes an AV1 leb128 (unsigned little-endian base-128) integer. +/// Returns `(value, bytes_consumed)` or `None` on truncated input. +/// AV1 spec §4.10.5 caps the encoding at 8 bytes and 32 significant +/// bits; we enforce the 8-byte limit and keep the value in a u32. +fn read_leb128(input: &[u8]) -> Option<(u32, usize)> { + let mut value: u64 = 0; + for (i, &byte) in input.iter().take(8).enumerate() { + value |= ((byte & 0x7F) as u64) << (i * 7); + if (byte & 0x80) == 0 { + return u32::try_from(value).ok().map(|v| (v, i + 1)); + } + } + None +} + +/// VP9 uncompressed-header keyframe probe. Reads first-byte bits (MSB +/// first) per VP9 bitstream spec §6.2: +/// +/// * bits 7..=6: `frame_marker` (must be `0b10`). +/// * bit 5: `profile_low_bit`, bit 4: `profile_high_bit` +/// (combined `profile` ∈ 0..=3). +/// * For `profile == 3`: bit 3 is reserved-zero, bit 2 is +/// `show_existing_frame`, bit 1 is `frame_type`. +/// * For `profile != 3`: bit 3 is `show_existing_frame`, bit 2 is +/// `frame_type`. +/// +/// A keyframe has `show_existing_frame == 0` and `frame_type == 0`. 
+/// `show_existing_frame == 1` records redisplay a previously decoded +/// buffer and carry no new coded data, so they are explicitly not +/// keyframes. +fn is_keyframe_vp9(data: &[u8]) -> bool { + let Some(&b0) = data.first() else { + return false; + }; + if (b0 >> 6) & 0b11 != 0b10 { + return false; + } + let profile_low = (b0 >> 5) & 0x1; + let profile_high = (b0 >> 4) & 0x1; + let profile = (profile_high << 1) | profile_low; + let (show_existing_bit, frame_type_bit) = if profile == 3 { (2, 1) } else { (3, 2) }; + let show_existing = (b0 >> show_existing_bit) & 0x1; + if show_existing != 0 { + return false; + } + let frame_type = (b0 >> frame_type_bit) & 0x1; + frame_type == 0 +} + +fn is_keyframe_annex_b(codec: CodecArg, data: &[u8]) -> bool { + let mut i = 0usize; + while i + 3 < data.len() { + let is_four = i + 4 <= data.len() + && data[i] == 0 + && data[i + 1] == 0 + && data[i + 2] == 0 + && data[i + 3] == 1; + let is_three = data[i] == 0 && data[i + 1] == 0 && data[i + 2] == 1; + if is_four || is_three { + let payload_idx = if is_four { i + 4 } else { i + 3 }; + if payload_idx < data.len() && codec.is_keyframe_nal(codec.nal_type(data[payload_idx])) + { + return true; + } + i = payload_idx + 1; + } else { + i += 1; + } + } + false +} + +#[tokio::main] +async fn main() -> Result<()> { + env_logger::init(); + let args = Args::parse(); + + let shutdown = Arc::new(AtomicBool::new(false)); + tokio::spawn({ + let shutdown = shutdown.clone(); + async move { + let _ = tokio::signal::ctrl_c().await; + shutdown.store(true, Ordering::Release); + info!("Ctrl-C received, shutting down..."); + } + }); + + let token = access_token::AccessToken::with_api_key(&args.api_key, &args.api_secret) + .with_identity(&args.identity) + .with_name(&args.identity) + .with_grants(access_token::VideoGrants { + room_join: true, + room: args.room.clone(), + can_publish: true, + ..Default::default() + }) + .to_jwt()?; + + info!("Connecting to LiveKit room '{}' as '{}'...", args.room, 
args.identity); + let mut room_options = RoomOptions::default(); + room_options.auto_subscribe = false; + room_options.dynacast = false; + let (room, _events) = Room::connect(&args.url, &token, room_options).await?; + let room = Arc::new(room); + info!("Connected: {} (sid {})", room.name(), room.sid().await); + + let resolution = VideoResolution { width: args.width, height: args.height }; + let source = NativeEncodedVideoSource::new(args.codec.webrtc_codec(), resolution); + let target_bitrate_bps = Arc::new(AtomicU64::new(0)); + source.set_observer(Arc::new(LoggingObserver::new(target_bitrate_bps.clone()))); + info!( + "Created encoded {} source: {}x{} (source_id={})", + args.codec.name(), + args.width, + args.height, + source.source_id() + ); + + let track_name = match args.codec { + CodecArg::H264 => "encoded-h264", + CodecArg::H265 => "encoded-h265", + CodecArg::Vp8 => "encoded-vp8", + CodecArg::Vp9 => "encoded-vp9", + CodecArg::Av1 => "encoded-av1", + }; + let track = + LocalVideoTrack::create_video_track(track_name, RtcVideoSource::Encoded(source.clone())); + + let publish_opts = TrackPublishOptions { + source: TrackSource::Camera, + simulcast: false, + video_codec: args.codec.livekit_codec(), + video_encoding: Some(VideoEncoding { + max_bitrate: args.max_bitrate_kbps.saturating_mul(1000), + max_framerate: args.max_framerate, + }), + ..Default::default() + }; + room.local_participant() + .publish_track(LocalTrack::Video(track), publish_opts) + .await + .context("publish_track failed")?; + info!( + "Published encoded {} track (max {} kbps @ {:.1} fps)", + args.codec.name(), + args.max_bitrate_kbps, + args.max_framerate + ); + + let frames_accepted = Arc::new(AtomicU64::new(0)); + let frames_dropped = Arc::new(AtomicU64::new(0)); + let keyframes = Arc::new(AtomicU64::new(0)); + let encoded_bytes = Arc::new(AtomicU64::new(0)); + + { + let frames_accepted = frames_accepted.clone(); + let frames_dropped = frames_dropped.clone(); + let keyframes = keyframes.clone(); 
+ let encoded_bytes = encoded_bytes.clone(); + let target_bitrate_bps = target_bitrate_bps.clone(); + tokio::spawn(async move { + let mut last = Instant::now(); + loop { + sleep(Duration::from_secs(2)).await; + let elapsed = last.elapsed().as_secs_f64(); + last = Instant::now(); + let ok = frames_accepted.swap(0, Ordering::Relaxed); + let dropped = frames_dropped.swap(0, Ordering::Relaxed); + let kf = keyframes.swap(0, Ordering::Relaxed); + let bytes = encoded_bytes.swap(0, Ordering::Relaxed); + if ok + dropped > 0 { + let encoded_kbps = bytes as f64 * 8.0 / elapsed / 1000.0; + let target_kbps = target_bitrate_bps.load(Ordering::Relaxed) / 1000; + info!( + "ingest: {:.1} fps accepted, {:.1} fps dropped, {:.0} kbps encoded \ + (target {} kbps), {} keyframes", + ok as f64 / elapsed, + dropped as f64 / elapsed, + encoded_kbps, + target_kbps, + kf + ); + } + } + }); + } + + // Reconnect loop: if gstreamer restarts, we come back up automatically. + while !shutdown.load(Ordering::Acquire) { + let addr = format!("{}:{}", args.tcp_host, args.tcp_port); + let framing = match args.codec { + CodecArg::H264 | CodecArg::H265 => "Annex-B", + CodecArg::Vp8 | CodecArg::Vp9 | CodecArg::Av1 => "IVF", + }; + info!("Connecting to {addr} for {} {framing} bytestream...", args.codec.name()); + let mut stream = match TcpStream::connect(&addr).await { + Ok(s) => s, + Err(e) => { + warn!("connect {addr} failed: {e}. Retrying in 1s..."); + sleep(Duration::from_secs(1)).await; + continue; + } + }; + let _ = stream.set_nodelay(true); + info!("Connected to {addr}"); + + let mut demuxer = Demuxer::new(args.codec); + let mut read_buf = vec![0u8; 64 * 1024]; + let mut out = Vec::new(); + loop { + if shutdown.load(Ordering::Acquire) { + break; + } + let n = tokio::select! 
{ + r = stream.read(&mut read_buf) => r, + _ = sleep(Duration::from_millis(250)) => continue, + }; + let n = match n { + Ok(0) => { + warn!("gstreamer closed the connection"); + break; + } + Ok(n) => n, + Err(e) => { + warn!("read error: {e}"); + break; + } + }; + + out.clear(); + demuxer.feed(&read_buf[..n], &mut out); + if demuxer.desynced() { + warn!("demuxer reported desync — dropping TCP connection to re-align"); + break; + } + for au in out.drain(..) { + encoded_bytes.fetch_add(au.len() as u64, Ordering::Relaxed); + let is_keyframe = is_keyframe(args.codec, &au); + if is_keyframe { + keyframes.fetch_add(1, Ordering::Relaxed); + } + let info = EncodedFrameInfo { + is_keyframe, + has_sps_pps: false, // the source scans+prepends SPS/PPS as needed + width: args.width, + height: args.height, + capture_time_us: 0, + }; + if source.capture_frame(&au, &info) { + frames_accepted.fetch_add(1, Ordering::Relaxed); + } else { + frames_dropped.fetch_add(1, Ordering::Relaxed); + warn!( + "capture_frame dropped AU ({} bytes, keyframe={})", + au.len(), + is_keyframe + ); + } + } + } + + if !shutdown.load(Ordering::Acquire) { + sleep(Duration::from_secs(1)).await; + } + } + + info!("Shutting down..."); + Ok(()) +} diff --git a/libwebrtc/Cargo.toml b/libwebrtc/Cargo.toml index aab4ff181..5f2c689f7 100644 --- a/libwebrtc/Cargo.toml +++ b/libwebrtc/Cargo.toml @@ -14,6 +14,7 @@ default = [ "glib-main-loop" ] # event loop running in your application, for example if you are using the # GTK or GStreamer Rust bindings, disable this feature. 
glib-main-loop = [ "dep:glib" ] +encoded-video = [ "webrtc-sys/encoded-video" ] [dependencies] livekit-protocol = { workspace = true } diff --git a/libwebrtc/src/lib.rs b/libwebrtc/src/lib.rs index bf4ad8294..d11d52c3b 100644 --- a/libwebrtc/src/lib.rs +++ b/libwebrtc/src/lib.rs @@ -68,6 +68,8 @@ pub mod video_track; pub mod native { pub use webrtc_sys::webrtc::ffi::create_random_uuid; + #[cfg(feature = "encoded-video")] + pub use crate::imp::encoded_video_source; pub use crate::imp::{ apm, audio_mixer, audio_resampler, frame_cryptor, packet_trailer, yuv_helper, }; diff --git a/libwebrtc/src/native/encoded_video_source.rs b/libwebrtc/src/native/encoded_video_source.rs new file mode 100644 index 000000000..a2367c27b --- /dev/null +++ b/libwebrtc/src/native/encoded_video_source.rs @@ -0,0 +1,213 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::{ + fmt::{Debug, Formatter}, + sync::Arc, +}; + +use cxx::SharedPtr; +use parking_lot::Mutex; +use webrtc_sys::encoded_video_source as sys_evs; + +use crate::video_source::{EncodedFrameInfo, VideoCodec, VideoResolution}; + +/// Observer that receives encoder-side feedback (keyframe requests, bitrate +/// updates) for a [`NativeEncodedVideoSource`]. +/// +/// Callbacks are invoked on internal WebRTC threads; implementers MUST be +/// cheap and non-blocking. +pub trait EncodedVideoSourceObserver: Send + Sync { + /// Called when the receiver requests a keyframe (PLI/FIR). 
+ fn on_keyframe_requested(&self); + + /// Called when the WebRTC bandwidth estimator updates the target + /// bitrate / framerate for this source. + fn on_target_bitrate(&self, bitrate_bps: u32, framerate_fps: f64); +} + +impl From for sys_evs::ffi::EncodedVideoCodecType { + fn from(codec: VideoCodec) -> Self { + match codec { + VideoCodec::H264 => Self::H264, + VideoCodec::H265 => Self::H265, + VideoCodec::Vp8 => Self::Vp8, + VideoCodec::Vp9 => Self::Vp9, + VideoCodec::Av1 => Self::Av1, + } + } +} + +impl From for VideoCodec { + fn from(codec: sys_evs::ffi::EncodedVideoCodecType) -> Self { + match codec { + sys_evs::ffi::EncodedVideoCodecType::H264 => Self::H264, + sys_evs::ffi::EncodedVideoCodecType::H265 => Self::H265, + sys_evs::ffi::EncodedVideoCodecType::Vp8 => Self::Vp8, + sys_evs::ffi::EncodedVideoCodecType::Vp9 => Self::Vp9, + sys_evs::ffi::EncodedVideoCodecType::Av1 => Self::Av1, + _ => Self::H264, + } + } +} + +struct Inner { + resolution: Mutex, +} + +/// A video source that accepts encoded compressed frames (H.264, H.265, +/// VP8, VP9, AV1) instead of raw pixels. WebRTC's encoder is bypassed for +/// tracks bound to this source — frames flow straight from `capture_frame` +/// into RTP packetization and congestion control. +/// +/// A source carries a single encoded stream (one resolution, one codec). For +/// simulcast, create several sources and publish them on separate tracks. 
+#[derive(Clone)] +pub struct NativeEncodedVideoSource { + sys_handle: SharedPtr, + inner: Arc, +} + +impl Debug for NativeEncodedVideoSource { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("NativeEncodedVideoSource") + .field("source_id", &self.source_id()) + .field("codec", &self.codec()) + .finish() + } +} + +impl NativeEncodedVideoSource { + pub fn new(codec: VideoCodec, resolution: VideoResolution) -> Self { + let sys_handle = sys_evs::ffi::new_encoded_video_track_source( + codec.into(), + resolution.width, + resolution.height, + ); + Self { sys_handle, inner: Arc::new(Inner { resolution: Mutex::new(resolution) }) } + } + + /// Unique non-zero id assigned to this source. Exposed for debugging / + /// tracing; callers do not need to inspect it. + pub fn source_id(&self) -> u16 { + self.sys_handle.source_id() + } + + pub fn codec(&self) -> VideoCodec { + self.sys_handle.codec().into() + } + + pub fn video_resolution(&self) -> VideoResolution { + self.inner.resolution.lock().clone() + } + + /// Push an encoded (compressed) frame to the track. Returns `true` if the frame was + /// accepted, `false` if the internal queue was full and the frame had to + /// be dropped. + pub fn capture_frame(&self, data: &[u8], info: &EncodedFrameInfo) -> bool { + { + let mut res = self.inner.resolution.lock(); + if info.width != 0 && info.height != 0 { + res.width = info.width; + res.height = info.height; + } + } + + self.sys_handle.capture_frame( + data, + info.is_keyframe, + info.has_sps_pps, + info.width, + info.height, + info.capture_time_us, + ) + } + + /// Register an observer for encoder-side feedback. The previous observer + /// (if any) is dropped. 
+ pub fn set_observer(&self, observer: Arc) { + let wrapper = Box::new(sys_evs::EncodedVideoSourceWrapper::new(Arc::new(ObserverBridge { + inner: observer, + }))); + self.sys_handle.set_observer(wrapper); + } + + pub fn sys_handle(&self) -> SharedPtr { + self.sys_handle.clone() + } +} + +/// Adapts a `libwebrtc`-level observer trait object to the +/// `webrtc-sys`-level observer trait expected by the cxx bridge. +struct ObserverBridge { + inner: Arc, +} + +impl sys_evs::EncodedVideoSourceObserver for ObserverBridge { + fn on_keyframe_requested(&self) { + self.inner.on_keyframe_requested(); + } + + fn on_target_bitrate(&self, bitrate_bps: u32, framerate_fps: f64) { + self.inner.on_target_bitrate(bitrate_bps, framerate_fps); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn encoded_source_reports_codec_and_updates_resolution_from_frames() { + let source = NativeEncodedVideoSource::new( + VideoCodec::Av1, + VideoResolution { width: 640, height: 360 }, + ); + + assert_ne!(source.source_id(), 0); + assert_eq!(source.codec(), VideoCodec::Av1); + assert_eq!(source.video_resolution().width, 640); + assert_eq!(source.video_resolution().height, 360); + + let info = EncodedFrameInfo { + is_keyframe: true, + width: 1280, + height: 720, + capture_time_us: 123_456, + ..Default::default() + }; + + assert!(source.capture_frame(&[0x0A, 0x00], &info)); + assert_eq!(source.video_resolution().width, 1280); + assert_eq!(source.video_resolution().height, 720); + } + + #[test] + fn encoded_source_prefers_buffered_keyframe_over_incoming_delta_when_full() { + let source = NativeEncodedVideoSource::new( + VideoCodec::H264, + VideoResolution { width: 640, height: 360 }, + ); + let keyframe = + EncodedFrameInfo { is_keyframe: true, width: 640, height: 360, ..Default::default() }; + let delta = EncodedFrameInfo { width: 640, height: 360, ..Default::default() }; + + assert!(source.capture_frame(&[0, 0, 0, 1, 0x65], &keyframe)); + for _ in 0..7 { + 
assert!(source.capture_frame(&[0, 0, 0, 1, 0x41], &delta)); + } + + assert!(!source.capture_frame(&[0, 0, 0, 1, 0x41], &delta)); + } +} diff --git a/libwebrtc/src/native/mod.rs b/libwebrtc/src/native/mod.rs index de56e3345..87a41e364 100644 --- a/libwebrtc/src/native/mod.rs +++ b/libwebrtc/src/native/mod.rs @@ -23,6 +23,8 @@ pub mod audio_track; pub mod data_channel; #[cfg(any(target_os = "macos", target_os = "windows", target_os = "linux"))] pub mod desktop_capturer; +#[cfg(feature = "encoded-video")] +pub mod encoded_video_source; pub mod frame_cryptor; pub mod ice_candidate; pub mod media_stream; diff --git a/libwebrtc/src/native/peer_connection_factory.rs b/libwebrtc/src/native/peer_connection_factory.rs index ae082aecc..60f3e38fc 100644 --- a/libwebrtc/src/native/peer_connection_factory.rs +++ b/libwebrtc/src/native/peer_connection_factory.rs @@ -19,6 +19,8 @@ use lazy_static::lazy_static; use parking_lot::Mutex; use webrtc_sys::{peer_connection_factory as sys_pcf, rtc_error as sys_err, webrtc as sys_rtc}; +#[cfg(feature = "encoded-video")] +use crate::video_source::native::NativeEncodedVideoSource; use crate::{ audio_source::native::NativeAudioSource, audio_track::RtcAudioTrack, @@ -81,6 +83,20 @@ impl PeerConnectionFactory { } } + #[cfg(feature = "encoded-video")] + pub fn create_video_track_from_encoded_source( + &self, + label: &str, + source: NativeEncodedVideoSource, + ) -> RtcVideoTrack { + RtcVideoTrack { + handle: imp_vt::RtcVideoTrack::new( + self.sys_handle + .create_video_track_from_encoded_source(label.to_string(), source.sys_handle()), + ), + } + } + pub fn create_audio_track(&self, label: &str, source: NativeAudioSource) -> RtcAudioTrack { RtcAudioTrack { handle: imp_at::RtcAudioTrack { diff --git a/libwebrtc/src/peer_connection_factory.rs b/libwebrtc/src/peer_connection_factory.rs index 12f6d24bc..bccbf7156 100644 --- a/libwebrtc/src/peer_connection_factory.rs +++ b/libwebrtc/src/peer_connection_factory.rs @@ -86,6 +86,8 @@ impl 
PeerConnectionFactory { pub mod native { use super::PeerConnectionFactory; + #[cfg(feature = "encoded-video")] + use crate::video_source::native::NativeEncodedVideoSource; use crate::{ audio_source::native::NativeAudioSource, audio_track::RtcAudioTrack, video_source::native::NativeVideoSource, video_track::RtcVideoTrack, @@ -93,6 +95,12 @@ pub mod native { pub trait PeerConnectionFactoryExt { fn create_video_track(&self, label: &str, source: NativeVideoSource) -> RtcVideoTrack; + #[cfg(feature = "encoded-video")] + fn create_video_track_from_encoded_source( + &self, + label: &str, + source: NativeEncodedVideoSource, + ) -> RtcVideoTrack; fn create_audio_track(&self, label: &str, source: NativeAudioSource) -> RtcAudioTrack; } @@ -101,6 +109,15 @@ pub mod native { self.handle.create_video_track(label, source) } + #[cfg(feature = "encoded-video")] + fn create_video_track_from_encoded_source( + &self, + label: &str, + source: NativeEncodedVideoSource, + ) -> RtcVideoTrack { + self.handle.create_video_track_from_encoded_source(label, source) + } + fn create_audio_track(&self, label: &str, source: NativeAudioSource) -> RtcAudioTrack { self.handle.create_audio_track(label, source) } diff --git a/libwebrtc/src/video_source.rs b/libwebrtc/src/video_source.rs index f0404ea8b..e4f99f957 100644 --- a/libwebrtc/src/video_source.rs +++ b/libwebrtc/src/video_source.rs @@ -29,15 +29,60 @@ impl Default for VideoResolution { } } +/// Codec used by an encoded video feed. +#[cfg(feature = "encoded-video")] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub enum VideoCodec { + H264, + H265, + Vp8, + Vp9, + Av1, +} + +/// Metadata describing a single encoded video frame pushed to an +/// [`native::NativeEncodedVideoSource`]. +#[cfg(feature = "encoded-video")] +#[derive(Debug, Copy, Clone)] +pub struct EncodedFrameInfo { + /// True when this frame is an IDR / keyframe. + pub is_keyframe: bool, + /// True when the `data` buffer already has SPS/PPS (or equivalent) + /// prepended. 
H.264/H.265 only; ignored for other codecs. + pub has_sps_pps: bool, + pub width: u32, + pub height: u32, + /// Capture timestamp in microseconds. `0` lets the source stamp `now`. + pub capture_time_us: i64, +} + +#[cfg(feature = "encoded-video")] +impl Default for EncodedFrameInfo { + fn default() -> Self { + Self { is_keyframe: false, has_sps_pps: false, width: 0, height: 0, capture_time_us: 0 } + } +} + #[non_exhaustive] #[derive(Debug, Clone)] pub enum RtcVideoSource { // TODO(theomonnom): Web video sources (eq. to tracks on browsers?) #[cfg(not(target_arch = "wasm32"))] Native(native::NativeVideoSource), + #[cfg(all(not(target_arch = "wasm32"), feature = "encoded-video"))] + Encoded(native::NativeEncodedVideoSource), } // TODO(theomonnom): Support enum dispatch with conditional compilation? +#[cfg(all(not(target_arch = "wasm32"), feature = "encoded-video"))] +impl RtcVideoSource { + enum_dispatch!( + [Native, Encoded]; + pub fn video_resolution(self: &Self) -> VideoResolution; + ); +} + +#[cfg(all(not(target_arch = "wasm32"), not(feature = "encoded-video")))] impl RtcVideoSource { enum_dispatch!( [Native]; @@ -49,6 +94,11 @@ impl RtcVideoSource { pub mod native { use std::fmt::{Debug, Formatter}; + #[cfg(feature = "encoded-video")] + pub use crate::native::encoded_video_source::{ + EncodedVideoSourceObserver, NativeEncodedVideoSource, + }; + use super::*; use crate::native::packet_trailer::PacketTrailerHandler; use crate::video_frame::{VideoBuffer, VideoFrame}; diff --git a/livekit-ffi-node-bindings/proto/ffi_pb.d.ts b/livekit-ffi-node-bindings/proto/ffi_pb.d.ts index 05d98054d..54cc664ae 100644 --- a/livekit-ffi-node-bindings/proto/ffi_pb.d.ts +++ b/livekit-ffi-node-bindings/proto/ffi_pb.d.ts @@ -21,7 +21,7 @@ import type { BinaryReadOptions, FieldList, JsonReadOptions, JsonValue, PartialM import { Message, proto2 } from "@bufbuild/protobuf"; import type { ConnectCallback, ConnectRequest, ConnectResponse, DisconnectCallback, DisconnectRequest, 
DisconnectResponse, EditChatMessageRequest, GetSessionStatsCallback, GetSessionStatsRequest, GetSessionStatsResponse, PublishDataCallback, PublishDataRequest, PublishDataResponse, PublishSipDtmfCallback, PublishSipDtmfRequest, PublishSipDtmfResponse, PublishTrackCallback, PublishTrackRequest, PublishTrackResponse, PublishTranscriptionCallback, PublishTranscriptionRequest, PublishTranscriptionResponse, RoomEvent, SendChatMessageCallback, SendChatMessageRequest, SendChatMessageResponse, SendStreamChunkCallback, SendStreamChunkRequest, SendStreamChunkResponse, SendStreamHeaderCallback, SendStreamHeaderRequest, SendStreamHeaderResponse, SendStreamTrailerCallback, SendStreamTrailerRequest, SendStreamTrailerResponse, SetDataChannelBufferedAmountLowThresholdRequest, SetDataChannelBufferedAmountLowThresholdResponse, SetLocalAttributesCallback, SetLocalAttributesRequest, SetLocalAttributesResponse, SetLocalMetadataCallback, SetLocalMetadataRequest, SetLocalMetadataResponse, SetLocalNameCallback, SetLocalNameRequest, SetLocalNameResponse, SetSubscribedRequest, SetSubscribedResponse, UnpublishTrackCallback, UnpublishTrackRequest, UnpublishTrackResponse } from "./room_pb.js"; import type { CreateAudioTrackRequest, CreateAudioTrackResponse, CreateVideoTrackRequest, CreateVideoTrackResponse, EnableRemoteTrackRequest, EnableRemoteTrackResponse, GetStatsCallback, GetStatsRequest, GetStatsResponse, LocalTrackMuteRequest, LocalTrackMuteResponse, SetTrackSubscriptionPermissionsRequest, SetTrackSubscriptionPermissionsResponse, TrackEvent } from "./track_pb.js"; -import type { CaptureVideoFrameRequest, CaptureVideoFrameResponse, NewVideoSourceRequest, NewVideoSourceResponse, NewVideoStreamRequest, NewVideoStreamResponse, VideoConvertRequest, VideoConvertResponse, VideoStreamEvent, VideoStreamFromParticipantRequest, VideoStreamFromParticipantResponse } from "./video_frame_pb.js"; +import type { CaptureEncodedVideoFrameRequest, CaptureEncodedVideoFrameResponse, CaptureVideoFrameRequest, 
CaptureVideoFrameResponse, EncodedVideoSourceEvent, NewVideoSourceRequest, NewVideoSourceResponse, NewVideoStreamRequest, NewVideoStreamResponse, VideoConvertRequest, VideoConvertResponse, VideoStreamEvent, VideoStreamFromParticipantRequest, VideoStreamFromParticipantResponse } from "./video_frame_pb.js"; import type { ApmProcessReverseStreamRequest, ApmProcessReverseStreamResponse, ApmProcessStreamRequest, ApmProcessStreamResponse, ApmSetStreamDelayRequest, ApmSetStreamDelayResponse, AudioStreamEvent, AudioStreamFromParticipantRequest, AudioStreamFromParticipantResponse, CaptureAudioFrameCallback, CaptureAudioFrameRequest, CaptureAudioFrameResponse, ClearAudioBufferRequest, ClearAudioBufferResponse, FlushSoxResamplerRequest, FlushSoxResamplerResponse, LoadAudioFilterPluginRequest, LoadAudioFilterPluginResponse, NewApmRequest, NewApmResponse, NewAudioResamplerRequest, NewAudioResamplerResponse, NewAudioSourceRequest, NewAudioSourceResponse, NewAudioStreamRequest, NewAudioStreamResponse, NewSoxResamplerRequest, NewSoxResamplerResponse, PushSoxResamplerRequest, PushSoxResamplerResponse, RemixAndResampleRequest, RemixAndResampleResponse } from "./audio_frame_pb.js"; import type { E2eeRequest, E2eeResponse } from "./e2ee_pb.js"; import type { PerformRpcCallback, PerformRpcRequest, PerformRpcResponse, RegisterRpcMethodRequest, RegisterRpcMethodResponse, RpcMethodInvocationEvent, RpcMethodInvocationResponseRequest, RpcMethodInvocationResponseResponse, UnregisterRpcMethodRequest, UnregisterRpcMethodResponse } from "./rpc_pb.js"; @@ -537,6 +537,14 @@ export declare class FfiRequest extends Message { */ value: DataTrackStreamReadRequest; case: "dataTrackStreamRead"; + } | { + /** + * Encoded video + * + * @generated from field: livekit.proto.CaptureEncodedVideoFrameRequest capture_encoded_video_frame = 76; + */ + value: CaptureEncodedVideoFrameRequest; + case: "captureEncodedVideoFrame"; } | { case: undefined; value?: undefined }; constructor(data?: PartialMessage); @@ 
-1025,6 +1033,14 @@ export declare class FfiResponse extends Message { */ value: DataTrackStreamReadResponse; case: "dataTrackStreamRead"; + } | { + /** + * Encoded video + * + * @generated from field: livekit.proto.CaptureEncodedVideoFrameResponse capture_encoded_video_frame = 75; + */ + value: CaptureEncodedVideoFrameResponse; + case: "captureEncodedVideoFrame"; } | { case: undefined; value?: undefined }; constructor(data?: PartialMessage); @@ -1313,6 +1329,14 @@ export declare class FfiEvent extends Message { */ value: DataTrackStreamEvent; case: "dataTrackStreamEvent"; + } | { + /** + * Encoded video + * + * @generated from field: livekit.proto.EncodedVideoSourceEvent encoded_video_source_event = 44; + */ + value: EncodedVideoSourceEvent; + case: "encodedVideoSourceEvent"; } | { case: undefined; value?: undefined }; constructor(data?: PartialMessage); diff --git a/livekit-ffi-node-bindings/proto/ffi_pb.js b/livekit-ffi-node-bindings/proto/ffi_pb.js index 22727f9b7..90cd7ab89 100644 --- a/livekit-ffi-node-bindings/proto/ffi_pb.js +++ b/livekit-ffi-node-bindings/proto/ffi_pb.js @@ -23,7 +23,7 @@ Object.defineProperty(exports, "__esModule", { value: true }); const { proto2 } = require("@bufbuild/protobuf"); const { ConnectCallback, ConnectRequest, ConnectResponse, DisconnectCallback, DisconnectRequest, DisconnectResponse, EditChatMessageRequest, GetSessionStatsCallback, GetSessionStatsRequest, GetSessionStatsResponse, PublishDataCallback, PublishDataRequest, PublishDataResponse, PublishSipDtmfCallback, PublishSipDtmfRequest, PublishSipDtmfResponse, PublishTrackCallback, PublishTrackRequest, PublishTrackResponse, PublishTranscriptionCallback, PublishTranscriptionRequest, PublishTranscriptionResponse, RoomEvent, SendChatMessageCallback, SendChatMessageRequest, SendChatMessageResponse, SendStreamChunkCallback, SendStreamChunkRequest, SendStreamChunkResponse, SendStreamHeaderCallback, SendStreamHeaderRequest, SendStreamHeaderResponse, SendStreamTrailerCallback, 
SendStreamTrailerRequest, SendStreamTrailerResponse, SetDataChannelBufferedAmountLowThresholdRequest, SetDataChannelBufferedAmountLowThresholdResponse, SetLocalAttributesCallback, SetLocalAttributesRequest, SetLocalAttributesResponse, SetLocalMetadataCallback, SetLocalMetadataRequest, SetLocalMetadataResponse, SetLocalNameCallback, SetLocalNameRequest, SetLocalNameResponse, SetSubscribedRequest, SetSubscribedResponse, UnpublishTrackCallback, UnpublishTrackRequest, UnpublishTrackResponse } = require("./room_pb.js"); const { CreateAudioTrackRequest, CreateAudioTrackResponse, CreateVideoTrackRequest, CreateVideoTrackResponse, EnableRemoteTrackRequest, EnableRemoteTrackResponse, GetStatsCallback, GetStatsRequest, GetStatsResponse, LocalTrackMuteRequest, LocalTrackMuteResponse, SetTrackSubscriptionPermissionsRequest, SetTrackSubscriptionPermissionsResponse, TrackEvent } = require("./track_pb.js"); -const { CaptureVideoFrameRequest, CaptureVideoFrameResponse, NewVideoSourceRequest, NewVideoSourceResponse, NewVideoStreamRequest, NewVideoStreamResponse, VideoConvertRequest, VideoConvertResponse, VideoStreamEvent, VideoStreamFromParticipantRequest, VideoStreamFromParticipantResponse } = require("./video_frame_pb.js"); +const { CaptureEncodedVideoFrameRequest, CaptureEncodedVideoFrameResponse, CaptureVideoFrameRequest, CaptureVideoFrameResponse, EncodedVideoSourceEvent, NewVideoSourceRequest, NewVideoSourceResponse, NewVideoStreamRequest, NewVideoStreamResponse, VideoConvertRequest, VideoConvertResponse, VideoStreamEvent, VideoStreamFromParticipantRequest, VideoStreamFromParticipantResponse } = require("./video_frame_pb.js"); const { ApmProcessReverseStreamRequest, ApmProcessReverseStreamResponse, ApmProcessStreamRequest, ApmProcessStreamResponse, ApmSetStreamDelayRequest, ApmSetStreamDelayResponse, AudioStreamEvent, AudioStreamFromParticipantRequest, AudioStreamFromParticipantResponse, CaptureAudioFrameCallback, CaptureAudioFrameRequest, CaptureAudioFrameResponse, 
ClearAudioBufferRequest, ClearAudioBufferResponse, FlushSoxResamplerRequest, FlushSoxResamplerResponse, LoadAudioFilterPluginRequest, LoadAudioFilterPluginResponse, NewApmRequest, NewApmResponse, NewAudioResamplerRequest, NewAudioResamplerResponse, NewAudioSourceRequest, NewAudioSourceResponse, NewAudioStreamRequest, NewAudioStreamResponse, NewSoxResamplerRequest, NewSoxResamplerResponse, PushSoxResamplerRequest, PushSoxResamplerResponse, RemixAndResampleRequest, RemixAndResampleResponse } = require("./audio_frame_pb.js"); const { E2eeRequest, E2eeResponse } = require("./e2ee_pb.js"); const { PerformRpcCallback, PerformRpcRequest, PerformRpcResponse, RegisterRpcMethodRequest, RegisterRpcMethodResponse, RpcMethodInvocationEvent, RpcMethodInvocationResponseRequest, RpcMethodInvocationResponseResponse, UnregisterRpcMethodRequest, UnregisterRpcMethodResponse } = require("./rpc_pb.js"); @@ -128,6 +128,7 @@ const FfiRequest = /*@__PURE__*/ proto2.makeMessageType( { no: 73, name: "subscribe_data_track", kind: "message", T: SubscribeDataTrackRequest, oneof: "message" }, { no: 74, name: "remote_data_track_is_published", kind: "message", T: RemoteDataTrackIsPublishedRequest, oneof: "message" }, { no: 75, name: "data_track_stream_read", kind: "message", T: DataTrackStreamReadRequest, oneof: "message" }, + { no: 76, name: "capture_encoded_video_frame", kind: "message", T: CaptureEncodedVideoFrameRequest, oneof: "message" }, ], ); @@ -212,6 +213,7 @@ const FfiResponse = /*@__PURE__*/ proto2.makeMessageType( { no: 72, name: "subscribe_data_track", kind: "message", T: SubscribeDataTrackResponse, oneof: "message" }, { no: 73, name: "remote_data_track_is_published", kind: "message", T: RemoteDataTrackIsPublishedResponse, oneof: "message" }, { no: 74, name: "data_track_stream_read", kind: "message", T: DataTrackStreamReadResponse, oneof: "message" }, + { no: 75, name: "capture_encoded_video_frame", kind: "message", T: CaptureEncodedVideoFrameResponse, oneof: "message" }, ], ); @@ 
-267,6 +269,7 @@ const FfiEvent = /*@__PURE__*/ proto2.makeMessageType( { no: 41, name: "send_bytes", kind: "message", T: StreamSendBytesCallback, oneof: "message" }, { no: 42, name: "publish_data_track", kind: "message", T: PublishDataTrackCallback, oneof: "message" }, { no: 43, name: "data_track_stream_event", kind: "message", T: DataTrackStreamEvent, oneof: "message" }, + { no: 44, name: "encoded_video_source_event", kind: "message", T: EncodedVideoSourceEvent, oneof: "message" }, ], ); diff --git a/livekit-ffi-node-bindings/proto/video_frame_pb.d.ts b/livekit-ffi-node-bindings/proto/video_frame_pb.d.ts index e0ec12f19..06bb93451 100644 --- a/livekit-ffi-node-bindings/proto/video_frame_pb.d.ts +++ b/livekit-ffi-node-bindings/proto/video_frame_pb.d.ts @@ -168,6 +168,14 @@ export declare enum VideoSourceType { * @generated from enum value: VIDEO_SOURCE_NATIVE = 0; */ VIDEO_SOURCE_NATIVE = 0, + + /** + * A source that accepts encoded (compressed) frames. WebRTC's internal + * encoder is bypassed for tracks bound to this source. + * + * @generated from enum value: VIDEO_SOURCE_ENCODED = 1; + */ + VIDEO_SOURCE_ENCODED = 1, } /** @@ -363,6 +371,14 @@ export declare class NewVideoSourceRequest extends Message); static readonly runtime: typeof proto2; @@ -469,6 +485,101 @@ export declare class CaptureVideoFrameResponse extends Message | undefined, b: CaptureVideoFrameResponse | PlainMessage | undefined): boolean; } +/** + * Push an encoded (compressed) frame to an encoded VideoSource. + * The source must have been created with type == VIDEO_SOURCE_ENCODED. + * + * @generated from message livekit.proto.CaptureEncodedVideoFrameRequest + */ +export declare class CaptureEncodedVideoFrameRequest extends Message { + /** + * @generated from field: required uint64 source_handle = 1; + */ + sourceHandle?: bigint; + + /** + * Raw encoded bitstream (e.g. NAL units for H.264/H.265, VP8/VP9/AV1 + * OBU payload). Must be a complete access unit / picture. 
+ * + * @generated from field: required bytes data = 2; + */ + data?: Uint8Array; + + /** + * @generated from field: required bool is_keyframe = 3; + */ + isKeyframe?: boolean; + + /** + * H.264/H.265 only: set when SPS/PPS (or VPS/SPS/PPS) is already + * prepended to `data`. Ignored for other codecs. + * + * @generated from field: optional bool has_sps_pps = 4; + */ + hasSpsPps?: boolean; + + /** + * Frame resolution. 0/0 means "use the resolution from + * EncodedVideoSourceOptions". + * + * @generated from field: optional uint32 width = 5; + */ + width?: number; + + /** + * @generated from field: optional uint32 height = 6; + */ + height?: number; + + /** + * Capture timestamp in microseconds. 0 lets the source stamp `now`. + * + * @generated from field: optional int64 capture_time_us = 7; + */ + captureTimeUs?: bigint; + + constructor(data?: PartialMessage); + + static readonly runtime: typeof proto2; + static readonly typeName = "livekit.proto.CaptureEncodedVideoFrameRequest"; + static readonly fields: FieldList; + + static fromBinary(bytes: Uint8Array, options?: Partial): CaptureEncodedVideoFrameRequest; + + static fromJson(jsonValue: JsonValue, options?: Partial): CaptureEncodedVideoFrameRequest; + + static fromJsonString(jsonString: string, options?: Partial): CaptureEncodedVideoFrameRequest; + + static equals(a: CaptureEncodedVideoFrameRequest | PlainMessage | undefined, b: CaptureEncodedVideoFrameRequest | PlainMessage | undefined): boolean; +} + +/** + * @generated from message livekit.proto.CaptureEncodedVideoFrameResponse + */ +export declare class CaptureEncodedVideoFrameResponse extends Message { + /** + * True if the frame was queued; false if it was dropped because the + * internal queue was full. 
+ * + * @generated from field: required bool accepted = 1; + */ + accepted?: boolean; + + constructor(data?: PartialMessage); + + static readonly runtime: typeof proto2; + static readonly typeName = "livekit.proto.CaptureEncodedVideoFrameResponse"; + static readonly fields: FieldList; + + static fromBinary(bytes: Uint8Array, options?: Partial): CaptureEncodedVideoFrameResponse; + + static fromJson(jsonValue: JsonValue, options?: Partial): CaptureEncodedVideoFrameResponse; + + static fromJsonString(jsonString: string, options?: Partial): CaptureEncodedVideoFrameResponse; + + static equals(a: CaptureEncodedVideoFrameResponse | PlainMessage | undefined, b: CaptureEncodedVideoFrameResponse | PlainMessage | undefined): boolean; +} + /** * @generated from message livekit.proto.VideoConvertRequest */ @@ -908,6 +1019,13 @@ export declare class VideoSourceInfo extends Message { */ type?: VideoSourceType; + /** + * Only populated for encoded sources. Exposed for debugging / tracing. + * + * @generated from field: optional uint32 encoded_source_id = 2; + */ + encodedSourceId?: number; + constructor(data?: PartialMessage); static readonly runtime: typeof proto2; @@ -952,3 +1070,124 @@ export declare class OwnedVideoSource extends Message { static equals(a: OwnedVideoSource | PlainMessage | undefined, b: OwnedVideoSource | PlainMessage | undefined): boolean; } +/** + * Options for an encoded video source. One source carries a single encoded + * stream (one resolution, one codec). To simulcast, create multiple sources + * and publish them on separate tracks. 
+ * + * @generated from message livekit.proto.EncodedVideoSourceOptions + */ +export declare class EncodedVideoSourceOptions extends Message { + /** + * @generated from field: required livekit.proto.VideoCodec codec = 1; + */ + codec?: VideoCodec; + + constructor(data?: PartialMessage); + + static readonly runtime: typeof proto2; + static readonly typeName = "livekit.proto.EncodedVideoSourceOptions"; + static readonly fields: FieldList; + + static fromBinary(bytes: Uint8Array, options?: Partial): EncodedVideoSourceOptions; + + static fromJson(jsonValue: JsonValue, options?: Partial): EncodedVideoSourceOptions; + + static fromJsonString(jsonString: string, options?: Partial): EncodedVideoSourceOptions; + + static equals(a: EncodedVideoSourceOptions | PlainMessage | undefined, b: EncodedVideoSourceOptions | PlainMessage | undefined): boolean; +} + +/** + * Encoder-side feedback for an encoded video source. Emitted as FfiEvents + * so client SDKs can react (request a fresh keyframe from their encoder, + * adjust target bitrate, etc.). 
+ * + * @generated from message livekit.proto.EncodedVideoSourceEvent + */ +export declare class EncodedVideoSourceEvent extends Message { + /** + * @generated from field: required uint64 source_handle = 1; + */ + sourceHandle?: bigint; + + /** + * @generated from oneof livekit.proto.EncodedVideoSourceEvent.message + */ + message: { + /** + * @generated from field: livekit.proto.EncodedVideoSourceEvent.KeyframeRequested keyframe_requested = 2; + */ + value: EncodedVideoSourceEvent_KeyframeRequested; + case: "keyframeRequested"; + } | { + /** + * @generated from field: livekit.proto.EncodedVideoSourceEvent.TargetBitrateChanged target_bitrate_changed = 3; + */ + value: EncodedVideoSourceEvent_TargetBitrateChanged; + case: "targetBitrateChanged"; + } | { case: undefined; value?: undefined }; + + constructor(data?: PartialMessage); + + static readonly runtime: typeof proto2; + static readonly typeName = "livekit.proto.EncodedVideoSourceEvent"; + static readonly fields: FieldList; + + static fromBinary(bytes: Uint8Array, options?: Partial): EncodedVideoSourceEvent; + + static fromJson(jsonValue: JsonValue, options?: Partial): EncodedVideoSourceEvent; + + static fromJsonString(jsonString: string, options?: Partial): EncodedVideoSourceEvent; + + static equals(a: EncodedVideoSourceEvent | PlainMessage | undefined, b: EncodedVideoSourceEvent | PlainMessage | undefined): boolean; +} + +/** + * @generated from message livekit.proto.EncodedVideoSourceEvent.KeyframeRequested + */ +export declare class EncodedVideoSourceEvent_KeyframeRequested extends Message { + constructor(data?: PartialMessage); + + static readonly runtime: typeof proto2; + static readonly typeName = "livekit.proto.EncodedVideoSourceEvent.KeyframeRequested"; + static readonly fields: FieldList; + + static fromBinary(bytes: Uint8Array, options?: Partial): EncodedVideoSourceEvent_KeyframeRequested; + + static fromJson(jsonValue: JsonValue, options?: Partial): EncodedVideoSourceEvent_KeyframeRequested; + + 
static fromJsonString(jsonString: string, options?: Partial): EncodedVideoSourceEvent_KeyframeRequested; + + static equals(a: EncodedVideoSourceEvent_KeyframeRequested | PlainMessage | undefined, b: EncodedVideoSourceEvent_KeyframeRequested | PlainMessage | undefined): boolean; +} + +/** + * @generated from message livekit.proto.EncodedVideoSourceEvent.TargetBitrateChanged + */ +export declare class EncodedVideoSourceEvent_TargetBitrateChanged extends Message { + /** + * @generated from field: required uint32 bitrate_bps = 1; + */ + bitrateBps?: number; + + /** + * @generated from field: required double framerate_fps = 2; + */ + framerateFps?: number; + + constructor(data?: PartialMessage); + + static readonly runtime: typeof proto2; + static readonly typeName = "livekit.proto.EncodedVideoSourceEvent.TargetBitrateChanged"; + static readonly fields: FieldList; + + static fromBinary(bytes: Uint8Array, options?: Partial): EncodedVideoSourceEvent_TargetBitrateChanged; + + static fromJson(jsonValue: JsonValue, options?: Partial): EncodedVideoSourceEvent_TargetBitrateChanged; + + static fromJsonString(jsonString: string, options?: Partial): EncodedVideoSourceEvent_TargetBitrateChanged; + + static equals(a: EncodedVideoSourceEvent_TargetBitrateChanged | PlainMessage | undefined, b: EncodedVideoSourceEvent_TargetBitrateChanged | PlainMessage | undefined): boolean; +} + diff --git a/livekit-ffi-node-bindings/proto/video_frame_pb.js b/livekit-ffi-node-bindings/proto/video_frame_pb.js index 331320682..e76b6f4e5 100644 --- a/livekit-ffi-node-bindings/proto/video_frame_pb.js +++ b/livekit-ffi-node-bindings/proto/video_frame_pb.js @@ -93,6 +93,7 @@ const VideoSourceType = /*@__PURE__*/ proto2.makeEnum( "livekit.proto.VideoSourceType", [ {no: 0, name: "VIDEO_SOURCE_NATIVE"}, + {no: 1, name: "VIDEO_SOURCE_ENCODED"}, ], ); @@ -162,6 +163,7 @@ const NewVideoSourceRequest = /*@__PURE__*/ proto2.makeMessageType( { no: 1, name: "type", kind: "enum", T: 
proto2.getEnumType(VideoSourceType), req: true }, { no: 2, name: "resolution", kind: "message", T: VideoSourceResolution, req: true }, { no: 3, name: "is_screencast", kind: "scalar", T: 8 /* ScalarType.BOOL */, opt: true }, + { no: 4, name: "encoded_options", kind: "message", T: EncodedVideoSourceOptions, opt: true }, ], ); @@ -199,6 +201,35 @@ const CaptureVideoFrameResponse = /*@__PURE__*/ proto2.makeMessageType( [], ); +/** + * Push an encoded (compressed) frame to an encoded VideoSource. + * The source must have been created with type == VIDEO_SOURCE_ENCODED. + * + * @generated from message livekit.proto.CaptureEncodedVideoFrameRequest + */ +const CaptureEncodedVideoFrameRequest = /*@__PURE__*/ proto2.makeMessageType( + "livekit.proto.CaptureEncodedVideoFrameRequest", + () => [ + { no: 1, name: "source_handle", kind: "scalar", T: 4 /* ScalarType.UINT64 */, req: true }, + { no: 2, name: "data", kind: "scalar", T: 12 /* ScalarType.BYTES */, req: true }, + { no: 3, name: "is_keyframe", kind: "scalar", T: 8 /* ScalarType.BOOL */, req: true }, + { no: 4, name: "has_sps_pps", kind: "scalar", T: 8 /* ScalarType.BOOL */, opt: true }, + { no: 5, name: "width", kind: "scalar", T: 13 /* ScalarType.UINT32 */, opt: true }, + { no: 6, name: "height", kind: "scalar", T: 13 /* ScalarType.UINT32 */, opt: true }, + { no: 7, name: "capture_time_us", kind: "scalar", T: 3 /* ScalarType.INT64 */, opt: true }, + ], +); + +/** + * @generated from message livekit.proto.CaptureEncodedVideoFrameResponse + */ +const CaptureEncodedVideoFrameResponse = /*@__PURE__*/ proto2.makeMessageType( + "livekit.proto.CaptureEncodedVideoFrameResponse", + () => [ + { no: 1, name: "accepted", kind: "scalar", T: 8 /* ScalarType.BOOL */, req: true }, + ], +); + /** * @generated from message livekit.proto.VideoConvertRequest */ @@ -356,6 +387,7 @@ const VideoSourceInfo = /*@__PURE__*/ proto2.makeMessageType( "livekit.proto.VideoSourceInfo", () => [ { no: 1, name: "type", kind: "enum", T: 
proto2.getEnumType(VideoSourceType), req: true }, + { no: 2, name: "encoded_source_id", kind: "scalar", T: 13 /* ScalarType.UINT32 */, opt: true }, ], ); @@ -370,6 +402,57 @@ const OwnedVideoSource = /*@__PURE__*/ proto2.makeMessageType( ], ); +/** + * Options for an encoded video source. One source carries a single encoded + * stream (one resolution, one codec). To simulcast, create multiple sources + * and publish them on separate tracks. + * + * @generated from message livekit.proto.EncodedVideoSourceOptions + */ +const EncodedVideoSourceOptions = /*@__PURE__*/ proto2.makeMessageType( + "livekit.proto.EncodedVideoSourceOptions", + () => [ + { no: 1, name: "codec", kind: "enum", T: proto2.getEnumType(VideoCodec), req: true }, + ], +); + +/** + * Encoder-side feedback for an encoded video source. Emitted as FfiEvents + * so client SDKs can react (request a fresh keyframe from their encoder, + * adjust target bitrate, etc.). + * + * @generated from message livekit.proto.EncodedVideoSourceEvent + */ +const EncodedVideoSourceEvent = /*@__PURE__*/ proto2.makeMessageType( + "livekit.proto.EncodedVideoSourceEvent", + () => [ + { no: 1, name: "source_handle", kind: "scalar", T: 4 /* ScalarType.UINT64 */, req: true }, + { no: 2, name: "keyframe_requested", kind: "message", T: EncodedVideoSourceEvent_KeyframeRequested, oneof: "message" }, + { no: 3, name: "target_bitrate_changed", kind: "message", T: EncodedVideoSourceEvent_TargetBitrateChanged, oneof: "message" }, + ], +); + +/** + * @generated from message livekit.proto.EncodedVideoSourceEvent.KeyframeRequested + */ +const EncodedVideoSourceEvent_KeyframeRequested = /*@__PURE__*/ proto2.makeMessageType( + "livekit.proto.EncodedVideoSourceEvent.KeyframeRequested", + [], + {localName: "EncodedVideoSourceEvent_KeyframeRequested"}, +); + +/** + * @generated from message livekit.proto.EncodedVideoSourceEvent.TargetBitrateChanged + */ +const EncodedVideoSourceEvent_TargetBitrateChanged = /*@__PURE__*/ proto2.makeMessageType( + 
"livekit.proto.EncodedVideoSourceEvent.TargetBitrateChanged", + () => [ + { no: 1, name: "bitrate_bps", kind: "scalar", T: 13 /* ScalarType.UINT32 */, req: true }, + { no: 2, name: "framerate_fps", kind: "scalar", T: 1 /* ScalarType.DOUBLE */, req: true }, + ], + {localName: "EncodedVideoSourceEvent_TargetBitrateChanged"}, +); + exports.VideoCodec = VideoCodec; exports.VideoRotation = VideoRotation; @@ -384,6 +467,8 @@ exports.NewVideoSourceRequest = NewVideoSourceRequest; exports.NewVideoSourceResponse = NewVideoSourceResponse; exports.CaptureVideoFrameRequest = CaptureVideoFrameRequest; exports.CaptureVideoFrameResponse = CaptureVideoFrameResponse; +exports.CaptureEncodedVideoFrameRequest = CaptureEncodedVideoFrameRequest; +exports.CaptureEncodedVideoFrameResponse = CaptureEncodedVideoFrameResponse; exports.VideoConvertRequest = VideoConvertRequest; exports.VideoConvertResponse = VideoConvertResponse; exports.VideoResolution = VideoResolution; @@ -399,3 +484,7 @@ exports.VideoStreamEOS = VideoStreamEOS; exports.VideoSourceResolution = VideoSourceResolution; exports.VideoSourceInfo = VideoSourceInfo; exports.OwnedVideoSource = OwnedVideoSource; +exports.EncodedVideoSourceOptions = EncodedVideoSourceOptions; +exports.EncodedVideoSourceEvent = EncodedVideoSourceEvent; +exports.EncodedVideoSourceEvent_KeyframeRequested = EncodedVideoSourceEvent_KeyframeRequested; +exports.EncodedVideoSourceEvent_TargetBitrateChanged = EncodedVideoSourceEvent_TargetBitrateChanged; diff --git a/livekit-ffi/Cargo.toml b/livekit-ffi/Cargo.toml index 38e16a6e3..5d9851d8e 100644 --- a/livekit-ffi/Cargo.toml +++ b/livekit-ffi/Cargo.toml @@ -15,6 +15,7 @@ native-tls-vendored = ["livekit/native-tls-vendored"] rustls-tls-native-roots = ["livekit/rustls-tls-native-roots"] rustls-tls-webpki-roots = ["livekit/rustls-tls-webpki-roots"] __rustls-tls = ["livekit/__rustls-tls"] +encoded-video = ["livekit/encoded-video"] # Enable tokio-console to debug tasks tracing = ["tokio/tracing", 
"console-subscriber"] diff --git a/livekit-ffi/protocol/ffi.proto b/livekit-ffi/protocol/ffi.proto index b27a7b865..4b1377b5e 100644 --- a/livekit-ffi/protocol/ffi.proto +++ b/livekit-ffi/protocol/ffi.proto @@ -164,7 +164,10 @@ message FfiRequest { RemoteDataTrackIsPublishedRequest remote_data_track_is_published = 74; DataTrackStreamReadRequest data_track_stream_read = 75; - // NEXT_ID: 76 + // Encoded video + CaptureEncodedVideoFrameRequest capture_encoded_video_frame = 76; + + // NEXT_ID: 77 } } @@ -274,7 +277,10 @@ message FfiResponse { RemoteDataTrackIsPublishedResponse remote_data_track_is_published = 73; DataTrackStreamReadResponse data_track_stream_read = 74; - // NEXT_ID: 75 + // Encoded video + CaptureEncodedVideoFrameResponse capture_encoded_video_frame = 75; + + // NEXT_ID: 76 } } @@ -337,7 +343,10 @@ message FfiEvent { // Data Track (remote) DataTrackStreamEvent data_track_stream_event = 43; - // NEXT_ID: 44 + // Encoded video + EncodedVideoSourceEvent encoded_video_source_event = 44; + + // NEXT_ID: 45 } } diff --git a/livekit-ffi/protocol/video_frame.proto b/livekit-ffi/protocol/video_frame.proto index ff91fa3c6..e0adbd38e 100644 --- a/livekit-ffi/protocol/video_frame.proto +++ b/livekit-ffi/protocol/video_frame.proto @@ -68,6 +68,9 @@ message NewVideoSourceRequest { // Most of the time it corresponds to the source resolution required VideoSourceResolution resolution = 2; optional bool is_screencast = 3; + // When type == VIDEO_SOURCE_ENCODED this field MUST be set. It configures + // the passthrough encoder for the source (codec + initial resolution). + optional EncodedVideoSourceOptions encoded_options = 4; } message NewVideoSourceResponse { required OwnedVideoSource source = 1; } @@ -82,6 +85,31 @@ message CaptureVideoFrameRequest { message CaptureVideoFrameResponse {} +// Push an encoded (compressed) frame to an encoded VideoSource. +// The source must have been created with type == VIDEO_SOURCE_ENCODED. 
+message CaptureEncodedVideoFrameRequest { + required uint64 source_handle = 1; + // Raw encoded bitstream (e.g. NAL units for H.264/H.265, VP8/VP9/AV1 + // OBU payload). Must be a complete access unit / picture. + required bytes data = 2; + required bool is_keyframe = 3; + // H.264/H.265 only: set when SPS/PPS (or VPS/SPS/PPS) is already + // prepended to `data`. Ignored for other codecs. + optional bool has_sps_pps = 4; + // Frame resolution. 0/0 means "use the resolution from + // EncodedVideoSourceOptions". + optional uint32 width = 5; + optional uint32 height = 6; + // Capture timestamp in microseconds. 0 lets the source stamp `now`. + optional int64 capture_time_us = 7; +} + +message CaptureEncodedVideoFrameResponse { + // True if the frame was queued; false if it was dropped because the + // internal queue was full. + required bool accepted = 1; +} + message VideoConvertRequest { optional bool flip_y = 1; required VideoBufferInfo buffer = 2; @@ -206,13 +234,43 @@ message VideoSourceResolution { enum VideoSourceType { VIDEO_SOURCE_NATIVE = 0; + // A source that accepts encoded (compressed) frames. WebRTC's internal + // encoder is bypassed for tracks bound to this source. + VIDEO_SOURCE_ENCODED = 1; } message VideoSourceInfo { required VideoSourceType type = 1; + // Only populated for encoded sources. Exposed for debugging / tracing. + optional uint32 encoded_source_id = 2; } message OwnedVideoSource { required FfiOwnedHandle handle = 1; required VideoSourceInfo info = 2; } + +// Options for an encoded video source. One source carries a single encoded +// stream (one resolution, one codec). To simulcast, create multiple sources +// and publish them on separate tracks. +message EncodedVideoSourceOptions { + required VideoCodec codec = 1; +} + +// Encoder-side feedback for an encoded video source. Emitted as FfiEvents +// so client SDKs can react (request a fresh keyframe from their encoder, +// adjust target bitrate, etc.). 
+message EncodedVideoSourceEvent { + required uint64 source_handle = 1; + oneof message { + KeyframeRequested keyframe_requested = 2; + TargetBitrateChanged target_bitrate_changed = 3; + } + + message KeyframeRequested {} + + message TargetBitrateChanged { + required uint32 bitrate_bps = 1; + required double framerate_fps = 2; + } +} diff --git a/livekit-ffi/src/conversion/video_frame.rs b/livekit-ffi/src/conversion/video_frame.rs index 783950b61..cd836606d 100644 --- a/livekit-ffi/src/conversion/video_frame.rs +++ b/livekit-ffi/src/conversion/video_frame.rs @@ -30,7 +30,10 @@ impl From for VideoSourceResolution { impl From<&FfiVideoSource> for proto::VideoSourceInfo { fn from(source: &FfiVideoSource) -> Self { - Self { r#type: source.source_type as i32 } + Self { + r#type: source.source_type as i32, + encoded_source_id: source.encoded_source_id().map(|id| id as u32), + } } } diff --git a/livekit-ffi/src/server/requests.rs b/livekit-ffi/src/server/requests.rs index e27a54168..5566f2681 100644 --- a/livekit-ffi/src/server/requests.rs +++ b/livekit-ffi/src/server/requests.rs @@ -24,8 +24,7 @@ use livekit::{ use parking_lot::Mutex; use super::{ - audio_source, audio_stream, colorcvt, data_stream, data_track, - participant::FfiParticipant, + audio_source, audio_stream, colorcvt, data_stream, data_track, participant::FfiParticipant, resampler, room::{self, FfiPublication, FfiTrack}, video_source, video_stream, FfiError, FfiResult, FfiServer, @@ -474,6 +473,15 @@ unsafe fn on_capture_video_frame( Ok(proto::CaptureVideoFrameResponse::default()) } +/// Push an encoded (compressed) frame to a VIDEO_SOURCE_ENCODED source. 
+fn on_capture_encoded_video_frame( + server: &'static FfiServer, + push: proto::CaptureEncodedVideoFrameRequest, +) -> FfiResult { + let source = server.retrieve_handle::(push.source_handle)?; + source.capture_encoded_frame(server, push) +} + /// Convert a video frame /// /// # Safety: The user must ensure that the pointers/len provided are valid @@ -1294,6 +1302,9 @@ pub fn handle_request( } Request::NewVideoSource(req) => on_new_video_source(server, req)?.into(), Request::CaptureVideoFrame(req) => unsafe { on_capture_video_frame(server, req)?.into() }, + Request::CaptureEncodedVideoFrame(req) => { + on_capture_encoded_video_frame(server, req)?.into() + } Request::VideoConvert(req) => unsafe { on_video_convert(server, req)?.into() }, Request::NewAudioStream(req) => on_new_audio_stream(server, req)?.into(), Request::NewAudioSource(req) => on_new_audio_source(server, req)?.into(), diff --git a/livekit-ffi/src/server/room.rs b/livekit-ffi/src/server/room.rs index 4c120d3bc..0156698d5 100644 --- a/livekit-ffi/src/server/room.rs +++ b/livekit-ffi/src/server/room.rs @@ -325,6 +325,10 @@ impl FfiRoom { } impl RoomInner { + pub(crate) fn mark_local_publish_callback_sent(&self, sid: TrackSid) { + self.pending_published_tracks.lock().insert(sid); + } + pub fn publish_data( &self, server: &'static FfiServer, @@ -490,7 +494,7 @@ impl RoomInner { .into(), ); - inner.pending_published_tracks.lock().insert(publication.sid()); + inner.mark_local_publish_callback_sent(publication.sid()); } Err(err) => { // Failed to publish the track diff --git a/livekit-ffi/src/server/video_source.rs b/livekit-ffi/src/server/video_source.rs index 047443728..27f0cd9a1 100644 --- a/livekit-ffi/src/server/video_source.rs +++ b/livekit-ffi/src/server/video_source.rs @@ -1,4 +1,4 @@ -// Copyright 2025 LiveKit, Inc. +// Copyright 2026 LiveKit, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -12,13 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -use super::{colorcvt, FfiHandle}; -use crate::{proto, server, FfiError, FfiHandleId, FfiResult}; +#[cfg(all(not(target_arch = "wasm32"), feature = "encoded-video"))] +use std::sync::Arc; + use livekit::webrtc::{ prelude::*, video_frame::{FrameMetadata, VideoFrame}, }; +use super::{colorcvt, FfiHandle}; +use crate::{proto, server, FfiError, FfiHandleId, FfiResult}; + pub struct FfiVideoSource { pub handle_id: FfiHandleId, pub source_type: proto::VideoSourceType, @@ -36,12 +40,63 @@ fn frame_metadata_from_proto(metadata: Option) -> Option livekit::webrtc::video_source::VideoCodec { + use livekit::webrtc::video_source::VideoCodec; + match codec { + proto::VideoCodec::H264 => VideoCodec::H264, + proto::VideoCodec::H265 => VideoCodec::H265, + proto::VideoCodec::Vp8 => VideoCodec::Vp8, + proto::VideoCodec::Vp9 => VideoCodec::Vp9, + proto::VideoCodec::Av1 => VideoCodec::Av1, + } +} + +/// Forwards encoder-side feedback from the native source out to the FFI +/// client as `EncodedVideoSourceEvent`s. 
+#[cfg(all(not(target_arch = "wasm32"), feature = "encoded-video"))] +struct EncodedObserverBridge { + server: &'static server::FfiServer, + source_handle: u64, +} + +#[cfg(all(not(target_arch = "wasm32"), feature = "encoded-video"))] +impl livekit::webrtc::video_source::native::EncodedVideoSourceObserver for EncodedObserverBridge { + fn on_keyframe_requested(&self) { + let _ = self.server.send_event( + proto::EncodedVideoSourceEvent { + source_handle: self.source_handle, + message: Some(proto::encoded_video_source_event::Message::KeyframeRequested( + proto::encoded_video_source_event::KeyframeRequested {}, + )), + } + .into(), + ); + } + + fn on_target_bitrate(&self, bitrate_bps: u32, framerate_fps: f64) { + let _ = self.server.send_event( + proto::EncodedVideoSourceEvent { + source_handle: self.source_handle, + message: Some(proto::encoded_video_source_event::Message::TargetBitrateChanged( + proto::encoded_video_source_event::TargetBitrateChanged { + bitrate_bps, + framerate_fps, + }, + )), + } + .into(), + ); + } +} + impl FfiVideoSource { pub fn setup( server: &'static server::FfiServer, new_source: proto::NewVideoSourceRequest, ) -> FfiResult { let source_type = new_source.r#type(); + let handle_id = server.next_id(); #[allow(unreachable_patterns)] let source_inner = match source_type { #[cfg(not(target_arch = "wasm32"))] @@ -53,10 +108,41 @@ impl FfiVideoSource { NativeVideoSource::new(new_source.resolution.into(), is_screencast); RtcVideoSource::Native(video_source) } + #[cfg(all(not(target_arch = "wasm32"), feature = "encoded-video"))] + proto::VideoSourceType::VideoSourceEncoded => { + use livekit::webrtc::video_source::{ + native::NativeEncodedVideoSource, VideoResolution, + }; + + let options = new_source.encoded_options.as_ref().ok_or_else(|| { + FfiError::InvalidRequest( + "encoded_options is required for VIDEO_SOURCE_ENCODED".into(), + ) + })?; + + let codec = video_codec_from_proto(options.codec()); + let resolution = VideoResolution { + width: 
new_source.resolution.width, + height: new_source.resolution.height, + }; + let source = NativeEncodedVideoSource::new(codec, resolution); + + source.set_observer(Arc::new(EncodedObserverBridge { + server, + source_handle: handle_id, + })); + + RtcVideoSource::Encoded(source) + } + #[cfg(any(target_arch = "wasm32", not(feature = "encoded-video")))] + proto::VideoSourceType::VideoSourceEncoded => { + return Err(FfiError::InvalidRequest( + "Encoded video source support is not enabled".into(), + )); + } _ => return Err(FfiError::InvalidRequest("unsupported video source type".into())), }; - let handle_id = server.next_id(); let video_source = Self { handle_id, source_type, source: source_inner }; let source_info = proto::VideoSourceInfo::from(&video_source); server.store_handle(handle_id, video_source); @@ -67,6 +153,16 @@ impl FfiVideoSource { }) } + /// Returns the unique 16-bit id assigned to an encoded source by the + /// WebRTC layer. `None` for non-encoded sources. + pub fn encoded_source_id(&self) -> Option { + #[cfg(all(not(target_arch = "wasm32"), feature = "encoded-video"))] + if let RtcVideoSource::Encoded(ref source) = self.source { + return Some(source.source_id()); + } + None + } + pub unsafe fn capture_frame( &self, _server: &'static server::FfiServer, @@ -85,10 +181,47 @@ impl FfiVideoSource { source.capture_frame(&frame); } + #[cfg(all(not(target_arch = "wasm32"), feature = "encoded-video"))] + RtcVideoSource::Encoded(_) => { + return Err(FfiError::InvalidRequest( + "capture_video_frame is not supported for encoded sources; \ + use capture_encoded_video_frame instead" + .into(), + )); + } _ => {} } Ok(()) } + + pub fn capture_encoded_frame( + &self, + _server: &'static server::FfiServer, + capture: proto::CaptureEncodedVideoFrameRequest, + ) -> FfiResult { + #[cfg(any(target_arch = "wasm32", not(feature = "encoded-video")))] + let _ = &capture; + + match self.source { + #[cfg(all(not(target_arch = "wasm32"), feature = "encoded-video"))] + 
RtcVideoSource::Encoded(ref source) => { + use livekit::webrtc::video_source::EncodedFrameInfo; + + let info = EncodedFrameInfo { + is_keyframe: capture.is_keyframe, + has_sps_pps: capture.has_sps_pps.unwrap_or(false), + width: capture.width.unwrap_or(0), + height: capture.height.unwrap_or(0), + capture_time_us: capture.capture_time_us.unwrap_or(0), + }; + let accepted = source.capture_frame(&capture.data, &info); + Ok(proto::CaptureEncodedVideoFrameResponse { accepted }) + } + _ => Err(FfiError::InvalidRequest( + "capture_encoded_video_frame requires a VIDEO_SOURCE_ENCODED source".into(), + )), + } + } } #[cfg(test)] diff --git a/livekit/Cargo.toml b/livekit/Cargo.toml index 8022f8d03..51962e822 100644 --- a/livekit/Cargo.toml +++ b/livekit/Cargo.toml @@ -13,6 +13,7 @@ default = ["tokio"] async = ["livekit-api/signal-client-async"] tokio = ["livekit-api/signal-client-tokio"] dispatcher = ["livekit-api/signal-client-dispatcher"] +encoded-video = ["libwebrtc/encoded-video"] # Note that the following features only change the behavior of tokio-tungstenite. 
@@ -34,7 +35,7 @@ livekit-datatrack = { workspace = true } prost = "0.12" serde = { version = "1", features = ["derive"] } serde_json = "1.0" -tokio = { version = "1", default-features = false, features = ["sync", "macros", "fs"] } +tokio = { version = "1", default-features = false, features = ["sync", "macros", "fs", "net", "io-util", "time"] } parking_lot = { version = "0.12" } futures-util = { version = "0.3", default-features = false, features = ["sink"] } thiserror = "1.0" diff --git a/livekit/src/room/options.rs b/livekit/src/room/options.rs index 2fbb79f19..675f2832d 100644 --- a/livekit/src/room/options.rs +++ b/livekit/src/room/options.rs @@ -38,6 +38,19 @@ impl VideoCodec { } } +#[cfg(feature = "encoded-video")] +impl From for VideoCodec { + fn from(codec: libwebrtc::video_source::VideoCodec) -> Self { + match codec { + libwebrtc::video_source::VideoCodec::H264 => VideoCodec::H264, + libwebrtc::video_source::VideoCodec::H265 => VideoCodec::H265, + libwebrtc::video_source::VideoCodec::Vp8 => VideoCodec::VP8, + libwebrtc::video_source::VideoCodec::Vp9 => VideoCodec::VP9, + libwebrtc::video_source::VideoCodec::Av1 => VideoCodec::AV1, + } + } +} + #[derive(Debug, Clone)] pub struct VideoResolution { pub width: u32, diff --git a/livekit/src/room/participant/local_participant.rs b/livekit/src/room/participant/local_participant.rs index 1053abde5..eebdf8528 100644 --- a/livekit/src/room/participant/local_participant.rs +++ b/livekit/src/room/participant/local_participant.rs @@ -302,6 +302,36 @@ impl LocalParticipant { track: LocalTrack, options: TrackPublishOptions, ) -> RoomResult { + // Encoded video sources deliver encoded single-layer frames. + // Force-disable simulcast and pin the negotiated codec to the + // source's codec so WebRTC's encoder factory picks our passthrough + // encoder path. 
+ #[cfg(all(not(target_arch = "wasm32"), feature = "encoded-video"))] + let options = { + let mut publish_options = options; + if let LocalTrack::Video(ref video_track) = track { + if let RtcVideoSource::Encoded(ref encoded_source) = video_track.rtc_source() { + let source_codec: options::VideoCodec = encoded_source.codec().into(); + if publish_options.video_codec != source_codec { + log::warn!( + "publish_track: overriding video_codec {:?} -> {:?} to match encoded source", + publish_options.video_codec, + source_codec + ); + publish_options.video_codec = source_codec; + } + if publish_options.simulcast { + log::warn!( + "publish_track: disabling simulcast for encoded video source (single-layer only)" + ); + publish_options.simulcast = false; + } + } + } + publish_options + }; + #[cfg(any(target_arch = "wasm32", not(feature = "encoded-video")))] + let options = options; let disable_red = self.local.encryption_type != EncryptionType::None || !options.red; let mut req = proto::AddTrackRequest { diff --git a/livekit/src/room/track/local_video_track.rs b/livekit/src/room/track/local_video_track.rs index 0da8f683c..a40a2a2b2 100644 --- a/livekit/src/room/track/local_video_track.rs +++ b/livekit/src/room/track/local_video_track.rs @@ -61,6 +61,14 @@ impl LocalVideoTrack { .pc_factory() .create_video_track(&libwebrtc::native::create_random_uuid(), native_source) } + #[cfg(all(not(target_arch = "wasm32"), feature = "encoded-video"))] + RtcVideoSource::Encoded(encoded_source) => { + use libwebrtc::peer_connection_factory::native::PeerConnectionFactoryExt; + LkRuntime::instance().pc_factory().create_video_track_from_encoded_source( + &libwebrtc::native::create_random_uuid(), + encoded_source, + ) + } _ => panic!("unsupported video source"), }; diff --git a/webrtc-sys/Cargo.toml b/webrtc-sys/Cargo.toml index d1bbd8bdc..e44681d7b 100644 --- a/webrtc-sys/Cargo.toml +++ b/webrtc-sys/Cargo.toml @@ -9,6 +9,7 @@ repository.workspace = true [features] default = [] +encoded-video = 
[] [dependencies] cxx = "1.0" diff --git a/webrtc-sys/build.rs b/webrtc-sys/build.rs index 072794ecf..3917a1f4a 100644 --- a/webrtc-sys/build.rs +++ b/webrtc-sys/build.rs @@ -24,9 +24,19 @@ fn main() { let target_os = env::var("CARGO_CFG_TARGET_OS").unwrap(); let target_arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap(); let is_desktop = target_os == "linux" || target_os == "windows" || target_os == "macos"; + let encoded_video = env::var("CARGO_FEATURE_ENCODED_VIDEO").is_ok(); println!("cargo:rerun-if-env-changed=LK_DEBUG_WEBRTC"); println!("cargo:rerun-if-env-changed=LK_CUSTOM_WEBRTC"); + println!("cargo:rustc-check-cfg=cfg(encoded_video)"); + + if encoded_video { + // cxx_build evaluates cfgs from the build-script environment. Cargo + // exposes `encoded-video` as CARGO_FEATURE_ENCODED_VIDEO, which does + // not match `feature = "encoded-video"` in cxx's cfg evaluator. + println!("cargo:rustc-cfg=encoded_video"); + env::set_var("CARGO_CFG_ENCODED_VIDEO", "1"); + } let mut rust_files = vec![ "src/peer_connection.rs", @@ -57,6 +67,10 @@ fn main() { "src/packet_trailer.rs", ]; + if encoded_video { + rust_files.push("src/encoded_video_source.rs"); + } + if is_desktop { rust_files.push("src/desktop_capturer.rs"); } @@ -93,6 +107,13 @@ fn main() { "src/packet_trailer.cpp", ]); + if encoded_video { + builder + .file("src/encoded_video_source.cpp") + .file("src/passthrough_video_encoder.cpp") + .define("LK_PRE_ENCODED_VIDEO", "1"); + } + if is_desktop { builder.file("src/desktop_capturer.cpp"); } diff --git a/webrtc-sys/include/livekit/encoded_video_source.h b/webrtc-sys/include/livekit/encoded_video_source.h new file mode 100644 index 000000000..c1c8dc664 --- /dev/null +++ b/webrtc-sys/include/livekit/encoded_video_source.h @@ -0,0 +1,168 @@ +/* + * Copyright 2026 LiveKit, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "api/media_stream_interface.h" +#include "api/scoped_refptr.h" +#include "api/video/video_frame.h" +#include "media/base/adapted_video_track_source.h" +#include "rtc_base/synchronization/mutex.h" +#include "rust/cxx.h" + +namespace livekit_ffi { + +class EncodedVideoTrackSource; +class EncodedVideoSourceWrapper; + +} // namespace livekit_ffi + +#include "webrtc-sys/src/encoded_video_source.rs.h" + +namespace livekit_ffi { + +// Process-global registry that maps a 16-bit source id (stamped on every +// dummy VideoFrame via VideoFrame::set_id) to the owning encoded source. +// +// This is the mechanism the LazyVideoEncoder uses to decide whether to +// instantiate a PassthroughVideoEncoder or a real encoder on the first +// Encode() call. Keying on VideoFrame::id() (rather than codec name) ensures +// per-track routing is correct even when multiple encoded sources share a +// codec. +class EncodedSourceRegistry { + public: + static EncodedSourceRegistry& instance(); + + // Returns a new non-zero u16 id, skipping any id currently in use. + uint16_t allocate_id(); + + void register_source(uint16_t id, EncodedVideoTrackSource* src); + void unregister_source(uint16_t id); + EncodedVideoTrackSource* lookup(uint16_t id); + + private: + EncodedSourceRegistry() = default; + + std::mutex mu_; + std::unordered_map map_; + uint32_t next_id_ = 1; +}; + +// Owns a single encoded video feed. 
The paired PassthroughVideoEncoder pops +// frames from this source via the registry (looked up by VideoFrame::id()). +class EncodedVideoTrackSource { + public: + class InternalSource : public webrtc::AdaptedVideoTrackSource { + public: + InternalSource(uint16_t source_id, + EncodedVideoCodecType codec, + uint32_t width, + uint32_t height); + ~InternalSource() override; + + bool is_screencast() const override { return false; } + std::optional needs_denoising() const override { return std::nullopt; } + SourceState state() const override { return kLive; } + bool remote() const override { return false; } + + uint16_t source_id() const { return source_id_; } + EncodedVideoCodecType codec() const { return codec_; } + + // Enqueues the encoded bytes and pushes one dummy VideoFrame into the + // WebRTC pipeline so the encoder tick fires. Returns false if the frame + // was dropped because the queue was full and the frame was not a keyframe. + bool push_encoded_frame(std::vector data, + bool is_keyframe, + bool has_sps_pps, + uint32_t width, + uint32_t height, + int64_t capture_time_us); + + struct DequeuedFrame { + std::vector data; + bool is_keyframe = false; + bool has_sps_pps = false; + uint32_t width = 0; + uint32_t height = 0; + int64_t capture_time_us = 0; + }; + bool pop_encoded_frame(DequeuedFrame& out); + + // Wired into PassthroughVideoEncoder::Encode / SetRates so the Rust + // producer can react to PLI/FIR and congestion control. + void notify_keyframe_requested(); + void notify_target_bitrate(uint32_t bitrate_bps, double framerate_fps); + + void set_observer(rust::Box observer); + + private: + const uint16_t source_id_; + const EncodedVideoCodecType codec_; + + mutable webrtc::Mutex mutex_; + std::deque queue_; + uint32_t width_; + uint32_t height_; + std::unique_ptr> observer_; + + // Cached H.264/H.265 parameter sets, each with a leading 4-byte Annex-B + // start code. Populated by scanning incoming keyframes. 
Prepended to + // later keyframes that arrive without inline parameter sets. + // + // For H.264: vps is unused. For H.265: all three are typically present. + std::vector cached_vps_; + std::vector cached_sps_; + std::vector cached_pps_; + + static constexpr size_t kMaxQueueSize = 8; + }; + + EncodedVideoTrackSource(EncodedVideoCodecType codec, + uint32_t width, + uint32_t height); + ~EncodedVideoTrackSource(); + + uint16_t source_id() const { return source_->source_id(); } + EncodedVideoCodecType codec() const { return source_->codec(); } + + bool capture_frame(rust::Slice data, + bool is_keyframe, + bool has_sps_pps, + uint32_t width, + uint32_t height, + int64_t capture_time_us) const; + + void set_observer(rust::Box observer) const; + + webrtc::scoped_refptr get() const { return source_; } + + private: + webrtc::scoped_refptr source_; +}; + +std::shared_ptr new_encoded_video_track_source( + EncodedVideoCodecType codec, + uint32_t width, + uint32_t height); + +} // namespace livekit_ffi diff --git a/webrtc-sys/include/livekit/passthrough_video_encoder.h b/webrtc-sys/include/livekit/passthrough_video_encoder.h new file mode 100644 index 000000000..179947209 --- /dev/null +++ b/webrtc-sys/include/livekit/passthrough_video_encoder.h @@ -0,0 +1,129 @@ +/* + * Copyright 2026 LiveKit, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include + +#include "api/environment/environment.h" +#include "api/video/video_frame.h" +#include "api/video_codecs/sdp_video_format.h" +#include "api/video_codecs/video_codec.h" +#include "api/video_codecs/video_encoder.h" +#include "api/video_codecs/video_encoder_factory.h" +#include "livekit/encoded_video_source.h" + +namespace livekit_ffi { + +// Encoder that takes encoded bitstream bytes from a paired +// EncodedVideoTrackSource and forwards them unmodified to the +// EncodedImageCallback. Used for applications that already produce H.264 / +// H.265 / VP8 / VP9 / AV1 bitstreams (e.g. from a hardware capturer or a +// remote camera feed) and want to pipe them through WebRTC without +// re-encoding. +class PassthroughVideoEncoder : public webrtc::VideoEncoder { + public: + // The encoder holds a strong ref to the source so that: + // * Encode() can pop frames / notify keyframe requests without a registry + // lookup (bound 1:1 at construction) + // * SetRates() can forward congestion-controller target bitrate updates + // to the Rust producer immediately. + explicit PassthroughVideoEncoder( + webrtc::scoped_refptr source); + ~PassthroughVideoEncoder() override; + + // webrtc::VideoEncoder + int InitEncode(const webrtc::VideoCodec* codec_settings, + const Settings& settings) override; + int32_t RegisterEncodeCompleteCallback( + webrtc::EncodedImageCallback* callback) override; + int32_t Release() override; + int32_t Encode( + const webrtc::VideoFrame& frame, + const std::vector* frame_types) override; + void SetRates(const RateControlParameters& parameters) override; + EncoderInfo GetEncoderInfo() const override; + + private: + const webrtc::scoped_refptr source_; + const EncodedVideoCodecType codec_; + webrtc::EncodedImageCallback* callback_ = nullptr; + webrtc::VideoCodec codec_settings_{}; + bool initialized_ = false; +}; + +// Wraps a webrtc::VideoEncoder built lazily on the first Encode() call. 
This +// lets us delay the decision of "passthrough vs. real encoder" until we can +// inspect the incoming VideoFrame::id() and check the EncodedSourceRegistry. +// +// Cost: one registry lookup + one encoder construction on the first frame. +// Subsequent frames are a single virtual call with no extra overhead. +class LazyVideoEncoder : public webrtc::VideoEncoder { + public: + // `real_encoder_builder` is called at most once, the first time Encode() + // receives a frame that does not correspond to an encoded source. + using RealEncoderBuilder = + std::function()>; + + LazyVideoEncoder(webrtc::SdpVideoFormat format, + RealEncoderBuilder real_encoder_builder); + ~LazyVideoEncoder() override; + + int InitEncode(const webrtc::VideoCodec* codec_settings, + const Settings& settings) override; + int32_t RegisterEncodeCompleteCallback( + webrtc::EncodedImageCallback* callback) override; + int32_t Release() override; + int32_t Encode( + const webrtc::VideoFrame& frame, + const std::vector* frame_types) override; + void SetRates(const RateControlParameters& parameters) override; + void OnPacketLossRateUpdate(float packet_loss_rate) override; + void OnRttUpdate(int64_t rtt_ms) override; + void OnLossNotification(const LossNotification& loss_notification) override; + EncoderInfo GetEncoderInfo() const override; + + private: + // Build the underlying encoder based on frame.id() lookup. Returns true on + // success. Safe to call exactly once. + bool BuildInner(uint16_t frame_id); + + const webrtc::SdpVideoFormat format_; + RealEncoderBuilder real_encoder_builder_; + + // Set on first Encode(). + std::unique_ptr inner_; + bool is_passthrough_ = false; + + // Deferred InitEncode() args. 
+ webrtc::VideoCodec pending_codec_settings_{}; + webrtc::VideoEncoder::Settings pending_settings_{ + webrtc::VideoEncoder::Capabilities(/*loss_notification=*/false), + /*number_of_cores=*/1, + /*max_payload_size=*/1200}; + bool has_pending_init_ = false; + webrtc::EncodedImageCallback* callback_ = nullptr; + + // Cached rate / loss / rtt updates that arrived before Encode(). + std::optional pending_rates_; + std::optional pending_loss_rate_; + std::optional pending_rtt_ms_; +}; + +} // namespace livekit_ffi diff --git a/webrtc-sys/include/livekit/peer_connection_factory.h b/webrtc-sys/include/livekit/peer_connection_factory.h index 0e77dbadb..e0edf4ef4 100644 --- a/webrtc-sys/include/livekit/peer_connection_factory.h +++ b/webrtc-sys/include/livekit/peer_connection_factory.h @@ -29,6 +29,9 @@ namespace livekit_ffi { class PeerConnectionFactory; class PeerConnectionObserverWrapper; +#ifdef LK_PRE_ENCODED_VIDEO +class EncodedVideoTrackSource; +#endif } // namespace livekit_ffi #include "webrtc-sys/src/peer_connection_factory.rs.h" @@ -53,6 +56,12 @@ class PeerConnectionFactory { rust::String label, std::shared_ptr source) const; +#ifdef LK_PRE_ENCODED_VIDEO + std::shared_ptr create_video_track_from_encoded_source( + rust::String label, + std::shared_ptr source) const; +#endif + std::shared_ptr create_audio_track( rust::String label, std::shared_ptr source) const; diff --git a/webrtc-sys/src/encoded_video_source.cpp b/webrtc-sys/src/encoded_video_source.cpp new file mode 100644 index 000000000..e35fbf85d --- /dev/null +++ b/webrtc-sys/src/encoded_video_source.cpp @@ -0,0 +1,386 @@ +/* + * Copyright 2026 LiveKit, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "livekit/encoded_video_source.h" + +#include +#include +#include + +#include "api/video/i420_buffer.h" +#include "api/video/video_frame.h" +#include "api/video/video_rotation.h" +#include "rtc_base/logging.h" +#include "rtc_base/ref_counted_object.h" +#include "rtc_base/time_utils.h" + +namespace livekit_ffi { + +namespace { + +// ---- Annex-B NAL unit parsing ---- +// +// Produces a list of NAL units in the bytestream. Each NalUnit records the +// offset to its leading start code (00 00 01 or 00 00 00 01) and the +// payload offset/length (the bytes after the start code, up to the next +// start code or end of buffer). + +struct NalUnit { + size_t start_code_offset; // index of the first 0x00 of the start code + size_t start_code_length; // 3 or 4 + size_t payload_offset; // index of the first byte after the start code + size_t payload_length; // length of the NAL unit payload (no start code) + uint8_t first_byte; // payload[0] — used for NAL type extraction +}; + +std::vector ScanNalUnits(const uint8_t* data, size_t size) { + std::vector units; + if (size < 3) return units; + + // Locate start code candidates: positions where data[i..i+2] == 00 00 01. + // Track them in order; then materialize units with proper payload lengths. 
+ std::vector> starts; // (offset, length) + for (size_t i = 0; i + 2 < size;) { + if (data[i] == 0 && data[i + 1] == 0 && data[i + 2] == 1) { + size_t off = i; + size_t len = 3; + if (i > 0 && data[i - 1] == 0) { + off = i - 1; + len = 4; + } + starts.emplace_back(off, len); + i += 3; + } else { + ++i; + } + } + + for (size_t j = 0; j < starts.size(); ++j) { + NalUnit u; + u.start_code_offset = starts[j].first; + u.start_code_length = starts[j].second; + u.payload_offset = u.start_code_offset + u.start_code_length; + size_t payload_end = + (j + 1 < starts.size()) ? starts[j + 1].first : size; + if (payload_end < u.payload_offset) continue; + u.payload_length = payload_end - u.payload_offset; + u.first_byte = u.payload_length > 0 ? data[u.payload_offset] : 0; + units.push_back(u); + } + return units; +} + +// H.264 NAL unit types we care about. +enum : uint8_t { + kH264NalSps = 7, + kH264NalPps = 8, +}; + +// H.265 NAL unit types we care about. +enum : uint8_t { + kH265NalVps = 32, + kH265NalSps = 33, + kH265NalPps = 34, +}; + +uint8_t H264NalType(uint8_t byte) { return byte & 0x1Fu; } +uint8_t H265NalType(uint8_t byte) { return (byte >> 1) & 0x3Fu; } + +// Copies [start_code_offset, payload_end) into `out`, including the start +// code. `out` is overwritten. +void CopyNalWithStartCode(const uint8_t* data, + const NalUnit& u, + std::vector& out) { + const size_t total = u.start_code_length + u.payload_length; + out.assign(data + u.start_code_offset, + data + u.start_code_offset + total); +} + +} // namespace + +// ---------- EncodedSourceRegistry ---------- + +EncodedSourceRegistry& EncodedSourceRegistry::instance() { + static EncodedSourceRegistry reg; + return reg; +} + +uint16_t EncodedSourceRegistry::allocate_id() { + std::lock_guard lock(mu_); + // Skip kNotSetId (0) and any id currently mapped. With 65535 usable slots + // and short-lived encoded tracks this loop is effectively O(1). 
+ for (uint32_t probe = 0; probe < 0x10000u; ++probe) { + uint16_t candidate = static_cast(next_id_); + next_id_ = next_id_ + 1; + if (next_id_ > 0xFFFFu) { + next_id_ = 1; + } + if (candidate == 0) continue; + if (map_.find(candidate) == map_.end()) { + return candidate; + } + } + RTC_LOG(LS_ERROR) + << "EncodedSourceRegistry exhausted all 65535 slots; reusing 1"; + return 1; +} + +void EncodedSourceRegistry::register_source(uint16_t id, + EncodedVideoTrackSource* src) { + std::lock_guard lock(mu_); + map_[id] = src; +} + +void EncodedSourceRegistry::unregister_source(uint16_t id) { + std::lock_guard lock(mu_); + map_.erase(id); +} + +EncodedVideoTrackSource* EncodedSourceRegistry::lookup(uint16_t id) { + if (id == 0) return nullptr; + std::lock_guard lock(mu_); + auto it = map_.find(id); + return it == map_.end() ? nullptr : it->second; +} + +// ---------- EncodedVideoTrackSource::InternalSource ---------- + +EncodedVideoTrackSource::InternalSource::InternalSource( + uint16_t source_id, + EncodedVideoCodecType codec, + uint32_t width, + uint32_t height) + : webrtc::AdaptedVideoTrackSource(/*required_alignment=*/1), + source_id_(source_id), + codec_(codec), + width_(width), + height_(height) {} + +EncodedVideoTrackSource::InternalSource::~InternalSource() = default; + +bool EncodedVideoTrackSource::InternalSource::push_encoded_frame( + std::vector data, + bool is_keyframe, + bool has_sps_pps, + uint32_t width, + uint32_t height, + int64_t capture_time_us) { + { + webrtc::MutexLock lock(&mutex_); + + if (width != 0 && height != 0) { + width_ = width; + height_ = height; + } + + // For H.264 / H.265, cache parameter sets we see in the bytestream and + // auto-prepend them to keyframes that arrive without inline params. + // Delta frames are passed through unchanged — receivers carry the last + // seen parameter sets across the stream. 
+ const bool param_sets_applicable = + (codec_ == EncodedVideoCodecType::H264 || + codec_ == EncodedVideoCodecType::H265); + + if (param_sets_applicable) { + const auto units = ScanNalUnits(data.data(), data.size()); + bool saw_sps = false; + bool saw_pps = false; + bool saw_vps = false; + for (const auto& u : units) { + if (codec_ == EncodedVideoCodecType::H264) { + const uint8_t t = H264NalType(u.first_byte); + if (t == kH264NalSps) { + CopyNalWithStartCode(data.data(), u, cached_sps_); + saw_sps = true; + } else if (t == kH264NalPps) { + CopyNalWithStartCode(data.data(), u, cached_pps_); + saw_pps = true; + } + } else { // H.265 + const uint8_t t = H265NalType(u.first_byte); + if (t == kH265NalVps) { + CopyNalWithStartCode(data.data(), u, cached_vps_); + saw_vps = true; + } else if (t == kH265NalSps) { + CopyNalWithStartCode(data.data(), u, cached_sps_); + saw_sps = true; + } else if (t == kH265NalPps) { + CopyNalWithStartCode(data.data(), u, cached_pps_); + saw_pps = true; + } + } + } + + if (is_keyframe) { + // Required params for this codec. + const bool h265 = codec_ == EncodedVideoCodecType::H265; + const bool have_required = + !cached_sps_.empty() && !cached_pps_.empty() && + (!h265 || !cached_vps_.empty()); + const bool frame_missing = + !(saw_sps && saw_pps && (!h265 || saw_vps)); + + if (frame_missing && have_required) { + // Prepend cached params. (void)has_sps_pps — we trust the + // scanner over the flag so callers can't accidentally double- + // prepend or lie about the contents. 
+ std::vector prefixed; + prefixed.reserve(cached_vps_.size() + cached_sps_.size() + + cached_pps_.size() + data.size()); + if (h265) { + prefixed.insert(prefixed.end(), cached_vps_.begin(), + cached_vps_.end()); + } + prefixed.insert(prefixed.end(), cached_sps_.begin(), + cached_sps_.end()); + prefixed.insert(prefixed.end(), cached_pps_.begin(), + cached_pps_.end()); + prefixed.insert(prefixed.end(), data.begin(), data.end()); + data = std::move(prefixed); + has_sps_pps = true; + } else if (frame_missing) { + RTC_LOG(LS_WARNING) + << "EncodedVideoTrackSource[" << source_id_ + << "] keyframe is missing parameter sets and none are cached; " + "receiver will fail to decode until the producer emits a " + "keyframe with inline SPS/PPS" + << (h265 ? "/VPS" : ""); + } else { + // Frame already carries required params (producer inlined them). + has_sps_pps = true; + } + } + } + + // Bounded queue: drop-oldest, but never drop a keyframe. + while (queue_.size() >= kMaxQueueSize) { + if (queue_.front().is_keyframe && !is_keyframe) { + RTC_LOG(LS_WARNING) + << "EncodedVideoTrackSource[" << source_id_ + << "] queue full; dropping incoming delta to preserve keyframe"; + return false; + } + queue_.pop_front(); + } + + DequeuedFrame f; + f.data = std::move(data); + f.is_keyframe = is_keyframe; + f.has_sps_pps = has_sps_pps; + f.width = width_; + f.height = height_; + f.capture_time_us = capture_time_us; + queue_.push_back(std::move(f)); + } + + // Emit a dummy VideoFrame so the WebRTC pipeline ticks. The actual bytes + // are pulled out by PassthroughVideoEncoder via the registry, keyed on + // source_id_ stamped into VideoFrame::id(). + // + // The dummy buffer is 2x2 I420 black; callers never see it. WebRTC needs + // *some* buffer here. The width/height on the VideoFrame carry the real + // resolution so downstream stats, pacing, and simulcast decisions work. 
+ auto dummy_buffer = webrtc::I420Buffer::Create(2, 2); + webrtc::I420Buffer::SetBlack(dummy_buffer.get()); + + webrtc::VideoFrame frame = + webrtc::VideoFrame::Builder() + .set_video_frame_buffer(dummy_buffer) + .set_rotation(webrtc::kVideoRotation_0) + .set_timestamp_us(capture_time_us != 0 ? capture_time_us + : webrtc::TimeMicros()) + .set_id(source_id_) + .build(); + + OnFrame(frame); + return true; +} + +bool EncodedVideoTrackSource::InternalSource::pop_encoded_frame( + DequeuedFrame& out) { + webrtc::MutexLock lock(&mutex_); + if (queue_.empty()) return false; + out = std::move(queue_.front()); + queue_.pop_front(); + return true; +} + +void EncodedVideoTrackSource::InternalSource::notify_keyframe_requested() { + webrtc::MutexLock lock(&mutex_); + if (observer_) { + (*observer_)->on_keyframe_requested(); + } +} + +void EncodedVideoTrackSource::InternalSource::notify_target_bitrate( + uint32_t bitrate_bps, + double framerate_fps) { + webrtc::MutexLock lock(&mutex_); + if (observer_) { + (*observer_)->on_target_bitrate(bitrate_bps, framerate_fps); + } +} + +void EncodedVideoTrackSource::InternalSource::set_observer( + rust::Box observer) { + webrtc::MutexLock lock(&mutex_); + observer_ = std::make_unique>( + std::move(observer)); +} + +// ---------- EncodedVideoTrackSource ---------- + +EncodedVideoTrackSource::EncodedVideoTrackSource(EncodedVideoCodecType codec, + uint32_t width, + uint32_t height) { + uint16_t id = EncodedSourceRegistry::instance().allocate_id(); + source_ = webrtc::make_ref_counted(id, codec, width, height); + EncodedSourceRegistry::instance().register_source(id, this); + RTC_LOG(LS_INFO) << "EncodedVideoTrackSource created id=" << id + << " codec=" << static_cast(codec) << " " << width + << "x" << height; +} + +EncodedVideoTrackSource::~EncodedVideoTrackSource() { + EncodedSourceRegistry::instance().unregister_source(source_->source_id()); + RTC_LOG(LS_INFO) << "EncodedVideoTrackSource destroyed id=" + << source_->source_id(); +} + +bool 
EncodedVideoTrackSource::capture_frame(rust::Slice data, + bool is_keyframe, + bool has_sps_pps, + uint32_t width, + uint32_t height, + int64_t capture_time_us) const { + std::vector buf(data.begin(), data.end()); + return source_->push_encoded_frame(std::move(buf), is_keyframe, has_sps_pps, + width, height, capture_time_us); +} + +void EncodedVideoTrackSource::set_observer( + rust::Box observer) const { + source_->set_observer(std::move(observer)); +} + +std::shared_ptr new_encoded_video_track_source( + EncodedVideoCodecType codec, + uint32_t width, + uint32_t height) { + return std::make_shared(codec, width, height); +} + +} // namespace livekit_ffi diff --git a/webrtc-sys/src/encoded_video_source.rs b/webrtc-sys/src/encoded_video_source.rs new file mode 100644 index 000000000..82b6ecbd4 --- /dev/null +++ b/webrtc-sys/src/encoded_video_source.rs @@ -0,0 +1,95 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
use std::sync::Arc;

use crate::impl_thread_safety;

#[cxx::bridge(namespace = "livekit_ffi")]
pub mod ffi {
    #[repr(u8)]
    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
    pub enum EncodedVideoCodecType {
        H264 = 0,
        H265 = 1,
        Vp8 = 2,
        Vp9 = 3,
        Av1 = 4,
    }

    unsafe extern "C++" {
        include!("livekit/encoded_video_source.h");

        type EncodedVideoTrackSource;

        // NOTE(review): generic arguments on SharedPtr/Box below were lost in
        // extraction and reconstructed from the C++ signatures — confirm
        // against the repository.
        fn new_encoded_video_track_source(
            codec: EncodedVideoCodecType,
            width: u32,
            height: u32,
        ) -> SharedPtr<EncodedVideoTrackSource>;

        fn source_id(self: &EncodedVideoTrackSource) -> u16;
        fn codec(self: &EncodedVideoTrackSource) -> EncodedVideoCodecType;

        fn capture_frame(
            self: &EncodedVideoTrackSource,
            data: &[u8],
            is_keyframe: bool,
            has_sps_pps: bool,
            width: u32,
            height: u32,
            capture_time_us: i64,
        ) -> bool;

        fn set_observer(
            self: &EncodedVideoTrackSource,
            observer: Box<EncodedVideoSourceWrapper>,
        );
    }

    extern "Rust" {
        type EncodedVideoSourceWrapper;

        fn on_keyframe_requested(self: &EncodedVideoSourceWrapper);
        fn on_target_bitrate(
            self: &EncodedVideoSourceWrapper,
            bitrate_bps: u32,
            framerate_fps: f64,
        );
    }
}

impl_thread_safety!(ffi::EncodedVideoTrackSource, Send + Sync);

/// Trait implemented by Rust consumers to receive encoder feedback (keyframe
/// requests, target bitrate updates) from WebRTC.
pub trait EncodedVideoSourceObserver: Send + Sync {
    fn on_keyframe_requested(&self);
    fn on_target_bitrate(&self, bitrate_bps: u32, framerate_fps: f64);
}

/// Bridges C++ observer callbacks onto a shared Rust observer implementation.
pub struct EncodedVideoSourceWrapper {
    observer: Arc<dyn EncodedVideoSourceObserver>,
}

impl EncodedVideoSourceWrapper {
    pub fn new(observer: Arc<dyn EncodedVideoSourceObserver>) -> Self {
        Self { observer }
    }

    fn on_keyframe_requested(&self) {
        self.observer.on_keyframe_requested();
    }

    fn on_target_bitrate(&self, bitrate_bps: u32, framerate_fps: f64) {
        self.observer.on_target_bitrate(bitrate_bps, framerate_fps);
    }
}
diff --git a/webrtc-sys/src/lib.rs b/webrtc-sys/src/lib.rs
index 94f4eed0c..e23b63400 100644
--- a/webrtc-sys/src/lib.rs
+++ b/webrtc-sys/src/lib.rs
@@ -22,6 +22,8 @@ pub mod candidate;
 pub mod data_channel;
 #[cfg(any(target_os = "macos", target_os = "windows", target_os = "linux"))]
 pub mod desktop_capturer;
+#[cfg(feature = "encoded-video")]
+pub mod encoded_video_source;
 pub mod frame_cryptor;
 pub mod helper;
 pub mod jsep;
diff --git a/webrtc-sys/src/passthrough_video_encoder.cpp b/webrtc-sys/src/passthrough_video_encoder.cpp
new file mode 100644
index 000000000..accc178ee
--- /dev/null
+++ b/webrtc-sys/src/passthrough_video_encoder.cpp
@@ -0,0 +1,313 @@
/*
 * Copyright 2026 LiveKit, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
+ */ + +#include "livekit/passthrough_video_encoder.h" + +#include +#include + +#include "api/video/encoded_image.h" +#include "api/video/video_codec_type.h" +#include "api/video/video_frame_type.h" +#include "modules/video_coding/include/video_codec_interface.h" +#include "modules/video_coding/include/video_error_codes.h" +#include "rtc_base/logging.h" + +namespace livekit_ffi { + +namespace { + +webrtc::VideoCodecType ToWebrtcCodec(EncodedVideoCodecType codec) { + switch (codec) { + case EncodedVideoCodecType::H264: + return webrtc::kVideoCodecH264; + case EncodedVideoCodecType::H265: + return webrtc::kVideoCodecH265; + case EncodedVideoCodecType::Vp8: + return webrtc::kVideoCodecVP8; + case EncodedVideoCodecType::Vp9: + return webrtc::kVideoCodecVP9; + case EncodedVideoCodecType::Av1: + return webrtc::kVideoCodecAV1; + default: + return webrtc::kVideoCodecGeneric; + } +} + +bool FrameTypesRequestKeyframe( + const std::vector* frame_types) { + if (!frame_types) + return false; + return std::any_of(frame_types->begin(), frame_types->end(), + [](webrtc::VideoFrameType t) { + return t == webrtc::VideoFrameType::kVideoFrameKey; + }); +} + +} // namespace + +// ---------- PassthroughVideoEncoder ---------- + +PassthroughVideoEncoder::PassthroughVideoEncoder( + webrtc::scoped_refptr source) + : source_(std::move(source)), + codec_(source_ ? 
source_->codec() : EncodedVideoCodecType::H264) { + RTC_DCHECK(source_); +} + +PassthroughVideoEncoder::~PassthroughVideoEncoder() = default; + +int PassthroughVideoEncoder::InitEncode( + const webrtc::VideoCodec* codec_settings, + const Settings& settings) { + if (codec_settings) { + codec_settings_ = *codec_settings; + } + initialized_ = true; + return WEBRTC_VIDEO_CODEC_OK; +} + +int32_t PassthroughVideoEncoder::RegisterEncodeCompleteCallback( + webrtc::EncodedImageCallback* callback) { + callback_ = callback; + return WEBRTC_VIDEO_CODEC_OK; +} + +int32_t PassthroughVideoEncoder::Release() { + callback_ = nullptr; + initialized_ = false; + return WEBRTC_VIDEO_CODEC_OK; +} + +int32_t PassthroughVideoEncoder::Encode( + const webrtc::VideoFrame& frame, + const std::vector* frame_types) { + if (!initialized_ || !callback_) { + return WEBRTC_VIDEO_CODEC_UNINITIALIZED; + } + + if (FrameTypesRequestKeyframe(frame_types)) { + source_->notify_keyframe_requested(); + } + + EncodedVideoTrackSource::InternalSource::DequeuedFrame enc; + if (!source_->pop_encoded_frame(enc)) { + // No bytes queued for this tick; treat as a dropped frame so WebRTC's + // pacing accounting is correct. + callback_->OnDroppedFrame( + webrtc::EncodedImageCallback::DropReason::kDroppedByEncoder); + return WEBRTC_VIDEO_CODEC_OK; + } + + webrtc::EncodedImage image; + image.SetEncodedData( + webrtc::EncodedImageBuffer::Create(enc.data.data(), enc.data.size())); + image.SetFrameType(enc.is_keyframe + ? webrtc::VideoFrameType::kVideoFrameKey + : webrtc::VideoFrameType::kVideoFrameDelta); + image.SetRtpTimestamp(frame.rtp_timestamp()); + image.capture_time_ms_ = enc.capture_time_us != 0 ? 
enc.capture_time_us / 1000 + : frame.render_time_ms(); + image._encodedWidth = enc.width; + image._encodedHeight = enc.height; + image.rotation_ = frame.rotation(); + + webrtc::CodecSpecificInfo info{}; + info.codecType = ToWebrtcCodec(codec_); + info.end_of_picture = true; + + auto result = callback_->OnEncodedImage(image, &info); + if (result.error != webrtc::EncodedImageCallback::Result::OK) { + RTC_LOG(LS_WARNING) + << "PassthroughVideoEncoder OnEncodedImage failed; send_failed=" + << (result.error == + webrtc::EncodedImageCallback::Result::ERROR_SEND_FAILED); + return WEBRTC_VIDEO_CODEC_ERROR; + } + return WEBRTC_VIDEO_CODEC_OK; +} + +void PassthroughVideoEncoder::SetRates( + const RateControlParameters& parameters) { + const uint32_t target_bps = parameters.target_bitrate.get_sum_bps(); + const double framerate = parameters.framerate_fps; + source_->notify_target_bitrate(target_bps, framerate); +} + +webrtc::VideoEncoder::EncoderInfo PassthroughVideoEncoder::GetEncoderInfo() + const { + EncoderInfo info; + info.implementation_name = "LiveKitPassthrough"; + info.is_hardware_accelerated = false; + info.supports_native_handle = false; + info.has_trusted_rate_controller = true; + info.supports_simulcast = false; + info.requested_resolution_alignment = 1; + info.apply_alignment_to_all_simulcast_layers = false; + return info; +} + +// ---------- LazyVideoEncoder ---------- + +LazyVideoEncoder::LazyVideoEncoder(webrtc::SdpVideoFormat format, + RealEncoderBuilder real_encoder_builder) + : format_(std::move(format)), + real_encoder_builder_(std::move(real_encoder_builder)) {} + +LazyVideoEncoder::~LazyVideoEncoder() = default; + +int LazyVideoEncoder::InitEncode(const webrtc::VideoCodec* codec_settings, + const Settings& settings) { + if (codec_settings) { + pending_codec_settings_ = *codec_settings; + } + pending_settings_ = settings; + has_pending_init_ = true; + + // If we already built an inner (e.g. re-init), forward immediately. 
+ if (inner_) { + return inner_->InitEncode(codec_settings, settings); + } + return WEBRTC_VIDEO_CODEC_OK; +} + +int32_t LazyVideoEncoder::RegisterEncodeCompleteCallback( + webrtc::EncodedImageCallback* callback) { + callback_ = callback; + if (inner_) { + return inner_->RegisterEncodeCompleteCallback(callback); + } + return WEBRTC_VIDEO_CODEC_OK; +} + +int32_t LazyVideoEncoder::Release() { + int32_t rc = WEBRTC_VIDEO_CODEC_OK; + if (inner_) { + rc = inner_->Release(); + } + inner_.reset(); + has_pending_init_ = false; + pending_rates_.reset(); + pending_loss_rate_.reset(); + pending_rtt_ms_.reset(); + callback_ = nullptr; + return rc; +} + +bool LazyVideoEncoder::BuildInner(uint16_t frame_id) { + EncodedVideoTrackSource* src = + EncodedSourceRegistry::instance().lookup(frame_id); + + if (src != nullptr) { + inner_ = std::make_unique(src->get()); + is_passthrough_ = true; + RTC_LOG(LS_INFO) + << "LazyVideoEncoder: using PassthroughVideoEncoder for source id=" + << frame_id << " codec=" << static_cast(src->codec()) + << " sdp=" << format_.name; + } else { + inner_ = real_encoder_builder_ ? 
real_encoder_builder_() : nullptr; + is_passthrough_ = false; + if (!inner_) { + RTC_LOG(LS_ERROR) + << "LazyVideoEncoder: real_encoder_builder returned null for " + << format_.name; + return false; + } + } + + if (callback_) { + inner_->RegisterEncodeCompleteCallback(callback_); + } + if (has_pending_init_) { + int rc = inner_->InitEncode(&pending_codec_settings_, pending_settings_); + if (rc != WEBRTC_VIDEO_CODEC_OK) { + RTC_LOG(LS_ERROR) << "LazyVideoEncoder: inner InitEncode failed rc=" + << rc; + return false; + } + } + if (pending_rates_) { + inner_->SetRates(*pending_rates_); + pending_rates_.reset(); + } + if (pending_loss_rate_) { + inner_->OnPacketLossRateUpdate(*pending_loss_rate_); + pending_loss_rate_.reset(); + } + if (pending_rtt_ms_) { + inner_->OnRttUpdate(*pending_rtt_ms_); + pending_rtt_ms_.reset(); + } + return true; +} + +int32_t LazyVideoEncoder::Encode( + const webrtc::VideoFrame& frame, + const std::vector* frame_types) { + if (!inner_) { + if (!BuildInner(frame.id())) { + return WEBRTC_VIDEO_CODEC_ERROR; + } + } + return inner_->Encode(frame, frame_types); +} + +void LazyVideoEncoder::SetRates(const RateControlParameters& parameters) { + if (inner_) { + inner_->SetRates(parameters); + } else { + pending_rates_ = parameters; + } +} + +void LazyVideoEncoder::OnPacketLossRateUpdate(float packet_loss_rate) { + if (inner_) { + inner_->OnPacketLossRateUpdate(packet_loss_rate); + } else { + pending_loss_rate_ = packet_loss_rate; + } +} + +void LazyVideoEncoder::OnRttUpdate(int64_t rtt_ms) { + if (inner_) { + inner_->OnRttUpdate(rtt_ms); + } else { + pending_rtt_ms_ = rtt_ms; + } +} + +void LazyVideoEncoder::OnLossNotification( + const LossNotification& loss_notification) { + if (inner_) { + inner_->OnLossNotification(loss_notification); + } +} + +webrtc::VideoEncoder::EncoderInfo LazyVideoEncoder::GetEncoderInfo() const { + if (inner_) { + return inner_->GetEncoderInfo(); + } + EncoderInfo info; + info.implementation_name = "LiveKitLazy"; + 
info.is_hardware_accelerated = false; + info.supports_native_handle = false; + info.requested_resolution_alignment = 1; + info.apply_alignment_to_all_simulcast_layers = false; + return info; +} + +} // namespace livekit_ffi diff --git a/webrtc-sys/src/peer_connection_factory.cpp b/webrtc-sys/src/peer_connection_factory.cpp index a0e27a0a2..a63b5a90f 100644 --- a/webrtc-sys/src/peer_connection_factory.cpp +++ b/webrtc-sys/src/peer_connection_factory.cpp @@ -36,6 +36,9 @@ #include "livekit/peer_connection.h" #include "livekit/rtc_error.h" #include "livekit/rtp_parameters.h" +#ifdef LK_PRE_ENCODED_VIDEO +#include "livekit/encoded_video_source.h" +#endif #include "livekit/video_decoder_factory.h" #include "livekit/video_encoder_factory.h" #include "livekit/webrtc.h" @@ -116,6 +119,17 @@ std::shared_ptr PeerConnectionFactory::create_video_track( peer_factory_->CreateVideoTrack(source->get(), label.c_str()))); } +#ifdef LK_PRE_ENCODED_VIDEO +std::shared_ptr +PeerConnectionFactory::create_video_track_from_encoded_source( + rust::String label, + std::shared_ptr source) const { + return std::static_pointer_cast( + rtc_runtime_->get_or_create_media_stream_track( + peer_factory_->CreateVideoTrack(source->get(), label.c_str()))); +} +#endif + std::shared_ptr PeerConnectionFactory::create_audio_track( rust::String label, std::shared_ptr source) const { diff --git a/webrtc-sys/src/peer_connection_factory.rs b/webrtc-sys/src/peer_connection_factory.rs index c18d8331c..0cd9ca8b7 100644 --- a/webrtc-sys/src/peer_connection_factory.rs +++ b/webrtc-sys/src/peer_connection_factory.rs @@ -58,6 +58,8 @@ pub mod ffi { type IceGatheringState = crate::peer_connection::ffi::IceGatheringState; type AudioTrackSource = crate::audio_track::ffi::AudioTrackSource; type VideoTrackSource = crate::video_track::ffi::VideoTrackSource; + #[cfg(encoded_video)] + type EncodedVideoTrackSource = crate::encoded_video_source::ffi::EncodedVideoTrackSource; type RtpCapabilities = 
crate::rtp_parameters::ffi::RtpCapabilities; type AudioTrack = crate::audio_track::ffi::AudioTrack; type VideoTrack = crate::video_track::ffi::VideoTrack; @@ -101,6 +103,13 @@ pub mod ffi { source: SharedPtr, ) -> SharedPtr; + #[cfg(encoded_video)] + fn create_video_track_from_encoded_source( + self: &PeerConnectionFactory, + label: String, + source: SharedPtr, + ) -> SharedPtr; + fn create_audio_track( self: &PeerConnectionFactory, label: String, diff --git a/webrtc-sys/src/video_encoder_factory.cpp b/webrtc-sys/src/video_encoder_factory.cpp index 7435760b4..8d9afa58b 100644 --- a/webrtc-sys/src/video_encoder_factory.cpp +++ b/webrtc-sys/src/video_encoder_factory.cpp @@ -21,6 +21,9 @@ #include "api/video_codecs/video_encoder.h" #include "api/video_codecs/video_encoder_factory_template.h" #include "livekit/objc_video_factory.h" +#ifdef LK_PRE_ENCODED_VIDEO +#include "livekit/passthrough_video_encoder.h" +#endif #include "media/base/media_constants.h" #include "media/engine/simulcast_encoder_adapter.h" #include "rtc_base/logging.h" @@ -146,13 +149,28 @@ VideoEncoderFactory::CodecSupport VideoEncoderFactory::QueryCodecSupport( std::unique_ptr VideoEncoderFactory::Create( const webrtc::Environment& env, const webrtc::SdpVideoFormat& format) { - std::unique_ptr encoder; - if (format.IsCodecInList(internal_factory_->GetSupportedFormats())) { - encoder = std::make_unique( - env, internal_factory_.get(), nullptr, format); + if (!format.IsCodecInList(internal_factory_->GetSupportedFormats())) { + return nullptr; } - return encoder; +#ifdef LK_PRE_ENCODED_VIDEO + // Wrap the real encoder construction in a lazy shim so we can branch + // between passthrough and a real encoder based on the first VideoFrame's + // id. The builder is called at most once and only for non-passthrough + // tracks; passthrough tracks never instantiate the SimulcastEncoderAdapter. 
+ auto real_encoder_builder = [env, format, + internal_factory = internal_factory_.get()]() + -> std::unique_ptr { + return std::make_unique( + env, internal_factory, nullptr, format); + }; + + return std::make_unique(format, + std::move(real_encoder_builder)); +#else + return std::make_unique( + env, internal_factory_.get(), nullptr, format); +#endif } } // namespace livekit_ffi