mudler · mudler · May 5, 2026 · May 4, 2026 · May 4, 2026 · May 4, 2026
diff --git a/backend/Dockerfile.golang b/backend/Dockerfile.golang
@@ -21,20 +21,28 @@ ENV AMDGPU_TARGETS=${AMDGPU_TARGETS}
 ARG APT_MIRROR
 ARG APT_PORTS_MIRROR
 
+# gcc-14 is available on noble (ubuntu:24.04), though not as the default
+# compiler, and is absent from jammy (the L4T jetpack r36.4.0 base).
+# LocalVQE specifically needs it; the other Go backends compile fine with
+# the default gcc shipped via build-essential. So: install gcc-14 and make
+# it the default only when the configured repos offer it, and skip that
+# step otherwise so jammy-based builds don't fail at the apt step.
 RUN --mount=type=bind,source=.docker/apt-mirror.sh,target=/usr/local/sbin/apt-mirror \
     APT_MIRROR="${APT_MIRROR}" APT_PORTS_MIRROR="${APT_PORTS_MIRROR}" sh /usr/local/sbin/apt-mirror && \
     apt-get update && \
     apt-get install -y --no-install-recommends \
         build-essential \
-        gcc-14 g++-14 \
         git ccache \
         ca-certificates \
         make cmake wget libopenblas-dev \
         curl unzip \
         libssl-dev && \
-    update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100 \
-        --slave /usr/bin/g++ g++ /usr/bin/g++-14 \
-        --slave /usr/bin/gcov gcov /usr/bin/gcov-14 && \
+    if apt-cache show gcc-14 >/dev/null 2>&1 && apt-cache show g++-14 >/dev/null 2>&1; then \
+        apt-get install -y --no-install-recommends gcc-14 g++-14 && \
+        update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100 \
+            --slave /usr/bin/g++ g++ /usr/bin/g++-14 \
+            --slave /usr/bin/gcov gcov /usr/bin/gcov-14; \
+    fi && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 

diff --git a/backend/backend.proto b/backend/backend.proto
@@ -41,6 +41,8 @@ service Backend {
 
   rpc VAD(VADRequest) returns (VADResponse) {}
 
+  rpc Diarize(DiarizeRequest) returns (DiarizeResponse) {}
+
   rpc AudioEncode(AudioEncodeRequest) returns (AudioEncodeResult) {}
   rpc AudioDecode(AudioDecodeRequest) returns (AudioDecodeResult) {}
 
@@ -416,6 +418,43 @@ message VADResponse {
   repeated VADSegment segments = 1;
 }
 
+// --- Speaker diarization messages ---
+//
+// Pure speaker diarization: "who spoke when". Returns time-stamped segments
+// labelled with cluster IDs (the same string for the same speaker across
+// segments). Some backends (e.g. vibevoice.cpp) produce diarization as a
+// by-product of ASR and may also fill in `text` per segment; backends with a
+// dedicated diarization pipeline (e.g. sherpa-onnx pyannote) leave `text`
+// empty and emit only the segmentation.
+
+message DiarizeRequest {
+  string dst = 1;                      // path to the input audio file to diarize (HTTP layer materialises uploads to a temp file)
+  uint32 threads = 2;                  // NOTE(review): presumably the backend worker thread count; confirm against backend implementations
+  string language = 3;                 // optional; only meaningful for transcription-bundling backends
+  int32  num_speakers = 4;             // exact speaker count if known (>0 forces); 0 = auto
+  int32  min_speakers = 5;             // hint when auto-detecting; 0 = unset
+  int32  max_speakers = 6;             // hint when auto-detecting; 0 = unset
+  float  clustering_threshold = 7;     // distance threshold when num_speakers unknown; 0 = backend default
+  float  min_duration_on = 8;          // discard segments shorter than this (seconds); 0 = backend default
+  float  min_duration_off = 9;         // merge gaps shorter than this (seconds); 0 = backend default
+  bool   include_text = 10;            // when the backend can emit per-segment transcript for free, ask it to populate `text`
+}
+
+message DiarizeSegment {
+  int32  id = 1;                       // NOTE(review): presumably the segment's index within the response; confirm with backends
+  float  start = 2;                    // segment start time, in seconds
+  float  end = 3;                      // segment end time, in seconds
+  string speaker = 4;                  // backend-emitted speaker label (e.g. "0", "SPEAKER_00"); same string for the same speaker across segments
+  string text = 5;                     // optional per-segment transcript (empty unless include_text and supported)
+}
+
+message DiarizeResponse {
+  repeated DiarizeSegment segments = 1;  // time-stamped segments, each labelled with a speaker
+  int32  num_speakers = 2;             // count of distinct speaker labels in `segments`
+  float  duration = 3;                 // total audio duration in seconds (0 if unknown)
+  string language = 4;                 // optional, when the backend bundles transcription
+}
+
 message SoundGenerationRequest {
   string text = 1;
   string model = 2;