Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 12 additions & 4 deletions backend/Dockerfile.golang
Original file line number Diff line number Diff line change
Expand Up @@ -21,20 +21,28 @@ ENV AMDGPU_TARGETS=${AMDGPU_TARGETS}
# Build args forwarded as environment variables to the apt-mirror script in
# the RUN step below.
ARG APT_MIRROR
ARG APT_PORTS_MIRROR

# gcc-14 is installable from the stock repos on noble (ubuntu:24.04) but
# absent from jammy (the L4T jetpack r36.4.0 base). LocalVQE specifically
# needs it; the other Go backends compile fine with the default gcc shipped
# via build-essential. So: install gcc-14 only when the configured repos
# offer it, and skip it gracefully otherwise so jammy-based builds don't
# fail at the apt step.
# The base toolchain is always installed. gcc-14/g++-14 are installed and
# registered as the gcc/g++/gcov alternatives only when apt-cache can find
# both packages; no unconditional reference to gcc-14 may remain in this
# step, otherwise the install fails on bases that do not ship it.
RUN --mount=type=bind,source=.docker/apt-mirror.sh,target=/usr/local/sbin/apt-mirror \
    APT_MIRROR="${APT_MIRROR}" APT_PORTS_MIRROR="${APT_PORTS_MIRROR}" sh /usr/local/sbin/apt-mirror && \
    apt-get update && \
    apt-get install -y --no-install-recommends \
        build-essential \
        git ccache \
        ca-certificates \
        make cmake wget libopenblas-dev \
        curl unzip \
        libssl-dev && \
    if apt-cache show gcc-14 >/dev/null 2>&1 && apt-cache show g++-14 >/dev/null 2>&1; then \
        apt-get install -y --no-install-recommends gcc-14 g++-14 && \
        update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100 \
            --slave /usr/bin/g++ g++ /usr/bin/g++-14 \
            --slave /usr/bin/gcov gcov /usr/bin/gcov-14; \
    fi && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

Expand Down
39 changes: 39 additions & 0 deletions backend/backend.proto
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ service Backend {

rpc VAD(VADRequest) returns (VADResponse) {}

rpc Diarize(DiarizeRequest) returns (DiarizeResponse) {}

rpc AudioEncode(AudioEncodeRequest) returns (AudioEncodeResult) {}
rpc AudioDecode(AudioDecodeRequest) returns (AudioDecodeResult) {}

Expand Down Expand Up @@ -416,6 +418,43 @@ message VADResponse {
repeated VADSegment segments = 1;
}

// --- Speaker diarization messages ---
//
// Pure speaker diarization: "who spoke when". Returns time-stamped segments
// labelled with cluster IDs (the same string for the same speaker across
// segments). Some backends (e.g. vibevoice.cpp) produce diarization as a
// by-product of ASR and may also fill in `text` per segment; backends with a
// dedicated diarization pipeline (e.g. sherpa-onnx pyannote) leave `text`
// empty and emit only the segmentation.

// Request payload for the Diarize RPC.
message DiarizeRequest {
  // Path to the audio file. The HTTP layer materialises uploads to a temp
  // file before the backend sees them.
  string dst = 1;

  // NOTE(review): presumably the number of worker threads the backend may
  // use — confirm against the backend implementations.
  uint32 threads = 2;

  // Optional. Only meaningful for backends that bundle transcription.
  string language = 3;

  // Exact speaker count when known: a value > 0 forces it, 0 means
  // auto-detect.
  int32 num_speakers = 4;

  // Lower-bound hint used when auto-detecting; 0 = unset.
  int32 min_speakers = 5;

  // Upper-bound hint used when auto-detecting; 0 = unset.
  int32 max_speakers = 6;

  // Distance threshold applied when num_speakers is unknown;
  // 0 = backend default.
  float clustering_threshold = 7;

  // Segments shorter than this many seconds are discarded;
  // 0 = backend default.
  float min_duration_on = 8;

  // Gaps shorter than this many seconds are merged; 0 = backend default.
  float min_duration_off = 9;

  // Ask the backend to populate `text` when it can emit a per-segment
  // transcript for free.
  bool include_text = 10;
}

// One time-stamped, speaker-labelled span of audio.
message DiarizeSegment {
  // NOTE(review): presumably the segment's index within the response —
  // confirm against the backend implementations.
  int32 id = 1;

  // Segment start, in seconds.
  float start = 2;

  // Segment end, in seconds.
  float end = 3;

  // Speaker label as emitted by the backend (e.g. "0", "SPEAKER_00").
  string speaker = 4;

  // Optional per-segment transcript; empty unless include_text was
  // requested and the backend supports it.
  string text = 5;
}

// Response payload for the Diarize RPC.
message DiarizeResponse {
  // The diarized segments.
  repeated DiarizeSegment segments = 1;

  // Count of distinct speaker labels appearing in `segments`.
  int32 num_speakers = 2;

  // Total audio duration in seconds; 0 if unknown.
  float duration = 3;

  // Optional; set when the backend bundles transcription.
  string language = 4;
}

message SoundGenerationRequest {
string text = 1;
string model = 2;
Expand Down
Loading
Loading