From e4d8190dd9512115e2c4b7be53072f50376b9141 Mon Sep 17 00:00:00 2001
From: Seppo Ingalsuo
Date: Tue, 12 May 2026 14:13:08 +0300
Subject: [PATCH] Audio: MFCC: Add Voice Activity Detection based on Mel spectrum

This patch adds a new mfcc_vad module that implements VAD operating on
the Mel log spectrum values produced by the MFCC component.

The VAD is very simple and is not very selective for voice vs. other
signals. However, the continuously updated background noise estimate
prevents stationary noise from triggering the VAD.

The algorithm tracks a per-bin noise floor (instant-down, slow-rise)
and computes an A-weighted energy delta. The weighting emphasizes
speech frequencies. Speech is declared when the delta exceeds a
threshold (0.30 in Q9.23), with a 20-frame hangover to prevent rapid
toggling.

The VAD flag is inserted into the output stream as the first value
after the magic header word in all format paths (S16, S24, S32).

A new Kconfig option CONFIG_COMP_MFCC_VAD (depends on COMP_MFCC,
default n) gates compilation of the VAD code and the stream format
change.

Signed-off-by: Seppo Ingalsuo --- src/arch/host/configs/library_defconfig | 1 + src/audio/mfcc/CMakeLists.txt | 3 + src/audio/mfcc/Kconfig | 11 ++ src/audio/mfcc/mfcc_common.c | 44 +++++ src/audio/mfcc/mfcc_setup.c | 18 ++ src/audio/mfcc/mfcc_vad.c | 239 ++++++++++++++++++++++++ src/include/sof/audio/mfcc/mfcc_comp.h | 8 + src/include/sof/audio/mfcc/mfcc_vad.h | 102 ++++++++++ 8 files changed, 426 insertions(+) create mode 100644 src/audio/mfcc/mfcc_vad.c create mode 100644 src/include/sof/audio/mfcc/mfcc_vad.h diff --git a/src/arch/host/configs/library_defconfig b/src/arch/host/configs/library_defconfig index 28c486bec58d..34ea0fe051f3 100644 --- a/src/arch/host/configs/library_defconfig +++ b/src/arch/host/configs/library_defconfig @@ -11,6 +11,7 @@ CONFIG_COMP_IIR=y CONFIG_COMP_IGO_NR=y CONFIG_COMP_LEVEL_MULTIPLIER=y CONFIG_COMP_MFCC=y +CONFIG_COMP_MFCC_VAD=y CONFIG_COMP_MODULE_ADAPTER=y CONFIG_COMP_MULTIBAND_DRC=y CONFIG_COMP_MUX=y diff --git a/src/audio/mfcc/CMakeLists.txt b/src/audio/mfcc/CMakeLists.txt index f8af79d1ca8a..433aa824e713 100644 --- a/src/audio/mfcc/CMakeLists.txt +++ b/src/audio/mfcc/CMakeLists.txt @@ -5,4 +5,7 @@ if(CONFIG_COMP_MFCC STREQUAL "m" AND DEFINED CONFIG_LLEXT) add_dependencies(app mfcc) else() add_local_sources(sof mfcc.c mfcc_setup.c mfcc_common.c mfcc_generic.c mfcc_hifi4.c mfcc_hifi3.c) + if(CONFIG_COMP_MFCC_VAD) + add_local_sources(sof mfcc_vad.c) + endif() endif() diff --git a/src/audio/mfcc/Kconfig b/src/audio/mfcc/Kconfig index f56cadb40de2..fc74700562c0 100644 --- a/src/audio/mfcc/Kconfig +++ b/src/audio/mfcc/Kconfig @@ -24,3 +24,14 @@ config COMP_MFCC The characteristic of the audio features are defined in the binary control blob. Directory tools/tune/mfcc contains a tool to create the configurations. 
+ +config COMP_MFCC_VAD + bool "MFCC Voice Activity Detection" + depends on COMP_MFCC + default n + help + This option enables a Voice Activity Detector (VAD) that operates + on the Mel spectrum values produced by the MFCC component. The VAD + flag is inserted into the output stream as the first int32_t value + after the magic header word. The VAD tracks a per-bin noise floor + and detects speech using a weighted energy delta with hangover. diff --git a/src/audio/mfcc/mfcc_common.c b/src/audio/mfcc/mfcc_common.c index 1079864e9259..38025f2adf76 100644 --- a/src/audio/mfcc/mfcc_common.c +++ b/src/audio/mfcc/mfcc_common.c @@ -21,6 +21,10 @@ #include #include +#ifdef CONFIG_COMP_MFCC_VAD +#include +#endif + LOG_MODULE_REGISTER(mfcc_common, CONFIG_SOF_LOG_LEVEL); /* @@ -144,6 +148,10 @@ static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_comp_data * sat_int32(Q_MULTSR_32X32(s, config->mel_scale, 23, 12, 23)); } +#ifdef CONFIG_COMP_MFCC_VAD + /* Run VAD on the mel log spectrum before further processing */ + state->vad_flag = mfcc_vad_update(&cd->vad, state->mel_log_32); +#endif /* Store Q9.7 version in mel_spectra for s16 output mode */ for (j = 0; j < state->dct.num_in; j++) state->mel_spectra->data[j] = @@ -289,6 +297,9 @@ void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer state->out_remain = num_ceps; state->magic_pending = true; +#ifdef CONFIG_COMP_MFCC_VAD + state->vad_pending = true; +#endif } /* Write to sink, limited by period size */ @@ -301,6 +312,15 @@ void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer state->magic_pending = false; } +#ifdef CONFIG_COMP_MFCC_VAD + /* Write VAD flag as first value after magic (as two int16_t = one int32_t) */ + if (state->vad_pending && sink_samples >= 2) { + w_ptr = mfcc_sink_copy_data_s16(sink, w_ptr, 2, (int16_t *)&state->vad_flag); + sink_samples -= 2; + state->vad_pending = false; + } +#endif + /* Write cepstral/mel data from scratch buffer */ 
to_copy = MIN(state->out_remain, sink_samples); if (to_copy > 0) { @@ -392,6 +412,9 @@ void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer state->out_remain = num_ceps; state->magic_pending = true; +#ifdef CONFIG_COMP_MFCC_VAD + state->vad_pending = true; +#endif } /* Write to sink, limited by period size */ @@ -404,6 +427,15 @@ void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer state->magic_pending = false; } +#ifdef CONFIG_COMP_MFCC_VAD + /* Write VAD flag as first value after magic */ + if (state->vad_pending && sink_samples >= 1) { + w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, 1, &state->vad_flag); + sink_samples -= 1; + state->vad_pending = false; + } +#endif + if (state->mel_only) { /* Write 32-bit mel data Q9.15, one value per int32_t */ to_copy = MIN(state->out_remain, sink_samples); @@ -467,6 +499,9 @@ void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer state->out_remain = num_ceps; state->magic_pending = true; +#ifdef CONFIG_COMP_MFCC_VAD + state->vad_pending = true; +#endif } /* Write to sink, limited by period size */ @@ -479,6 +514,15 @@ void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer state->magic_pending = false; } +#ifdef CONFIG_COMP_MFCC_VAD + /* Write VAD flag as first value after magic */ + if (state->vad_pending && sink_samples >= 1) { + w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, 1, &state->vad_flag); + sink_samples -= 1; + state->vad_pending = false; + } +#endif + if (state->mel_only) { /* Write 32-bit mel data Q9.23, one value per int32_t */ to_copy = MIN(state->out_remain, sink_samples); diff --git a/src/audio/mfcc/mfcc_setup.c b/src/audio/mfcc/mfcc_setup.c index 1cad4b2b984e..ad5c7a79deb6 100644 --- a/src/audio/mfcc/mfcc_setup.c +++ b/src/audio/mfcc/mfcc_setup.c @@ -18,6 +18,10 @@ #include #include +#ifdef CONFIG_COMP_MFCC_VAD +#include +#endif + /* Definitions for cepstral lifter */ #define PI_Q23 
Q_CONVERT_FLOAT(3.1415926536, 23) #define TWO_PI_Q23 Q_CONVERT_FLOAT(6.2831853072, 23) @@ -346,10 +350,24 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i state->waiting_fill = true; state->prev_samples_valid = false; state->magic_pending = false; +#ifdef CONFIG_COMP_MFCC_VAD + state->vad_pending = false; + state->vad_flag = 0; +#endif state->out_data_ptr = NULL; state->out_data_ptr_32 = NULL; state->out_remain = 0; +#ifdef CONFIG_COMP_MFCC_VAD + ret = mfcc_vad_init(&cd->vad, config->num_mel_bins, sample_rate); + if (ret < 0) { + comp_err(dev, "Failed VAD init"); + goto free_lifter; + } + + comp_info(dev, "VAD enabled, num_mel_bins = %d", config->num_mel_bins); +#endif + comp_dbg(dev, "done"); return 0; diff --git a/src/audio/mfcc/mfcc_vad.c b/src/audio/mfcc/mfcc_vad.c new file mode 100644 index 000000000000..5da780b4cd9b --- /dev/null +++ b/src/audio/mfcc/mfcc_vad.c @@ -0,0 +1,239 @@ +// SPDX-License-Identifier: BSD-3-Clause +// +// Copyright(c) 2026 Intel Corporation. +// +// Author: Seppo Ingalsuo + +/** + * \file mfcc_vad.c + * \brief Voice Activity Detection based on Mel spectrum energy. + * + * Implements a VAD that tracks per-bin noise floor and computes a + * speech-frequency weighted energy above the floor. Speech is declared + * when the weighted delta exceeds a threshold, with hangover to prevent + * rapid toggling. + */ + +#include + +#ifdef CONFIG_COMP_MFCC_VAD + +#include +#include +#include +#include +#include + +LOG_MODULE_DECLARE(mfcc, CONFIG_SOF_LOG_LEVEL); + +/** + * \brief A-weighting table: 1/3 octave band center frequencies in Hz (Q16.0). 
+ *
+ * From IEC 61672-1:2013, source:
+ * https://acousticalengineer.com/a-weighting-table/
+ */
+#define A_WEIGHT_TABLE_SIZE 36
+
+static const int16_t a_weight_hz[A_WEIGHT_TABLE_SIZE] = {
+	6, 8, 10, 13, 16, 20, 25, 32,
+	40, 50, 63, 80, 100, 125, 160, 200,
+	250, 315, 400, 500, 630, 800, 1000, 1250,
+	1600, 2000, 2500, 3150, 4000, 5000, 6300, 8000,
+	10000, 12500, 16000, 20000,
+};
+
+/**
+ * \brief A-weighting linear amplitude, scaled so peak (at 2500 Hz) maps
+ *        to INT16_MAX (32767). Original dB values converted via
+ *        10^(dB/20) then scaled by 32767 / max.
+ */
+static const int16_t a_weight_lin[A_WEIGHT_TABLE_SIZE] = {
+	2, 4, 9, 19, 43, 85, 162, 299,
+	531, 862, 1382, 2140, 3129, 4370, 6172, 8136,
+	10362, 13196, 16234, 19518, 22669, 25730, 28212, 30230,
+	31655, 32392, 32767, 32392, 31655, 30230, 27889, 24856,
+	21156, 17196, 13045, 9670,
+};
+
+/**
+ * \brief Compute A-weighted speech-frequency emphasis weights for Mel bins.
+ *
+ * Weights are computed by linearly interpolating the A-weighting table
+ * at each Mel bin center frequency. Output weights are in Q1.15 and
+ * sum to approximately 2^15; each weight is saturated to INT16_MAX.
+ *
+ * \param[out] weights     Output weight array.
+ * \param[in]  num_mel     Number of Mel bins.
+ * \param[in]  sample_rate Sample rate in Hz.
+ */
+static void mfcc_vad_build_weights(int16_t *weights, int num_mel, int sample_rate)
+{
+	int16_t mel_end = psy_hz_to_mel((int16_t)(sample_rate / 2));
+	int16_t mel_step = mel_end / (num_mel + 1);
+	int32_t sum = 0;
+	int i;
+
+	for (i = 0; i < num_mel; i++) {
+		int16_t f_hz = psy_mel_to_hz((int16_t)((i + 1) * mel_step));
+		int16_t w;
+		int j;
+
+		/* Find the table interval containing f_hz and interpolate */
+		if (f_hz <= a_weight_hz[0]) {
+			w = a_weight_lin[0];
+		} else if (f_hz >= a_weight_hz[A_WEIGHT_TABLE_SIZE - 1]) {
+			w = a_weight_lin[A_WEIGHT_TABLE_SIZE - 1];
+		} else {
+			/* Find j such that a_weight_hz[j] <= f_hz < a_weight_hz[j+1] */
+			for (j = 0; j < A_WEIGHT_TABLE_SIZE - 2; j++) {
+				if (f_hz < a_weight_hz[j + 1])
+					break;
+			}
+
+			/* Linear interpolation:
+			 * w = w0 + (w1 - w0) * (f - f0) / (f1 - f0)
+			 */
+			int16_t f0 = a_weight_hz[j];
+			int16_t f1 = a_weight_hz[j + 1];
+			int16_t w0 = a_weight_lin[j];
+			int16_t w1 = a_weight_lin[j + 1];
+			int32_t num = (int32_t)(w1 - w0) * (f_hz - f0);
+			int16_t den = f1 - f0;
+
+			w = w0 + (int16_t)(num / den);
+		}
+
+		weights[i] = w;
+		sum += w;
+	}
+
+	/* Normalize weights so they sum to ~2^15. Saturate to INT16_MAX as
+	 * a single bin scales to exactly 2^15 and would wrap to -32768. */
+	if (sum > 0) {
+		for (i = 0; i < num_mel; i++) {
+			int32_t scaled = ((int32_t)weights[i] << 15) / sum;
+
+			weights[i] = scaled > INT16_MAX ? INT16_MAX : (int16_t)scaled;
+		}
+	}
+}
+
+int mfcc_vad_init(struct mfcc_vad_state *vad, int num_mel_bins, int sample_rate)
+{
+	int i;
+
+	if (!vad)
+		return -EINVAL;
+
+	if (num_mel_bins <= 0 || num_mel_bins > MFCC_VAD_MAX_MEL_BINS)
+		return -EINVAL;
+
+	vad->num_mel_bins = num_mel_bins;
+	vad->energy_threshold = MFCC_VAD_ENERGY_THRESHOLD;
+	vad->noise_rise_alpha_slow = MFCC_VAD_NOISE_RISE_ALPHA;
+	vad->noise_rise_alpha_fast = MFCC_VAD_NOISE_RISE_ALPHA_FAST;
+	vad->hangover_max = MFCC_VAD_HANGOVER_FRAMES;
+	vad->hangover_counter = 0;
+	vad->init_frames = MFCC_VAD_NOISE_INIT_FRAMES;
+	vad->frame_count = 0;
+	vad->is_speech = false;
+	vad->initialized = false;
+
+	for (i = 0; i < num_mel_bins; i++)
+		vad->noise_floor[i] = 0;
+
+	mfcc_vad_build_weights(vad->weights, num_mel_bins, sample_rate);
+	return 0;
+}
+
+int mfcc_vad_update(struct mfcc_vad_state *vad, const int32_t *mel_log)
+{
+	int64_t energy_delta = 0;
+	int32_t delta;
+	int16_t alpha;
+	bool raw_speech;
+	int i;
+
+	if (!vad || !mel_log)
+		return 0;
+
+	vad->frame_count++;
+
+	/* Initialize noise floor to first frame */
+	if (!vad->initialized) {
+		for (i = 0; i < vad->num_mel_bins; i++)
+			vad->noise_floor[i] = mel_log[i];
+
+		vad->initialized = true;
+	}
+
+	/* Select rise alpha based on convergence phase */
+	if (vad->frame_count <= vad->init_frames)
+		alpha = vad->noise_rise_alpha_fast;
+	else
+		alpha = vad->noise_rise_alpha_slow;
+
+	/* Update noise floor: follow down instantly, rise slowly */
+	for (i = 0; i < vad->num_mel_bins; i++) {
+		if (mel_log[i] < vad->noise_floor[i]) {
+			/* Instant follow-down */
+			vad->noise_floor[i] = mel_log[i];
+		} else {
+			/* Slow rise: floor += alpha * (mel - floor)
+			 * Q9.23 + Q1.15 * Q9.23 => need Q9.23 result
+			 * alpha is Q1.15, delta is Q9.23
+			 * product is Q10.38, shift right by 15 to get Q9.23
+			 */
+			delta = mel_log[i] - vad->noise_floor[i];
+			vad->noise_floor[i] += (int32_t)(((int64_t)alpha * delta) >> 15);
+		}
+	}
+
+	/* Compute weighted energy delta above noise floor.
+	 * energy_delta = sum(weights[i] * (mel[i] - noise_floor[i]))
+	 * weights are Q1.15, mel delta is Q9.23
+	 * Product is Q10.38, accumulate in int64_t then shift to Q9.23
+	 */
+	for (i = 0; i < vad->num_mel_bins; i++) {
+		delta = mel_log[i] - vad->noise_floor[i];
+		if (delta > 0)
+			energy_delta += (int64_t)vad->weights[i] * delta;
+	}
+
+	/* Shift accumulated energy from Q10.38 to Q9.23: shift right by 15 */
+	energy_delta >>= 15;
+
+	/* VAD decision with hangover; compare in 64 bits so nothing is truncated */
+	raw_speech = energy_delta > vad->energy_threshold;
+
+	if (raw_speech) {
+		vad->hangover_counter = vad->hangover_max;
+		vad->is_speech = true;
+	} else {
+		if (vad->hangover_counter > 0) {
+			vad->hangover_counter--;
+			vad->is_speech = true;
+		} else {
+			vad->is_speech = false;
+		}
+	}
+
+	return vad->is_speech ? 1 : 0;
+}
+
+void mfcc_vad_reset(struct mfcc_vad_state *vad)
+{
+	int i;
+
+	if (!vad)
+		return;
+
+	vad->frame_count = 0;
+	vad->hangover_counter = 0;
+	vad->is_speech = false;
+	vad->initialized = false;
+
+	for (i = 0; i < vad->num_mel_bins; i++)
+		vad->noise_floor[i] = 0;
+}
+
+#endif /* CONFIG_COMP_MFCC_VAD */
diff --git a/src/include/sof/audio/mfcc/mfcc_comp.h b/src/include/sof/audio/mfcc/mfcc_comp.h index 025eef116752..e0617e0f026f 100644 --- a/src/include/sof/audio/mfcc/mfcc_comp.h +++ b/src/include/sof/audio/mfcc/mfcc_comp.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -106,6 +107,10 @@ struct mfcc_state { bool waiting_fill; /**< booleans */ bool prev_samples_valid; bool magic_pending; /**< True when magic word not yet written for current output */ +#ifdef CONFIG_COMP_MFCC_VAD + bool vad_pending; /**< True when VAD flag not yet written for current output */ + int32_t vad_flag; /**< Current VAD result: 1 = speech, 0 = silence */ +#endif size_t sample_buffers_size; /**< bytes */ int16_t *out_data_ptr; /**< Read pointer 
into scratch data for multi-period output */ int32_t *out_data_ptr_32; /**< Read pointer for 32-bit mel-only output */ @@ -115,6 +120,9 @@ struct mfcc_state { /* MFCC component private data */ struct mfcc_comp_data { struct mfcc_state state; +#ifdef CONFIG_COMP_MFCC_VAD + struct mfcc_vad_state vad; +#endif struct comp_data_blob_handler *model_handler; struct sof_mfcc_config *config; int max_frames;
diff --git a/src/include/sof/audio/mfcc/mfcc_vad.h b/src/include/sof/audio/mfcc/mfcc_vad.h
new file mode 100644
index 000000000000..2e2cf5668c84
--- /dev/null
+++ b/src/include/sof/audio/mfcc/mfcc_vad.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright(c) 2026 Intel Corporation.
+ *
+ * Author: Seppo Ingalsuo
+ */
+
+/**
+ * \file mfcc_vad.h
+ * \brief Voice Activity Detection based on Mel spectrum energy.
+ *
+ * This VAD operates on the Q9.23 Mel log spectrum values produced by
+ * the MFCC component. It tracks a per-bin noise floor that follows
+ * the signal downward instantly and rises slowly, then computes a
+ * speech-weighted energy delta above the floor.
+ */
+
+#ifndef __SOF_AUDIO_MFCC_MFCC_VAD_H__
+#define __SOF_AUDIO_MFCC_MFCC_VAD_H__
+
+#include
+#include
+
+#ifdef CONFIG_COMP_MFCC_VAD
+
+/**
+ * \brief Maximum number of Mel bins supported by VAD.
+ */
+#define MFCC_VAD_MAX_MEL_BINS 128
+
+/**
+ * \brief Number of frames for fast noise floor convergence at startup (~1 s at 10 ms/frame).
+ */
+#define MFCC_VAD_NOISE_INIT_FRAMES 100
+
+/**
+ * \brief Slow noise floor rise coefficient in Q1.15 (0.0025 * 32768 = 82).
+ */
+#define MFCC_VAD_NOISE_RISE_ALPHA 82
+
+/**
+ * \brief Fast noise floor rise coefficient in Q1.15 (0.05 * 32768 = 1638).
+ */
+#define MFCC_VAD_NOISE_RISE_ALPHA_FAST 1638
+
+/**
+ * \brief Energy threshold for speech detection in Q9.23 (0.30 * 2^23 = 2516582).
+ */
+#define MFCC_VAD_ENERGY_THRESHOLD 2516582
+
+/**
+ * \brief Number of frames the VAD stays active after the last frame that exceeded the threshold.
+ */
+#define MFCC_VAD_HANGOVER_FRAMES 20
+
+/**
+ * \brief VAD state structure, set up by mfcc_vad_init() and advanced by mfcc_vad_update().
+ */
+struct mfcc_vad_state {
+	int32_t noise_floor[MFCC_VAD_MAX_MEL_BINS]; /**< Per-bin noise floor in Q9.23 */
+	int16_t weights[MFCC_VAD_MAX_MEL_BINS]; /**< Speech-frequency emphasis weights Q1.15 */
+	int32_t energy_threshold; /**< Energy threshold Q9.23 */
+	int16_t noise_rise_alpha_slow; /**< Slow rise alpha Q1.15 */
+	int16_t noise_rise_alpha_fast; /**< Fast rise alpha Q1.15 */
+	int16_t hangover_max; /**< Maximum hangover frames */
+	int16_t hangover_counter; /**< Current hangover counter */
+	int16_t num_mel_bins; /**< Number of Mel bins in use */
+	int16_t init_frames; /**< Length of the fast convergence period in frames */
+	int32_t frame_count; /**< Total frames processed */
+	bool is_speech; /**< Current VAD decision */
+	bool initialized; /**< True after first frame processed */
+};
+
+/**
+ * \brief Initialize VAD state.
+ *
+ * \param[out] vad Pointer to VAD state to initialize.
+ * \param[in] num_mel_bins Number of Mel bins (must be 1..MFCC_VAD_MAX_MEL_BINS).
+ * \param[in] sample_rate Audio sample rate in Hz.
+ * \return 0 on success, -EINVAL if vad is NULL or num_mel_bins is out of range.
+ */
+int mfcc_vad_init(struct mfcc_vad_state *vad, int num_mel_bins, int sample_rate);
+
+/**
+ * \brief Process one Mel spectrum frame and update VAD decision.
+ *
+ * \param[in,out] vad Pointer to VAD state.
+ * \param[in] mel_log Mel log spectrum in Q9.23, array of num_mel_bins values.
+ * \return 1 if speech detected or in hangover, 0 if silence or if an argument is NULL.
+ */
+int mfcc_vad_update(struct mfcc_vad_state *vad, const int32_t *mel_log);
+
+/**
+ * \brief Reset VAD state without changing configuration.
+ *
+ * \param[in,out] vad Pointer to VAD state.
+ */
+void mfcc_vad_reset(struct mfcc_vad_state *vad);
+
+#endif /* CONFIG_COMP_MFCC_VAD */
+
+#endif /* __SOF_AUDIO_MFCC_MFCC_VAD_H__ */