diff --git a/src/arch/host/configs/library_defconfig b/src/arch/host/configs/library_defconfig
index 28c486bec58d..34ea0fe051f3 100644
--- a/src/arch/host/configs/library_defconfig
+++ b/src/arch/host/configs/library_defconfig
@@ -11,6 +11,7 @@ CONFIG_COMP_IIR=y
 CONFIG_COMP_IGO_NR=y
 CONFIG_COMP_LEVEL_MULTIPLIER=y
 CONFIG_COMP_MFCC=y
+CONFIG_COMP_MFCC_VAD=y
 CONFIG_COMP_MODULE_ADAPTER=y
 CONFIG_COMP_MULTIBAND_DRC=y
 CONFIG_COMP_MUX=y
diff --git a/src/audio/mfcc/CMakeLists.txt b/src/audio/mfcc/CMakeLists.txt
index f8af79d1ca8a..433aa824e713 100644
--- a/src/audio/mfcc/CMakeLists.txt
+++ b/src/audio/mfcc/CMakeLists.txt
@@ -5,4 +5,7 @@ if(CONFIG_COMP_MFCC STREQUAL "m" AND DEFINED CONFIG_LLEXT)
 	add_dependencies(app mfcc)
 else()
 	add_local_sources(sof mfcc.c mfcc_setup.c mfcc_common.c mfcc_generic.c mfcc_hifi4.c mfcc_hifi3.c)
+	if(CONFIG_COMP_MFCC_VAD)
+		add_local_sources(sof mfcc_vad.c)
+	endif()
 endif()
diff --git a/src/audio/mfcc/Kconfig b/src/audio/mfcc/Kconfig
index f56cadb40de2..fc74700562c0 100644
--- a/src/audio/mfcc/Kconfig
+++ b/src/audio/mfcc/Kconfig
@@ -24,3 +24,14 @@ config COMP_MFCC
 	  The characteristic of the audio features are defined in the binary
 	  control blob. Directory tools/tune/mfcc contains a tool to create
 	  the configurations.
+
+config COMP_MFCC_VAD
+	bool "MFCC Voice Activity Detection"
+	depends on COMP_MFCC
+	default n
+	help
+	  This option enables a Voice Activity Detector (VAD) that operates
+	  on the Mel spectrum values produced by the MFCC component. The VAD
+	  flag is inserted into the output stream as the first int32_t value
+	  after the magic header word. The VAD tracks a per-bin noise floor
+	  and detects speech using a weighted energy delta with hangover.
diff --git a/src/audio/mfcc/mfcc_common.c b/src/audio/mfcc/mfcc_common.c
index 1079864e9259..38025f2adf76 100644
--- a/src/audio/mfcc/mfcc_common.c
+++ b/src/audio/mfcc/mfcc_common.c
@@ -21,6 +21,10 @@
 #include
 #include
 
+#ifdef CONFIG_COMP_MFCC_VAD
+#include <sof/audio/mfcc/mfcc_vad.h>
+#endif
+
 LOG_MODULE_REGISTER(mfcc_common, CONFIG_SOF_LOG_LEVEL);
 
 /*
@@ -144,6 +148,10 @@ static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_comp_data *
 				sat_int32(Q_MULTSR_32X32(s, config->mel_scale, 23, 12, 23));
 		}
 
+#ifdef CONFIG_COMP_MFCC_VAD
+		/* Run VAD on the mel log spectrum before further processing */
+		state->vad_flag = mfcc_vad_update(&cd->vad, state->mel_log_32);
+#endif
 		/* Store Q9.7 version in mel_spectra for s16 output mode */
 		for (j = 0; j < state->dct.num_in; j++)
 			state->mel_spectra->data[j] =
@@ -289,6 +297,9 @@ void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer
 
 		state->out_remain = num_ceps;
 		state->magic_pending = true;
+#ifdef CONFIG_COMP_MFCC_VAD
+		state->vad_pending = true;
+#endif
 	}
 
 	/* Write to sink, limited by period size */
@@ -301,6 +312,15 @@ void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer
 		state->magic_pending = false;
 	}
 
+#ifdef CONFIG_COMP_MFCC_VAD
+	/* Write VAD flag as first value after magic (as two int16_t = one int32_t) */
+	if (state->vad_pending && sink_samples >= 2) {
+		w_ptr = mfcc_sink_copy_data_s16(sink, w_ptr, 2, (int16_t *)&state->vad_flag);
+		sink_samples -= 2;
+		state->vad_pending = false;
+	}
+#endif
+
 	/* Write cepstral/mel data from scratch buffer */
 	to_copy = MIN(state->out_remain, sink_samples);
 	if (to_copy > 0) {
@@ -392,6 +412,9 @@ void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer
 
 		state->out_remain = num_ceps;
 		state->magic_pending = true;
+#ifdef CONFIG_COMP_MFCC_VAD
+		state->vad_pending = true;
+#endif
 	}
 
 	/* Write to sink, limited by period size */
@@ -404,6 +427,15 @@ void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer
 		state->magic_pending = false;
 	}
 
+#ifdef CONFIG_COMP_MFCC_VAD
+	/* Write VAD flag as first value after magic */
+	if (state->vad_pending && sink_samples >= 1) {
+		w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, 1, &state->vad_flag);
+		sink_samples -= 1;
+		state->vad_pending = false;
+	}
+#endif
+
 	if (state->mel_only) {
 		/* Write 32-bit mel data Q9.15, one value per int32_t */
 		to_copy = MIN(state->out_remain, sink_samples);
@@ -467,6 +499,9 @@ void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer
 
 		state->out_remain = num_ceps;
 		state->magic_pending = true;
+#ifdef CONFIG_COMP_MFCC_VAD
+		state->vad_pending = true;
+#endif
 	}
 
 	/* Write to sink, limited by period size */
@@ -479,6 +514,15 @@ void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer
 		state->magic_pending = false;
 	}
 
+#ifdef CONFIG_COMP_MFCC_VAD
+	/* Write VAD flag as first value after magic */
+	if (state->vad_pending && sink_samples >= 1) {
+		w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, 1, &state->vad_flag);
+		sink_samples -= 1;
+		state->vad_pending = false;
+	}
+#endif
+
 	if (state->mel_only) {
 		/* Write 32-bit mel data Q9.23, one value per int32_t */
 		to_copy = MIN(state->out_remain, sink_samples);
diff --git a/src/audio/mfcc/mfcc_setup.c b/src/audio/mfcc/mfcc_setup.c
index 1cad4b2b984e..ad5c7a79deb6 100644
--- a/src/audio/mfcc/mfcc_setup.c
+++ b/src/audio/mfcc/mfcc_setup.c
@@ -18,6 +18,10 @@
 #include
 #include
 
+#ifdef CONFIG_COMP_MFCC_VAD
+#include <sof/audio/mfcc/mfcc_vad.h>
+#endif
+
 /* Definitions for cepstral lifter */
 #define PI_Q23 Q_CONVERT_FLOAT(3.1415926536, 23)
 #define TWO_PI_Q23 Q_CONVERT_FLOAT(6.2831853072, 23)
@@ -346,10 +350,24 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i
 	state->waiting_fill = true;
 	state->prev_samples_valid = false;
 	state->magic_pending = false;
+#ifdef CONFIG_COMP_MFCC_VAD
+	state->vad_pending = false;
+	state->vad_flag = 0;
+#endif
 	state->out_data_ptr = NULL;
 	state->out_data_ptr_32 = NULL;
 	state->out_remain = 0;
 
+#ifdef CONFIG_COMP_MFCC_VAD
+	ret = mfcc_vad_init(&cd->vad, config->num_mel_bins, sample_rate);
+	if (ret < 0) {
+		comp_err(dev, "Failed VAD init");
+		goto free_lifter;
+	}
+
+	comp_info(dev, "VAD enabled, num_mel_bins = %d", config->num_mel_bins);
+#endif
+
 	comp_dbg(dev, "done");
 	return 0;
 
diff --git a/src/audio/mfcc/mfcc_vad.c b/src/audio/mfcc/mfcc_vad.c
new file mode 100644
index 000000000000..5da780b4cd9b
--- /dev/null
+++ b/src/audio/mfcc/mfcc_vad.c
@@ -0,0 +1,263 @@
+// SPDX-License-Identifier: BSD-3-Clause
+//
+// Copyright(c) 2026 Intel Corporation.
+//
+// Author: Seppo Ingalsuo
+
+/**
+ * \file mfcc_vad.c
+ * \brief Voice Activity Detection based on Mel spectrum energy.
+ *
+ * Implements a VAD that tracks per-bin noise floor and computes a
+ * speech-frequency weighted energy above the floor. Speech is declared
+ * when the weighted delta exceeds a threshold, with hangover to prevent
+ * rapid toggling.
+ */
+
+#include <sof/audio/mfcc/mfcc_vad.h>
+
+#ifdef CONFIG_COMP_MFCC_VAD
+
+#include <sof/math/auditory.h>
+#include <sof/trace/trace.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+LOG_MODULE_DECLARE(mfcc, CONFIG_SOF_LOG_LEVEL);
+
+/**
+ * \brief A-weighting table: 1/3 octave band center frequencies in Hz (Q16.0).
+ *
+ * From IEC 61672-1:2013, source:
+ * https://acousticalengineer.com/a-weighting-table/
+ */
+#define A_WEIGHT_TABLE_SIZE 36
+
+static const int16_t a_weight_hz[A_WEIGHT_TABLE_SIZE] = {
+	6, 8, 10, 13, 16, 20, 25, 32,
+	40, 50, 63, 80, 100, 125, 160, 200,
+	250, 315, 400, 500, 630, 800, 1000, 1250,
+	1600, 2000, 2500, 3150, 4000, 5000, 6300, 8000,
+	10000, 12500, 16000, 20000,
+};
+
+/**
+ * \brief A-weighting linear amplitude, scaled so peak (at 2500 Hz) maps
+ * to INT16_MAX (32767). Original dB values converted via
+ * 10^(dB/20) then scaled by 32767 / max.
+ */
+static const int16_t a_weight_lin[A_WEIGHT_TABLE_SIZE] = {
+	2, 4, 9, 19, 43, 85, 162, 299,
+	531, 862, 1382, 2140, 3129, 4370, 6172, 8136,
+	10362, 13196, 16234, 19518, 22669, 25730, 28212, 30230,
+	31655, 32392, 32767, 32392, 31655, 30230, 27889, 24856,
+	21156, 17196, 13045, 9670,
+};
+
+/**
+ * \brief Compute A-weighted speech-frequency emphasis weights for Mel bins.
+ *
+ * Weights are computed by linearly interpolating the A-weighting table
+ * at each Mel bin center frequency. Output weights are in Q1.15 and
+ * sum to approximately 2^15.
+ *
+ * Bin centers are taken as evenly spaced on the Mel scale between 0 Hz and
+ * the Nyquist frequency. The Nyquist frequency is clamped to INT16_MAX Hz
+ * because it is passed to psy_hz_to_mel() as int16_t.
+ *
+ * \param[out] weights Output weight array.
+ * \param[in] num_mel Number of Mel bins.
+ * \param[in] sample_rate Sample rate in Hz.
+ */
+static void mfcc_vad_build_weights(int16_t *weights, int num_mel, int sample_rate)
+{
+	int nyquist = sample_rate / 2;
+	int16_t mel_end;
+	int16_t mel_step;
+	int32_t sum = 0;
+	int i;
+
+	/* Rates above 65534 Hz would wrap negative in the int16_t cast */
+	if (nyquist > INT16_MAX)
+		nyquist = INT16_MAX;
+
+	mel_end = psy_hz_to_mel((int16_t)nyquist);
+	mel_step = mel_end / (num_mel + 1);
+
+	for (i = 0; i < num_mel; i++) {
+		int16_t f_hz = psy_mel_to_hz((int16_t)((i + 1) * mel_step));
+		int16_t w;
+		int j;
+
+		/* Find the table interval containing f_hz and interpolate */
+		if (f_hz <= a_weight_hz[0]) {
+			w = a_weight_lin[0];
+		} else if (f_hz >= a_weight_hz[A_WEIGHT_TABLE_SIZE - 1]) {
+			w = a_weight_lin[A_WEIGHT_TABLE_SIZE - 1];
+		} else {
+			/* Find j such that a_weight_hz[j] <= f_hz < a_weight_hz[j+1] */
+			for (j = 0; j < A_WEIGHT_TABLE_SIZE - 2; j++) {
+				if (f_hz < a_weight_hz[j + 1])
+					break;
+			}
+
+			/* Linear interpolation:
+			 * w = w0 + (w1 - w0) * (f - f0) / (f1 - f0)
+			 */
+			int16_t f0 = a_weight_hz[j];
+			int16_t f1 = a_weight_hz[j + 1];
+			int16_t w0 = a_weight_lin[j];
+			int16_t w1 = a_weight_lin[j + 1];
+			int32_t num = (int32_t)(w1 - w0) * (f_hz - f0);
+			int16_t den = f1 - f0;
+
+			w = w0 + (int16_t)(num / den);
+		}
+
+		weights[i] = w;
+		sum += w;
+	}
+
+	/* Normalize weights so they sum to ~2^15. */
+	if (sum > 0) {
+		for (i = 0; i < num_mel; i++) {
+			int32_t scaled = ((int32_t)weights[i] << 15) / sum;
+
+			/* A lone dominant bin (e.g. num_mel == 1) scales to
+			 * 32768, which does not fit Q1.15: saturate.
+			 */
+			if (scaled > INT16_MAX)
+				scaled = INT16_MAX;
+
+			weights[i] = (int16_t)scaled;
+		}
+	}
+}
+
+int mfcc_vad_init(struct mfcc_vad_state *vad, int num_mel_bins, int sample_rate)
+{
+	int i;
+
+	if (!vad)
+		return -EINVAL;
+
+	if (num_mel_bins <= 0 || num_mel_bins > MFCC_VAD_MAX_MEL_BINS)
+		return -EINVAL;
+
+	vad->num_mel_bins = num_mel_bins;
+	vad->energy_threshold = MFCC_VAD_ENERGY_THRESHOLD;
+	vad->noise_rise_alpha_slow = MFCC_VAD_NOISE_RISE_ALPHA;
+	vad->noise_rise_alpha_fast = MFCC_VAD_NOISE_RISE_ALPHA_FAST;
+	vad->hangover_max = MFCC_VAD_HANGOVER_FRAMES;
+	vad->hangover_counter = 0;
+	vad->init_frames = MFCC_VAD_NOISE_INIT_FRAMES;
+	vad->frame_count = 0;
+	vad->is_speech = false;
+	vad->initialized = false;
+
+	for (i = 0; i < num_mel_bins; i++)
+		vad->noise_floor[i] = 0;
+
+	mfcc_vad_build_weights(vad->weights, num_mel_bins, sample_rate);
+	return 0;
+}
+
+int mfcc_vad_update(struct mfcc_vad_state *vad, const int32_t *mel_log)
+{
+	int64_t energy_delta = 0;
+	int64_t delta;
+	int16_t alpha;
+	bool raw_speech;
+	int i;
+
+	if (!vad || !mel_log)
+		return 0;
+
+	/* Saturate the counter: after a wrap it would compare below
+	 * init_frames again and re-enter the fast convergence phase.
+	 */
+	if (vad->frame_count < INT32_MAX)
+		vad->frame_count++;
+
+	/* Initialize noise floor to first frame */
+	if (!vad->initialized) {
+		for (i = 0; i < vad->num_mel_bins; i++)
+			vad->noise_floor[i] = mel_log[i];
+
+		vad->initialized = true;
+	}
+
+	/* Select rise alpha based on convergence phase */
+	if (vad->frame_count <= vad->init_frames)
+		alpha = vad->noise_rise_alpha_fast;
+	else
+		alpha = vad->noise_rise_alpha_slow;
+
+	/* Update noise floor: follow down instantly, rise slowly */
+	for (i = 0; i < vad->num_mel_bins; i++) {
+		if (mel_log[i] < vad->noise_floor[i]) {
+			/* Instant follow-down */
+			vad->noise_floor[i] = mel_log[i];
+		} else {
+			/* Slow rise: floor += alpha * (mel - floor)
+			 * Q9.23 + Q1.15 * Q9.23 => need Q9.23 result
+			 * alpha is Q1.15, delta is Q9.23
+			 * product is Q10.38, shift right by 15 to get Q9.23
+			 * The difference and sum are formed in 64 bits: two
+			 * int32_t values can be more than INT32_MAX apart.
+			 */
+			delta = (int64_t)mel_log[i] - vad->noise_floor[i];
+			vad->noise_floor[i] = (int32_t)(vad->noise_floor[i] + ((alpha * delta) >> 15));
+		}
+	}
+
+	/* Compute weighted energy delta above noise floor.
+	 * energy_delta = sum(weights[i] * (mel[i] - noise_floor[i]))
+	 * weights are Q1.15, mel delta is Q9.23
+	 * Product is Q10.38, accumulate in int64_t then shift to Q9.23
+	 */
+	for (i = 0; i < vad->num_mel_bins; i++) {
+		delta = (int64_t)mel_log[i] - vad->noise_floor[i];
+		if (delta > 0)
+			energy_delta += vad->weights[i] * delta;
+	}
+
+	/* Shift accumulated energy from Q10.38 to Q9.23: shift right by 15 */
+	energy_delta >>= 15;
+
+	/* VAD decision with hangover, compared in 64 bits (no truncation) */
+	raw_speech = energy_delta > vad->energy_threshold;
+
+	if (raw_speech) {
+		vad->hangover_counter = vad->hangover_max;
+		vad->is_speech = true;
+	} else {
+		if (vad->hangover_counter > 0) {
+			vad->hangover_counter--;
+			vad->is_speech = true;
+		} else {
+			vad->is_speech = false;
+		}
+	}
+
+	return vad->is_speech ? 1 : 0;
+}
+
+void mfcc_vad_reset(struct mfcc_vad_state *vad)
+{
+	int i;
+
+	if (!vad)
+		return;
+
+	vad->frame_count = 0;
+	vad->hangover_counter = 0;
+	vad->is_speech = false;
+	vad->initialized = false;
+
+	for (i = 0; i < vad->num_mel_bins; i++)
+		vad->noise_floor[i] = 0;
+}
+
+#endif /* CONFIG_COMP_MFCC_VAD */
diff --git a/src/include/sof/audio/mfcc/mfcc_comp.h b/src/include/sof/audio/mfcc/mfcc_comp.h
index 025eef116752..e0617e0f026f 100644
--- a/src/include/sof/audio/mfcc/mfcc_comp.h
+++ b/src/include/sof/audio/mfcc/mfcc_comp.h
@@ -12,6 +12,7 @@
 #include
 #include
 #include
+#include <sof/audio/mfcc/mfcc_vad.h>
 #include
 #include
 
@@ -106,6 +107,10 @@ struct mfcc_state {
 	bool waiting_fill; /**< booleans */
 	bool prev_samples_valid;
 	bool magic_pending; /**< True when magic word not yet written for current output */
+#ifdef CONFIG_COMP_MFCC_VAD
+	bool vad_pending; /**< True when VAD flag not yet written for current output */
+	int32_t vad_flag; /**< Current VAD result: 1 = speech, 0 = silence */
+#endif
 	size_t sample_buffers_size; /**< bytes */
 	int16_t *out_data_ptr; /**< Read pointer into scratch data for multi-period output */
 	int32_t *out_data_ptr_32; /**< Read pointer for 32-bit mel-only output */
@@ -115,6 +120,9 @@ struct mfcc_state {
 /* MFCC component private data */
 struct mfcc_comp_data {
 	struct mfcc_state state;
+#ifdef CONFIG_COMP_MFCC_VAD
+	struct mfcc_vad_state vad;
+#endif
 	struct comp_data_blob_handler *model_handler;
 	struct sof_mfcc_config *config;
 	int max_frames;
diff --git a/src/include/sof/audio/mfcc/mfcc_vad.h b/src/include/sof/audio/mfcc/mfcc_vad.h
new file mode 100644
index 000000000000..2e2cf5668c84
--- /dev/null
+++ b/src/include/sof/audio/mfcc/mfcc_vad.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright(c) 2026 Intel Corporation.
+ *
+ * Author: Seppo Ingalsuo
+ */
+
+/**
+ * \file mfcc_vad.h
+ * \brief Voice Activity Detection based on Mel spectrum energy.
+ *
+ * This VAD operates on the Q9.23 Mel log spectrum values produced by
+ * the MFCC component. It tracks a per-bin noise floor that follows
+ * the signal downward instantly and rises slowly, then computes a
+ * speech-weighted energy delta above the floor.
+ */
+
+#ifndef __SOF_AUDIO_MFCC_MFCC_VAD_H__
+#define __SOF_AUDIO_MFCC_MFCC_VAD_H__
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#ifdef CONFIG_COMP_MFCC_VAD
+
+/**
+ * \brief Maximum number of Mel bins supported by VAD.
+ */
+#define MFCC_VAD_MAX_MEL_BINS 128
+
+/**
+ * \brief Number of frames for fast noise floor convergence at startup (~1 s at 10 ms/frame).
+ */
+#define MFCC_VAD_NOISE_INIT_FRAMES 100
+
+/**
+ * \brief Slow noise floor rise coefficient in Q1.15 (0.0025 * 32768 = 82).
+ */
+#define MFCC_VAD_NOISE_RISE_ALPHA 82
+
+/**
+ * \brief Fast noise floor rise coefficient in Q1.15 (0.05 * 32768 = 1638).
+ */
+#define MFCC_VAD_NOISE_RISE_ALPHA_FAST 1638
+
+/**
+ * \brief Energy threshold for speech detection in Q9.23 (0.30 * 2^23 = 2516582).
+ */
+#define MFCC_VAD_ENERGY_THRESHOLD 2516582
+
+/**
+ * \brief Hangover frame count to keep VAD active after last speech detection.
+ */
+#define MFCC_VAD_HANGOVER_FRAMES 20
+
+/**
+ * \brief VAD state structure.
+ */
+struct mfcc_vad_state {
+	int32_t noise_floor[MFCC_VAD_MAX_MEL_BINS]; /**< Per-bin noise floor in Q9.23 */
+	int16_t weights[MFCC_VAD_MAX_MEL_BINS]; /**< Speech-frequency emphasis weights Q1.15 */
+	int32_t energy_threshold; /**< Energy threshold Q9.23 */
+	int16_t noise_rise_alpha_slow; /**< Slow rise alpha Q1.15 */
+	int16_t noise_rise_alpha_fast; /**< Fast rise alpha Q1.15 */
+	int16_t hangover_max; /**< Maximum hangover frames */
+	int16_t hangover_counter; /**< Current hangover counter */
+	int16_t num_mel_bins; /**< Number of Mel bins in use */
+	int16_t init_frames; /**< Number of initial frames that use the fast rise alpha */
+	int32_t frame_count; /**< Frames processed since init/reset, saturates at INT32_MAX */
+	bool is_speech; /**< Current VAD decision */
+	bool initialized; /**< True after first frame processed */
+};
+
+/**
+ * \brief Initialize VAD state.
+ *
+ * \param[out] vad Pointer to VAD state to initialize.
+ * \param[in] num_mel_bins Number of Mel bins, in range 1..MFCC_VAD_MAX_MEL_BINS.
+ * \param[in] sample_rate Audio sample rate in Hz.
+ * \return 0 on success, -EINVAL if vad is NULL or num_mel_bins is out of range.
+ */
+int mfcc_vad_init(struct mfcc_vad_state *vad, int num_mel_bins, int sample_rate);
+
+/**
+ * \brief Process one Mel spectrum frame and update VAD decision.
+ *
+ * \param[in,out] vad Pointer to VAD state.
+ * \param[in] mel_log Mel log spectrum in Q9.23, array of num_mel_bins values.
+ * \return 1 if speech detected or hangover is active, 0 if silence or a pointer is NULL.
+ */
+int mfcc_vad_update(struct mfcc_vad_state *vad, const int32_t *mel_log);
+
+/**
+ * \brief Reset VAD state without changing configuration.
+ *
+ * \param[in,out] vad Pointer to VAD state.
+ */
+void mfcc_vad_reset(struct mfcc_vad_state *vad);
+
+#endif /* CONFIG_COMP_MFCC_VAD */
+
+#endif /* __SOF_AUDIO_MFCC_MFCC_VAD_H__ */