diff --git a/scripts/rebuild-testbench.sh b/scripts/rebuild-testbench.sh
index fee09fd243ae..996d16f45a8c 100755
--- a/scripts/rebuild-testbench.sh
+++ b/scripts/rebuild-testbench.sh
@@ -97,7 +97,7 @@ export_xtensa_setup()
     cat <<EOFSETUP > "$export_script"
 export XTENSA_TOOLS_ROOT=$XTENSA_TOOLS_ROOT
 export XTENSA_CORE=$XTENSA_CORE
-XTENSA_PATH=$tools_bin
+export XTENSA_PATH=$tools_bin
 EOFSETUP
 }
 
diff --git a/src/audio/mfcc/Kconfig b/src/audio/mfcc/Kconfig
index 678331896b5f..6bbf8fc486b9 100644
--- a/src/audio/mfcc/Kconfig
+++ b/src/audio/mfcc/Kconfig
@@ -4,7 +4,7 @@ config COMP_MFCC
 	tristate "MFCC component"
 	depends on COMP_MODULE_ADAPTER
 	select CORDIC_FIXED
-	select MATH_16BIT_MEL_FILTERBANK
+	select MATH_32BIT_MEL_FILTERBANK
 	select MATH_AUDITORY
 	select MATH_DCT
 	select MATH_DECIBELS
diff --git a/src/audio/mfcc/mfcc.c b/src/audio/mfcc/mfcc.c
index 9874edea4be5..656e3d9b7bf7 100644
--- a/src/audio/mfcc/mfcc.c
+++ b/src/audio/mfcc/mfcc.c
@@ -38,13 +38,13 @@ SOF_DEFINE_REG_UUID(mfcc);
 
 __cold_rodata const struct mfcc_func_map mfcc_fm[] = {
 #if CONFIG_FORMAT_S16LE
-	{SOF_IPC_FRAME_S16_LE,  mfcc_s16_default},
+	{SOF_IPC_FRAME_S16_LE, mfcc_s16_default},
 #endif /* CONFIG_FORMAT_S16LE */
 #if CONFIG_FORMAT_S24LE
-	{SOF_IPC_FRAME_S24_4LE, NULL},
+	{SOF_IPC_FRAME_S24_4LE, mfcc_s24_default},
 #endif /* CONFIG_FORMAT_S24LE */
 #if CONFIG_FORMAT_S32LE
-	{SOF_IPC_FRAME_S32_LE,  NULL},
+	{SOF_IPC_FRAME_S32_LE, mfcc_s32_default},
 #endif /* CONFIG_FORMAT_S32LE */
 };
 
diff --git a/src/audio/mfcc/mfcc_common.c b/src/audio/mfcc/mfcc_common.c
index 688c7afac9b2..76eb7dd04a74 100644
--- a/src/audio/mfcc/mfcc_common.c
+++ b/src/audio/mfcc/mfcc_common.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: BSD-3-Clause
 //
-// Copyright(c) 2023 Intel Corporation. All rights reserved.
+// Copyright(c) 2023-2026 Intel Corporation.
 //
 // Author: Andrula Song <andrula.song@intel.com>
 
@@ -8,6 +8,7 @@
 
 #include <sof/audio/component.h>
 #include <sof/audio/audio_stream.h>
+#include <sof/audio/format.h>
 #include <sof/math/auditory.h>
 #include <sof/math/matrix.h>
 #include <sof/math/sqrt.h>
@@ -36,15 +37,21 @@ LOG_MODULE_REGISTER(mfcc_common, CONFIG_SOF_LOG_LEVEL);
  * The main processing function for MFCC
  */
 
-static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_state *state)
+static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_comp_data *cd)
 {
+	struct sof_mfcc_config *config = cd->config;
+	struct mfcc_state *state = &cd->state;
 	struct mfcc_buffer *buf = &state->buf;
 	struct mfcc_fft *fft = &state->fft;
 	int mel_scale_shift;
 	int input_shift;
-	int i;
+	int i, j;
 	int m;
 	int cc_count = 0;
+	int64_t s;
+	int32_t mel_value;
+	int32_t peak;
+	int32_t clamp_value;
 
 	/* Phase 1, wait until whole fft_size is filled with valid data. This way
 	 * first output cepstral coefficients originate from streamed data and not
@@ -103,8 +110,8 @@ static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_state *stat
 		fft_execute_32(fft->fft_plan, false);
 #endif
 
-		/* Convert powerspectrum to Mel band logarithmic spectrum */
-		mat_init_16b(state->mel_spectra, 1, state->dct.num_in, 7); /* Q8.7 */
+		/* Convert power spectrum to Mel band logarithmic spectrum Q9.23 */
+		mat_init_16b(state->mel_spectra, 1, state->dct.num_in, 7); /* Q9.7 */
 
 		/* Compensate FFT lib scaling to Mel log values, e.g. for 512 long FFT
 		 * the fft_plan->len is 9. The scaling is 1/512. Subtract from input_shift it
@@ -114,21 +121,79 @@ static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_state *stat
 #if MFCC_FFT_BITS == 16
 		psy_apply_mel_filterbank_16(&state->melfb, fft->fft_out, state->power_spectra,
 					    state->mel_spectra->data, mel_scale_shift);
+		/* Convert Q9.7 int16_t mel log to Q9.23 int32_t for downstream processing */
+		for (j = 0; j < state->dct.num_in; j++)
+			state->mel_log_32[j] = (int32_t)state->mel_spectra->data[j] << 16;
 #else
 		psy_apply_mel_filterbank_32(&state->melfb, fft->fft_out, state->power_spectra,
-					    state->mel_spectra->data, mel_scale_shift);
+					    state->mel_log_32, mel_scale_shift);
 #endif
 
-		/* Multiply Mel spectra with DCT matrix to get cepstral coefficients */
-		mat_init_16b(state->cepstral_coef, 1, state->dct.num_out, 7); /* Q8.7 */
-		mat_multiply(state->mel_spectra, state->dct.matrix, state->cepstral_coef);
+		if (state->mel_only) {
+			/* In Mel-only mode output Mel log spectra directly */
+			cc_count += state->dct.num_in;
+
+			/* Find peak mel value and track state->mmax in Q9.23 */
+			if (config->dynamic_mmax) {
+				peak = state->mel_log_32[0];
+				for (j = 1; j < state->dct.num_in; j++) {
+					if (state->mel_log_32[j] > peak)
+						peak = state->mel_log_32[j];
+				}
+
+				/* Jump to peak immediately if higher, decay otherwise */
+				if (peak > state->mmax) {
+					state->mmax = peak;
+				} else {
+					/* Q9.23 * Q1.15, result Q9.23. The coefficient is small
+					 * so no need for saturation.
+					 */
+					s = (int64_t)peak - state->mmax;
+					state->mmax +=
+						Q_MULTSR_32X32(s, config->mmax_coef, 23, 15, 23);
+				}
+			}
+
+			/* Clamp Mel values lower than mmax - top_db, add offset, and scale.
+			 * Config top_db and mel_offset are Q9.7, shift to Q9.23.
+			 */
+			clamp_value = state->mmax - ((int32_t)config->top_db << 16);
+			for (j = 0; j < state->dct.num_in; j++) {
+				mel_value = state->mel_log_32[j];
+				if (mel_value < clamp_value)
+					mel_value = clamp_value;
+
+				/* Q9.23 * Q4.12, result Q9.23 */
+				s = (int64_t)mel_value + ((int32_t)config->mel_offset << 16);
+				state->mel_log_32[j] =
+					sat_int32(Q_MULTSR_32X32(s, config->mel_scale, 23, 12, 23));
+			}
+
+			/* Store Q9.7 version in mel_spectra for s16 output mode */
+			for (j = 0; j < state->dct.num_in; j++)
+				state->mel_spectra->data[j] =
+					sat_int16(state->mel_log_32[j] >> 16);
 
-		/* Apply cepstral lifter */
-		if (state->lifter.cepstral_lifter != 0)
-			mat_multiply_elementwise(state->cepstral_coef, state->lifter.matrix,
-						 state->cepstral_coef);
+			/* Debug trace for checking mmax decay */
+			comp_dbg(dev, "state->mmax = %d", state->mmax);
+		} else {
+			/* Convert Q9.23 to Q9.7 for 16-bit DCT */
+			for (j = 0; j < state->dct.num_in; j++)
+				state->mel_spectra->data[j] =
+					sat_int16(state->mel_log_32[j] >> 16);
 
-		cc_count += state->dct.num_out;
+			/* Multiply Mel spectra with DCT matrix to get cepstral coefficients */
+			mat_init_16b(state->cepstral_coef, 1, state->dct.num_out, 7); /* Q9.7 */
+			mat_multiply(state->mel_spectra, state->dct.matrix, state->cepstral_coef);
+
+			/* Apply cepstral lifter */
+			if (state->lifter.cepstral_lifter != 0) {
+				mat_multiply_elementwise(state->cepstral_coef, state->lifter.matrix,
+							 state->cepstral_coef);
+			}
+
+			cc_count += state->dct.num_out;
+		}
 
 		/* Output to sink buffer */
 	}
@@ -139,7 +204,101 @@ static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_state *stat
 	return cc_count;
 }
 
+void mfcc_fill_fft_buffer(struct mfcc_state *state)
+{
+	struct mfcc_buffer *buf = &state->buf;
+	struct mfcc_fft *fft = &state->fft;
+#if MFCC_FFT_BITS == 16
+	int16_t *d = &fft->fft_buf[fft->fft_fill_start_idx].real;
+	const int fft_elem_inc = sizeof(fft->fft_buf[0]) / sizeof(int16_t);
+#else
+	int32_t *d = &fft->fft_buf[fft->fft_fill_start_idx].real;
+	const int fft_elem_inc = sizeof(fft->fft_buf[0]) / sizeof(int32_t);
+#endif
+	int16_t *prev = state->prev_data;
+	int16_t *prev_end = prev + state->prev_data_size;
+	int16_t *r = buf->r_ptr;
+	int copied;
+	int nmax;
+	int n;
+	int j;
+
+	/* Copy overlapped samples from prev_data to the real parts of fft_buf. Only the
+	 * real parts are written; the caller is expected to have cleared fft_buf.
+	 */
+	while (prev < prev_end) {
+		*d = *prev++;
+		d += fft_elem_inc;
+	}
+
+	/* Copy fft_hop_size new samples from circular buffer in runs that do not wrap */
+	for (copied = 0; copied < fft->fft_hop_size; copied += n) {
+		nmax = fft->fft_hop_size - copied;
+		n = mfcc_buffer_samples_without_wrap(buf, r);
+		n = MIN(n, nmax);
+		for (j = 0; j < n; j++) {
+			*d = *r++;
+			d += fft_elem_inc;
+		}
+		r = mfcc_buffer_wrap(buf, r);
+	}
+
+	buf->s_avail -= copied; /* samples consumed from circular buffer */
+	buf->s_free += copied;
+	buf->r_ptr = r;
+
+	/* Save samples starting at fft_hop_size offset as overlap for the next call */
+#if MFCC_FFT_BITS == 16
+	d = (int16_t *)&fft->fft_buf[fft->fft_fill_start_idx + fft->fft_hop_size].real;
+#else
+	d = (int32_t *)&fft->fft_buf[fft->fft_fill_start_idx + fft->fft_hop_size].real;
+#endif
+	prev = state->prev_data;
+	while (prev < prev_end) {
+		*prev++ = *d; /* values here were written from int16_t above */
+		d += fft_elem_inc;
+	}
+}
+
 #if CONFIG_FORMAT_S16LE
+static int16_t *mfcc_sink_copy_zero_s16(const struct audio_stream *sink, int16_t *w_ptr,
+					int samples)
+{
+	int copied; /* samples zeroed so far */
+	int nmax;
+	int n;
+
+	for (copied = 0; copied < samples; copied += n) {
+		nmax = samples - copied;
+		n = audio_stream_samples_without_wrap_s16(sink, w_ptr);
+		n = MIN(n, nmax); /* run length, capped to the samples still to write */
+		memset(w_ptr, 0, n * sizeof(int16_t));
+		w_ptr = audio_stream_wrap(sink, w_ptr + n); /* step past the zeroed run */
+	}
+
+	return w_ptr; /* updated write position for the caller's next write */
+}
+
+static int16_t *mfcc_sink_copy_data_s16(const struct audio_stream *sink, int16_t *w_ptr,
+					int samples, int16_t *r_ptr)
+{
+	int copied; /* samples copied so far */
+	int nmax;
+	int n;
+
+	for (copied = 0; copied < samples; copied += n) {
+		nmax = samples - copied;
+		n = audio_stream_samples_without_wrap_s16(sink, w_ptr);
+		n = MIN(n, nmax); /* run length, capped to the samples still to write */
+		/* Plain memcpy() is used instead of memcpy_s() for speed */
+		memcpy(w_ptr, r_ptr, n * sizeof(int16_t));
+		w_ptr = audio_stream_wrap(sink, w_ptr + n);
+		r_ptr += n; /* r_ptr advances linearly, only w_ptr goes through wrap */
+	}
+
+	return w_ptr; /* updated write position for the caller's next write */
+}
+
 void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer *bsource,
 		      struct output_stream_buffer *bsink, int frames)
 {
@@ -149,35 +308,243 @@ void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer
 	struct mfcc_buffer *buf = &cd->state.buf;
 	uint32_t magic = MFCC_MAGIC;
 	int16_t *w_ptr = audio_stream_get_wptr(sink);
-	// int num_magic = sizeof(magic) / sizeof(int16_t);
 	const int num_magic = 2;
 	int num_ceps;
-	int zero_samples;
+	int sink_samples;
+	int to_copy;
 
 	/* Get samples from source buffer */
 	mfcc_source_copy_s16(bsource, buf, &state->emph, frames, state->source_channel);
 
-	/* Run STFT and processing after FFT: Mel auditory filter and DCT. The sink
-	 * buffer is updated during STDF processing.
-	 */
-	num_ceps = mfcc_stft_process(mod->dev, state);
+	/* Run STFT and processing after FFT: Mel auditory filter and DCT. */
+	num_ceps = mfcc_stft_process(mod->dev, cd);
 
-	/* Done, copy data to sink. This works only if the period has room for magic (2)
-	 * plus num_ceps int16_t samples. TODO: split ceps over multiple periods.
-	 */
-	zero_samples = frames * audio_stream_get_channels(sink);
+	/* If new output produced, set up pointer into scratch data and mark magic pending */
 	if (num_ceps > 0) {
-		zero_samples -= num_ceps + num_magic;
+		if (state->mel_only)
+			state->out_data_ptr = state->mel_spectra->data;
+		else
+			state->out_data_ptr = state->cepstral_coef->data;
+
+		state->out_remain = num_ceps;
+		state->magic_pending = true;
+	}
+
+	/* Write to sink, limited by period size */
+	sink_samples = frames * audio_stream_get_channels(sink);
+
+	/* Write magic word first if pending */
+	if (state->magic_pending && sink_samples >= num_magic) {
 		w_ptr = mfcc_sink_copy_data_s16(sink, w_ptr, num_magic, (int16_t *)&magic);
-		w_ptr = mfcc_sink_copy_data_s16(sink, w_ptr, num_ceps, state->cepstral_coef->data);
+		sink_samples -= num_magic;
+		state->magic_pending = false;
+	}
+
+	/* Write cepstral/mel data from scratch buffer */
+	to_copy = MIN(state->out_remain, sink_samples);
+	if (to_copy > 0) {
+		w_ptr = mfcc_sink_copy_data_s16(sink, w_ptr, to_copy, state->out_data_ptr);
+		state->out_data_ptr += to_copy;
+		state->out_remain -= to_copy;
+		sink_samples -= to_copy;
 	}
 
-	w_ptr = mfcc_sink_copy_zero_s16(sink, w_ptr, zero_samples);
+	/* Zero-fill remaining sink samples */
+	w_ptr = mfcc_sink_copy_zero_s16(sink, w_ptr, sink_samples);
 }
 #endif /* CONFIG_FORMAT_S16LE */
 
+#if CONFIG_FORMAT_S24LE || CONFIG_FORMAT_S32LE
+static int32_t *mfcc_sink_copy_zero_s32(const struct audio_stream *sink, int32_t *w_ptr,
+					int samples)
+{
+	int copied; /* samples zeroed so far */
+	int nmax;
+	int n;
+
+	for (copied = 0; copied < samples; copied += n) {
+		nmax = samples - copied;
+		n = audio_stream_samples_without_wrap_s32(sink, w_ptr);
+		n = MIN(n, nmax); /* run length, capped to the samples still to write */
+		memset(w_ptr, 0, n * sizeof(int32_t));
+		w_ptr = audio_stream_wrap(sink, w_ptr + n); /* step past the zeroed run */
+	}
+
+	return w_ptr; /* updated write position for the caller's next write */
+}
+
+static int32_t *mfcc_sink_copy_data_s32(const struct audio_stream *sink, int32_t *w_ptr,
+					int samples, int32_t *r_ptr)
+{
+	int copied; /* samples copied so far */
+	int nmax;
+	int n;
+
+	for (copied = 0; copied < samples; copied += n) {
+		nmax = samples - copied;
+		n = audio_stream_samples_without_wrap_s32(sink, w_ptr);
+		n = MIN(n, nmax); /* run length, capped to the samples still to write */
+		/* Plain memcpy() is used instead of memcpy_s() for speed */
+		memcpy(w_ptr, r_ptr, n * sizeof(int32_t));
+		w_ptr = audio_stream_wrap(sink, w_ptr + n);
+		r_ptr += n; /* r_ptr advances linearly, only w_ptr goes through wrap */
+	}
+
+	return w_ptr; /* updated write position for the caller's next write */
+}
+#endif /* CONFIG_FORMAT_S24LE || CONFIG_FORMAT_S32LE */
+
 #if CONFIG_FORMAT_S24LE
+void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer *bsource,
+		      struct output_stream_buffer *bsink, int frames)
+{
+	struct audio_stream *sink = bsink->data;
+	struct mfcc_comp_data *cd = module_get_private_data(mod);
+	struct mfcc_state *state = &cd->state;
+	struct mfcc_buffer *buf = &cd->state.buf;
+	uint32_t magic = MFCC_MAGIC;
+	int32_t *w_ptr = audio_stream_get_wptr(sink);
+	const int num_magic = 1; /* one int32_t word for magic */
+	int num_ceps;
+	int sink_samples;
+	int remain_s32;
+	int to_copy;
+	int k;
+
+	/* Get samples of the selected channel from source buffer */
+	mfcc_source_copy_s24(bsource, buf, &state->emph, frames, state->source_channel);
+
+	/* Run STFT and processing after FFT, returns number of new output values */
+	num_ceps = mfcc_stft_process(mod->dev, cd);
+
+	/* If new output produced, set up pointer into scratch data and mark magic pending */
+	if (num_ceps > 0) {
+		if (state->mel_only) {
+			/* Convert mel_log_32 from Q9.23 to Q9.15 in-place */
+			for (k = 0; k < num_ceps; k++)
+				state->mel_log_32[k] >>= 8;
+
+			state->out_data_ptr_32 = state->mel_log_32;
+		} else {
+			state->out_data_ptr = state->cepstral_coef->data;
+		}
+
+		state->out_remain = num_ceps;
+		state->magic_pending = true;
+	}
+
+	/* Write to sink, limited by period size */
+	sink_samples = frames * audio_stream_get_channels(sink);
+
+	/* Write magic word first if pending and the period has room for it */
+	if (state->magic_pending && sink_samples >= num_magic) {
+		w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_magic, (int32_t *)&magic);
+		sink_samples -= num_magic;
+		state->magic_pending = false;
+	}
+
+	if (state->mel_only) {
+		/* Write 32-bit mel data Q9.15, one value per int32_t */
+		to_copy = MIN(state->out_remain, sink_samples);
+		if (to_copy > 0) {
+			w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, to_copy,
+							state->out_data_ptr_32);
+			state->out_data_ptr_32 += to_copy;
+			state->out_remain -= to_copy;
+			sink_samples -= to_copy;
+		}
+	} else {
+		/* Write cepstral data from scratch buffer, two int16_t per int32_t word */
+		remain_s32 = (state->out_remain + 1) / 2; /* words needed, odd count rounds up */
+		to_copy = MIN(remain_s32, sink_samples);
+		if (to_copy > 0) {
+			w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, to_copy,
+							(int32_t *)state->out_data_ptr);
+			state->out_data_ptr += to_copy * 2;
+			state->out_remain -= to_copy * 2;
+			if (state->out_remain < 0) /* NOTE(review): odd tail reads 1 past data? */
+				state->out_remain = 0;
+
+			sink_samples -= to_copy;
+		}
+	}
+
+	/* Zero-fill remaining sink samples */
+	w_ptr = mfcc_sink_copy_zero_s32(sink, w_ptr, sink_samples);
+}
 #endif /* CONFIG_FORMAT_S24LE */
 
 #if CONFIG_FORMAT_S32LE
+void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer *bsource,
+		      struct output_stream_buffer *bsink, int frames)
+{
+	struct audio_stream *sink = bsink->data;
+	struct mfcc_comp_data *cd = module_get_private_data(mod);
+	struct mfcc_state *state = &cd->state;
+	struct mfcc_buffer *buf = &cd->state.buf;
+	uint32_t magic = MFCC_MAGIC;
+	int32_t *w_ptr = audio_stream_get_wptr(sink);
+	const int num_magic = 1; /* one int32_t word for magic */
+	int num_ceps;
+	int sink_samples;
+	int remain_s32;
+	int to_copy;
+
+	/* Get samples of the selected channel from source buffer */
+	mfcc_source_copy_s32(bsource, buf, &state->emph, frames, state->source_channel);
+
+	/* Run STFT and processing after FFT, returns number of new output values */
+	num_ceps = mfcc_stft_process(mod->dev, cd);
+
+	/* If new output produced, set up pointer into scratch data and mark magic pending */
+	if (num_ceps > 0) {
+		if (state->mel_only) {
+			state->out_data_ptr_32 = state->mel_log_32;
+		} else {
+			state->out_data_ptr = state->cepstral_coef->data;
+		}
+
+		state->out_remain = num_ceps;
+		state->magic_pending = true;
+	}
+
+	/* Write to sink, limited by period size */
+	sink_samples = frames * audio_stream_get_channels(sink);
+
+	/* Write magic word first if pending and the period has room for it */
+	if (state->magic_pending && sink_samples >= num_magic) {
+		w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_magic, (int32_t *)&magic);
+		sink_samples -= num_magic;
+		state->magic_pending = false;
+	}
+
+	if (state->mel_only) {
+		/* Write 32-bit mel data Q9.23, one value per int32_t */
+		to_copy = MIN(state->out_remain, sink_samples);
+		if (to_copy > 0) {
+			w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, to_copy,
+							state->out_data_ptr_32);
+			state->out_data_ptr_32 += to_copy;
+			state->out_remain -= to_copy;
+			sink_samples -= to_copy;
+		}
+	} else {
+		/* Write cepstral data from scratch buffer, two int16_t per int32_t word */
+		remain_s32 = (state->out_remain + 1) / 2; /* words needed, odd count rounds up */
+		to_copy = MIN(remain_s32, sink_samples);
+		if (to_copy > 0) {
+			w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, to_copy,
+							(int32_t *)state->out_data_ptr);
+			state->out_data_ptr += to_copy * 2;
+			state->out_remain -= to_copy * 2;
+			if (state->out_remain < 0) /* NOTE(review): odd tail reads 1 past data? */
+				state->out_remain = 0;
+
+			sink_samples -= to_copy;
+		}
+	}
+
+	/* Zero-fill remaining sink samples */
+	w_ptr = mfcc_sink_copy_zero_s32(sink, w_ptr, sink_samples);
+}
 #endif /* CONFIG_FORMAT_S32LE */
diff --git a/src/audio/mfcc/mfcc_generic.c b/src/audio/mfcc/mfcc_generic.c
index ecc95474326b..c6c699a2b724 100644
--- a/src/audio/mfcc/mfcc_generic.c
+++ b/src/audio/mfcc/mfcc_generic.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: BSD-3-Clause
 //
-// Copyright(c) 2022 Intel Corporation. All rights reserved.
+// Copyright(c) 2022-2026 Intel Corporation.
 //
 // Author: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
 
@@ -26,53 +26,6 @@
  * MFCC algorithm code
  */
 
-void mfcc_source_copy_s16(struct input_stream_buffer *bsource, struct mfcc_buffer *buf,
-			  struct mfcc_pre_emph *emph, int frames, int source_channel)
-{
-	struct audio_stream *source = bsource->data;
-	int32_t s;
-	int16_t *x0;
-	int16_t *x = audio_stream_get_rptr(source);
-	int16_t *w = buf->w_ptr;
-	int copied;
-	int nmax;
-	int n1;
-	int n2;
-	int n;
-	int i;
-	int num_channels = audio_stream_get_channels(source);
-
-	/* Copy from source to pre-buffer for FFT.
-	 * The pre-emphasis filter is done in this step.
-	 */
-	for (copied = 0; copied < frames; copied += n) {
-		nmax = frames - copied;
-		n1 = audio_stream_frames_without_wrap(source, x);
-		n2 = mfcc_buffer_samples_without_wrap(buf, w);
-		n = MIN(n1, n2);
-		n = MIN(n, nmax);
-		x0 = x + source_channel;
-		for (i = 0; i < n; i++) {
-			if (emph->enable) {
-				/* Q1.15 x Q1.15 -> Q2.30 */
-				s = (int32_t)emph->delay * emph->coef + Q_SHIFT_LEFT(*x0, 15, 30);
-				*w = sat_int16(Q_SHIFT_RND(s, 30, 15));
-				emph->delay = *x0;
-			} else {
-				*w = *x0;
-			}
-			x0 += num_channels;
-			w++;
-		}
-
-		x = audio_stream_wrap(source, x + n * audio_stream_get_channels(source));
-		w = mfcc_buffer_wrap(buf, w);
-	}
-	buf->s_avail += copied;
-	buf->s_free -= copied;
-	buf->w_ptr = w;
-}
-
 void mfcc_fill_prev_samples(struct mfcc_buffer *buf, int16_t *prev_data,
 			    int prev_data_length)
 {
@@ -98,47 +51,6 @@ void mfcc_fill_prev_samples(struct mfcc_buffer *buf, int16_t *prev_data,
 	buf->r_ptr = r;
 }
 
-void mfcc_fill_fft_buffer(struct mfcc_state *state)
-{
-	struct mfcc_buffer *buf = &state->buf;
-	struct mfcc_fft *fft = &state->fft;
-	int16_t *r = buf->r_ptr;
-	int copied;
-	int nmax;
-	int idx = fft->fft_fill_start_idx;
-	int j;
-	int n;
-
-	/* Copy overlapped samples from state buffer. Imaginary part of input
-	 * remains zero.
-	 */
-	for (j = 0; j < state->prev_data_size; j++)
-		fft->fft_buf[idx + j].real = state->prev_data[j];
-
-	/* Copy hop size of new data from circular buffer */
-	idx += state->prev_data_size;
-	for (copied = 0; copied < fft->fft_hop_size; copied += n) {
-		nmax = fft->fft_hop_size - copied;
-		n = mfcc_buffer_samples_without_wrap(buf, r);
-		n = MIN(n, nmax);
-		for (j = 0; j < n; j++) {
-			fft->fft_buf[idx].real = *r;
-			r++;
-			idx++;
-		}
-		r = mfcc_buffer_wrap(buf, r);
-	}
-
-	buf->s_avail -= copied;
-	buf->s_free += copied;
-	buf->r_ptr = r;
-
-	/* Copy for next time data back to overlap buffer */
-	idx = fft->fft_fill_start_idx + fft->fft_hop_size;
-	for (j = 0; j < state->prev_data_size; j++)
-		state->prev_data[j] = fft->fft_buf[idx + j].real;
-}
-
 #ifdef MFCC_NORMALIZE_FFT
 int mfcc_normalize_fft_buffer(struct mfcc_state *state)
 {
@@ -189,53 +101,160 @@ void mfcc_apply_window(struct mfcc_state *state, int input_shift)
 }
 
 #if CONFIG_FORMAT_S16LE
-
-int16_t *mfcc_sink_copy_zero_s16(const struct audio_stream *sink,
-				 int16_t *w_ptr, int samples)
+void mfcc_source_copy_s16(struct input_stream_buffer *bsource, struct mfcc_buffer *buf,
+			  struct mfcc_pre_emph *emph, int frames, int source_channel)
 {
+	struct audio_stream *source = bsource->data;
+	int32_t s;
+	int16_t *x0;
+	int16_t *x = audio_stream_get_rptr(source);
+	int16_t *w = buf->w_ptr;
 	int copied;
 	int nmax;
-	int i;
+	int n1;
+	int n2;
 	int n;
+	int i;
+	int num_channels = audio_stream_get_channels(source);
 
-	for (copied = 0; copied < samples; copied += n) {
-		nmax = samples - copied;
-		n = audio_stream_samples_without_wrap_s16(sink, w_ptr);
+	/* Copy from source to pre-buffer for FFT.
+	 * The pre-emphasis filter is done in this step.
+	 */
+	for (copied = 0; copied < frames; copied += n) {
+		nmax = frames - copied;
+		n1 = audio_stream_frames_without_wrap(source, x);
+		n2 = mfcc_buffer_samples_without_wrap(buf, w);
+		n = MIN(n1, n2);
 		n = MIN(n, nmax);
+		x0 = x + source_channel;
 		for (i = 0; i < n; i++) {
-			*w_ptr = 0;
-			w_ptr++;
+			if (emph->enable) {
+				/* Q1.15 x Q1.15 -> Q2.30 */
+				s = (int32_t)emph->delay * emph->coef + Q_SHIFT_LEFT(*x0, 15, 30);
+				*w = sat_int16(Q_SHIFT_RND(s, 30, 15));
+				emph->delay = *x0;
+			} else {
+				*w = *x0;
+			}
+			x0 += num_channels;
+			w++;
 		}
 
-		w_ptr = audio_stream_wrap(sink, w_ptr);
+		x = audio_stream_wrap(source, x + n * audio_stream_get_channels(source));
+		w = mfcc_buffer_wrap(buf, w);
 	}
-
-	return w_ptr;
+	buf->s_avail += copied;
+	buf->s_free -= copied;
+	buf->w_ptr = w;
 }
+#endif /* CONFIG_FORMAT_S16LE */
+
+#if CONFIG_FORMAT_S24LE
 
-int16_t *mfcc_sink_copy_data_s16(const struct audio_stream *sink, int16_t *w_ptr,
-				 int samples, int16_t *r_ptr)
+void mfcc_source_copy_s24(struct input_stream_buffer *bsource, struct mfcc_buffer *buf,
+			  struct mfcc_pre_emph *emph, int frames, int source_channel)
 {
+	struct audio_stream *source = bsource->data;
+	int32_t tmp, s;
+	int32_t *x0;
+	int32_t *x = audio_stream_get_rptr(source);
+	int16_t *w = buf->w_ptr;
 	int copied;
 	int nmax;
-	int i;
+	int n1;
+	int n2;
 	int n;
+	int i;
+	int num_channels = audio_stream_get_channels(source);
 
-	for (copied = 0; copied < samples; copied += n) {
-		nmax = samples - copied;
-		n = audio_stream_samples_without_wrap_s16(sink, w_ptr);
+	/* Copy from source to pre-buffer for FFT.
+	 * The pre-emphasis filter is done in this step.
+	 * S24_4LE data is in 32-bit container, shift left by 8 to Q1.31,
+	 * then convert to Q1.15 with rounding.
+	 */
+	for (copied = 0; copied < frames; copied += n) {
+		nmax = frames - copied;
+		n1 = audio_stream_frames_without_wrap(source, x);
+		n2 = mfcc_buffer_samples_without_wrap(buf, w);
+		n = MIN(n1, n2);
 		n = MIN(n, nmax);
+		x0 = x + source_channel;
 		for (i = 0; i < n; i++) {
-			*w_ptr = *r_ptr;
-			r_ptr++;
-			w_ptr++;
+			if (emph->enable) {
+				/* Convert to Q1.31, ignore highest byte */
+				s = (int32_t)((uint32_t)*x0 << 8);
+				/* Q1.15 x Q1.15 -> Q2.30 */
+				tmp = (int32_t)emph->delay * emph->coef + Q_SHIFT(s, 31, 30);
+				*w = sat_int16(Q_SHIFT_RND(tmp, 30, 15));
+				emph->delay = sat_int16(Q_SHIFT_RND(s, 31, 15));
+			} else {
+				/* Convert to Q1.31, ignore highest byte */
+				s = (int32_t)((uint32_t)*x0 << 8);
+				*w = sat_int16(Q_SHIFT_RND(s, 31, 15));
+			}
+			x0 += num_channels;
+			w++;
 		}
 
-		w_ptr = audio_stream_wrap(sink, w_ptr);
+		x = audio_stream_wrap(source, x + n * audio_stream_get_channels(source));
+		w = mfcc_buffer_wrap(buf, w);
 	}
+	buf->s_avail += copied;
+	buf->s_free -= copied;
+	buf->w_ptr = w;
+}
+
+#endif /* CONFIG_FORMAT_S24LE */
 
-	return w_ptr;
+#if CONFIG_FORMAT_S32LE
+
+void mfcc_source_copy_s32(struct input_stream_buffer *bsource, struct mfcc_buffer *buf,
+			  struct mfcc_pre_emph *emph, int frames, int source_channel)
+{
+	struct audio_stream *source = bsource->data;
+	int32_t s;
+	int32_t *x0;
+	int32_t *x = audio_stream_get_rptr(source);
+	int16_t *w = buf->w_ptr;
+	int copied;
+	int nmax;
+	int n1;
+	int n2;
+	int n;
+	int i;
+	int num_channels = audio_stream_get_channels(source);
+
+	/* Copy from source to pre-buffer for FFT.
+	 * The pre-emphasis filter is done in this step.
+	 * S32 Q1.31 samples are converted to Q1.15 with rounding and saturation.
+	 */
+	for (copied = 0; copied < frames; copied += n) {
+		nmax = frames - copied;
+		n1 = audio_stream_frames_without_wrap(source, x);
+		n2 = mfcc_buffer_samples_without_wrap(buf, w);
+		n = MIN(n1, n2);
+		n = MIN(n, nmax);
+		x0 = x + source_channel; /* first sample of the selected channel */
+		for (i = 0; i < n; i++) {
+			if (emph->enable) {
+				/* Q1.15 x Q1.15 -> Q2.30 */
+				s = (int32_t)emph->delay * emph->coef + Q_SHIFT(*x0, 31, 30);
+				*w = sat_int16(Q_SHIFT_RND(s, 30, 15));
+				emph->delay = sat_int16(Q_SHIFT_RND(*x0, 31, 15));
+			} else {
+				*w = sat_int16(Q_SHIFT_RND(*x0, 31, 15));
+			}
+			x0 += num_channels; /* same channel in the next frame */
+			w++;
+		}
+
+		x = audio_stream_wrap(source, x + n * audio_stream_get_channels(source));
+		w = mfcc_buffer_wrap(buf, w);
+	}
+	buf->s_avail += copied; /* one sample was written per source frame */
+	buf->s_free -= copied;
+	buf->w_ptr = w;
 }
+#endif /* CONFIG_FORMAT_S32LE */
 
-#endif /* CONFIG_FORMAT_S16LE */
-#endif
+#endif /* MFCC_GENERIC */
diff --git a/src/audio/mfcc/mfcc_hifi3.c b/src/audio/mfcc/mfcc_hifi3.c
index b3b5d99967db..b9ed6c7f8380 100644
--- a/src/audio/mfcc/mfcc_hifi3.c
+++ b/src/audio/mfcc/mfcc_hifi3.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: BSD-3-Clause
 //
-// Copyright(c) 2023 Intel Corporation. All rights reserved.
+// Copyright(c) 2023-2026 Intel Corporation.
 //
 // Author: Andrula Song <andrula.song@intel.com>
 
@@ -35,6 +35,7 @@ static inline void set_circular_buf0(const void *start, const void *end)
  * MFCC algorithm code
  */
 
+#if CONFIG_FORMAT_S16LE
 void mfcc_source_copy_s16(struct input_stream_buffer *bsource, struct mfcc_buffer *buf,
 			  struct mfcc_pre_emph *emph, int frames, int source_channel)
 {
@@ -92,6 +93,7 @@ void mfcc_source_copy_s16(struct input_stream_buffer *bsource, struct mfcc_buffe
 	buf->s_free -= copied;
 	buf->w_ptr = (int16_t *)out;
 }
+#endif /* CONFIG_FORMAT_S16LE */
 
 void mfcc_fill_prev_samples(struct mfcc_buffer *buf, int16_t *prev_data,
 			    int prev_data_length)
@@ -126,50 +128,6 @@ void mfcc_fill_prev_samples(struct mfcc_buffer *buf, int16_t *prev_data,
 	buf->r_ptr = (void *)in; /* int16_t pointer but direct cast is not possible */
 }
 
-void mfcc_fill_fft_buffer(struct mfcc_state *state)
-{
-	struct mfcc_buffer *buf = &state->buf;
-	struct mfcc_fft *fft = &state->fft;
-	int idx = fft->fft_fill_start_idx;
-	ae_int16 *out = (ae_int16 *)&fft->fft_buf[idx].real;
-	ae_int16 *in = (ae_int16 *)state->prev_data;
-	ae_int16x4 sample;
-	const int buf_inc = sizeof(ae_int16);
-	const int fft_inc = sizeof(fft->fft_buf[0]);
-	int j;
-
-	/* Copy overlapped samples from state buffer. Imaginary part of input
-	 * remains zero.
-	 */
-	for (j = 0; j < state->prev_data_size; j++) {
-		AE_L16_XP(sample, in, buf_inc);
-		AE_S16_0_XP(sample, out, fft_inc);
-	}
-
-	/* Copy hop size of new data from circular buffer */
-	idx += state->prev_data_size;
-	in = (ae_int16 *)buf->r_ptr;
-	out = (ae_int16 *)&fft->fft_buf[idx].real;
-	set_circular_buf0(buf->addr, buf->end_addr);
-	for (j = 0; j < fft->fft_hop_size; j++) {
-		AE_L16_XC(sample, in, buf_inc);
-		AE_S16_0_XP(sample, out, fft_inc);
-	}
-
-	buf->s_avail -= fft->fft_hop_size;
-	buf->s_free += fft->fft_hop_size;
-	buf->r_ptr = (int16_t *)in;
-
-	/* Copy for next time data back to overlap buffer */
-	idx = fft->fft_fill_start_idx + fft->fft_hop_size;
-	in = (ae_int16 *)&fft->fft_buf[idx].real;
-	out = (ae_int16 *)state->prev_data;
-	for (j = 0; j < state->prev_data_size; j++) {
-		AE_L16_XP(sample, in, fft_inc);
-		AE_S16_0_XP(sample, out, buf_inc);
-	}
-}
-
 #ifdef MFCC_NORMALIZE_FFT
 int mfcc_normalize_fft_buffer(struct mfcc_state *state)
 {
@@ -192,6 +150,7 @@ int mfcc_normalize_fft_buffer(struct mfcc_state *state)
 	return shift;
 }
 #endif
+
 void mfcc_apply_window(struct mfcc_state *state, int input_shift)
 {
 	struct mfcc_fft *fft = &state->fft;
@@ -221,7 +180,8 @@ void mfcc_apply_window(struct mfcc_state *state, int input_shift)
 	for (j = 0; j < fft->fft_size; j++) {
 		AE_L32_IP(sample, fft_in, 0);
 		AE_L16_XP(win, win_in, win_inc);
-		temp = AE_MULFP32X16X2RS_H(sample, win);
+		/* Data is 16-bit in 32-bit container, shift to Q1.31 for fractional multiply */
+		sample = AE_SLAI32S(sample, 16);
 		temp = AE_MULFP32X16X2RS_L(sample, win);
 		temp = AE_SLAA32S(temp, input_shift);
 		AE_S32_L_XP(temp, fft_in, fft_inc);
@@ -229,65 +189,129 @@ void mfcc_apply_window(struct mfcc_state *state, int input_shift)
 #endif
 }
 
-#if CONFIG_FORMAT_S16LE
-
-int16_t *mfcc_sink_copy_zero_s16(const struct audio_stream *sink,
-				 int16_t *w_ptr, int samples)
+#if CONFIG_FORMAT_S24LE
+void mfcc_source_copy_s24(struct input_stream_buffer *bsource, struct mfcc_buffer *buf,
+			  struct mfcc_pre_emph *emph, int frames, int source_channel)
 {
+	struct audio_stream *source = bsource->data;
+	int copied;
+	int nmax;
+	int n;
 	int i;
-	int n = samples >> 2;
-	int m = samples & 0x03;
-	ae_int16x4 *out = (ae_int16x4 *)w_ptr;
-	const int inc = sizeof(ae_int16);
-	ae_valign outu = AE_ZALIGN64();
-	ae_int16x4 zero = AE_ZERO16();
-
-	set_circular_buf0(sink->addr, sink->end_addr);
-
-	for (i = 0; i < n; i++)
-		AE_SA16X4_IC(zero, outu, out);
+	int num_channels = audio_stream_get_channels(source);
+	ae_int32 *in;
+	ae_int32 *x = (ae_int32 *)audio_stream_get_rptr(source);
+	ae_int16 *out = (ae_int16 *)buf->w_ptr;
+	ae_int32x2 sample32;
+	ae_int16x4 sample;
+	ae_int32x2 temp;
+	ae_int16x4 coef = emph->coef;
+	ae_int16x4 delay;
+	const int in_inc = sizeof(ae_int32) * num_channels;
 
-	AE_SA64POS_FP(outu, out);
-	/* process the left samples that less than 4
-	 * one by one to avoid memory access overrun
-	 */
-	for (i = 0; i < m ; i++)
-		AE_S16_0_XC(zero, (ae_int16 *)out, inc);
+	for (copied = 0; copied < frames; copied += n) {
+		nmax = frames - copied;
+		n = audio_stream_frames_without_wrap(source, x);
+		n = MIN(n, nmax);
+		nmax = mfcc_buffer_samples_without_wrap(buf, (int16_t *)out);
+		n = MIN(n, nmax);
+		in = x + source_channel;
+		if (emph->enable) {
+			delay = emph->delay;
+			for (i = 0; i < n; i++) {
+				AE_L32_XP(sample32, in, in_inc);
+				/* Shift left by 8 to sign-extend to Q1.31 */
+				sample32 = AE_SLAI32(sample32, 8);
+				/* Then shift right by 16 to get 16-bit */
+				sample32 = AE_SRAI32(sample32, 16);
+				sample = AE_SAT16X4(sample32, sample32);
+				/* Q1.15 -> Q1.31 */
+				temp = AE_CVT32X2F16_10(sample);
+				AE_MULAF16SS_00(temp, delay, coef);
+				delay = sample;
+				sample = AE_ROUND16X4F32SSYM(temp, temp);
+				AE_S16_0_IP(sample, out, 2);
+			}
+			emph->delay = delay;
+		} else {
+			for (i = 0; i < n; i++) {
+				AE_L32_XP(sample32, in, in_inc);
+				/* Shift left by 8 to sign-extend to Q1.31 */
+				sample32 = AE_SLAI32(sample32, 8);
+				/* Then shift right by 16 to get 16-bit */
+				sample32 = AE_SRAI32(sample32, 16);
+				sample = AE_SAT16X4(sample32, sample32);
+				AE_S16_0_IP(sample, out, 2);
+			}
+		}
 
-	return (int16_t *)out;
+		x = audio_stream_wrap(source, x + n * num_channels);
+		out = (ae_int16 *)mfcc_buffer_wrap(buf, (int16_t *)out);
+	}
+	buf->s_avail += copied;
+	buf->s_free -= copied;
+	buf->w_ptr = (int16_t *)out;
 }
+#endif /* CONFIG_FORMAT_S24LE */
 
-int16_t *mfcc_sink_copy_data_s16(const struct audio_stream *sink, int16_t *w_ptr,
-				 int samples, int16_t *r_ptr)
+#if CONFIG_FORMAT_S32LE
+void mfcc_source_copy_s32(struct input_stream_buffer *bsource, struct mfcc_buffer *buf,
+			  struct mfcc_pre_emph *emph, int frames, int source_channel)
 {
+	struct audio_stream *source = bsource->data;
+	int copied;
+	int nmax;
+	int n;
 	int i;
-	int n = samples >> 2;
-	int m = samples & 0x03;
-	ae_int16x4 *out = (ae_int16x4 *)w_ptr;
-	ae_int16x4 *in = (ae_int16x4 *)r_ptr;
-	ae_valign outu = AE_ZALIGN64();
-	ae_valign inu = AE_ZALIGN64();
-	const int inc = sizeof(ae_int16);
-	ae_int16x4 in_sample;
+	int num_channels = audio_stream_get_channels(source);
+	ae_int32 *in;
+	ae_int32 *x = (ae_int32 *)audio_stream_get_rptr(source);
+	ae_int16 *out = (ae_int16 *)buf->w_ptr;
+	ae_int32x2 sample32;
+	ae_int16x4 sample;
+	ae_int32x2 temp;
+	ae_int16x4 coef = emph->coef;
+	ae_int16x4 delay;
+	const int in_inc = sizeof(ae_int32) * num_channels;
 
-	set_circular_buf0(sink->addr, sink->end_addr);
+	for (copied = 0; copied < frames; copied += n) {
+		nmax = frames - copied;
+		n = audio_stream_frames_without_wrap(source, x);
+		n = MIN(n, nmax);
+		nmax = mfcc_buffer_samples_without_wrap(buf, (int16_t *)out);
+		n = MIN(n, nmax);
+		in = x + source_channel;
+		if (emph->enable) {
+			delay = emph->delay;
+			for (i = 0; i < n; i++) {
+				AE_L32_XP(sample32, in, in_inc);
+				/* S32: shift right by 16 to get 16-bit */
+				sample32 = AE_SRAI32(sample32, 16);
+				sample = AE_SAT16X4(sample32, sample32);
+				/* Q1.15 -> Q1.31 */
+				temp = AE_CVT32X2F16_10(sample);
+				AE_MULAF16SS_00(temp, delay, coef);
+				delay = sample;
+				sample = AE_ROUND16X4F32SSYM(temp, temp);
+				AE_S16_0_IP(sample, out, 2);
+			}
+			emph->delay = delay;
+		} else {
+			for (i = 0; i < n; i++) {
+				AE_L32_XP(sample32, in, in_inc);
+				sample32 = AE_SRAI32(sample32, 16);
+				sample = AE_SAT16X4(sample32, sample32);
+				AE_S16_0_IP(sample, out, 2);
+			}
+		}
 
-	inu = AE_LA64_PP(in);
-	for (i = 0; i < n; i++) {
-		AE_LA16X4_IP(in_sample, inu, in);
-		AE_SA16X4_IC(in_sample, outu, out);
-	}
-	AE_SA64POS_FP(outu, out);
-	/* process the left samples that less than 4
-	 * one by one to avoid memory access overrun
-	 */
-	for (i = 0; i < m ; i++) {
-		AE_L16_XP(in_sample, (ae_int16 *)in, inc);
-		AE_S16_0_XC(in_sample, (ae_int16 *)out, inc);
+		x = audio_stream_wrap(source, x + n * num_channels);
+		out = (ae_int16 *)mfcc_buffer_wrap(buf, (int16_t *)out);
 	}
-
-	return (int16_t *)out;
+	buf->s_avail += copied;
+	buf->s_free -= copied;
+	buf->w_ptr = (int16_t *)out;
 }
+#endif /* CONFIG_FORMAT_S32LE */
 
-#endif /* CONFIG_FORMAT_S16LE */
-#endif
+#endif /* MFCC_HIFI3 */
diff --git a/src/audio/mfcc/mfcc_hifi4.c b/src/audio/mfcc/mfcc_hifi4.c
index 60a4de62ec23..511a9bbf1dca 100644
--- a/src/audio/mfcc/mfcc_hifi4.c
+++ b/src/audio/mfcc/mfcc_hifi4.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: BSD-3-Clause
 //
-// Copyright(c) 2023 Intel Corporation. All rights reserved.
+// Copyright(c) 2023-2026 Intel Corporation.
 //
 // Author: Andrula Song <andrula.song@intel.com>
 
@@ -41,6 +41,8 @@ static inline void set_circular_buf1(const void *start, const void *end)
 /*
  * MFCC algorithm code
  */
+
+#if CONFIG_FORMAT_S16LE
 void mfcc_source_copy_s16(struct input_stream_buffer *bsource, struct mfcc_buffer *buf,
 			  struct mfcc_pre_emph *emph, int frames, int source_channel)
 {
@@ -87,6 +89,7 @@ void mfcc_source_copy_s16(struct input_stream_buffer *bsource, struct mfcc_buffe
 	buf->s_free -= frames;
 	buf->w_ptr = (int16_t *)out;
 }
+#endif /* CONFIG_FORMAT_S16LE */
 
 void mfcc_fill_prev_samples(struct mfcc_buffer *buf, int16_t *prev_data,
 			    int prev_data_length)
@@ -121,50 +124,6 @@ void mfcc_fill_prev_samples(struct mfcc_buffer *buf, int16_t *prev_data,
 	buf->r_ptr = (int16_t *)in;
 }
 
-void mfcc_fill_fft_buffer(struct mfcc_state *state)
-{
-	struct mfcc_buffer *buf = &state->buf;
-	struct mfcc_fft *fft = &state->fft;
-	int idx = fft->fft_fill_start_idx;
-	ae_int16 *out = (ae_int16 *)&fft->fft_buf[idx].real;
-	ae_int16 *in = (ae_int16 *)state->prev_data;
-	ae_int16x4 sample;
-	const int buf_inc = sizeof(ae_int16);
-	const int fft_inc = sizeof(fft->fft_buf[0]);
-	int j;
-
-	/* Copy overlapped samples from state buffer. Imaginary part of input
-	 * remains zero.
-	 */
-	for (j = 0; j < state->prev_data_size; j++) {
-		AE_L16_XP(sample, in, buf_inc);
-		AE_S16_0_XP(sample, out, fft_inc);
-	}
-
-	/* Copy hop size of new data from circular buffer */
-	idx += state->prev_data_size;
-	in = (ae_int16 *)buf->r_ptr;
-	out = (ae_int16 *)&fft->fft_buf[idx].real;
-	set_circular_buf0(buf->addr, buf->end_addr);
-	for (j = 0; j < fft->fft_hop_size; j++) {
-		AE_L16_XC(sample, in, buf_inc);
-		AE_S16_0_XP(sample, out, fft_inc);
-	}
-
-	buf->s_avail -= fft->fft_hop_size;
-	buf->s_free += fft->fft_hop_size;
-	buf->r_ptr = (int16_t *)in;
-
-	/* Copy for next time data back to overlap buffer */
-	idx = fft->fft_fill_start_idx + fft->fft_hop_size;
-	in = (ae_int16 *)&fft->fft_buf[idx].real;
-	out = (ae_int16 *)state->prev_data;
-	for (j = 0; j < state->prev_data_size; j++) {
-		AE_L16_XP(sample, in, fft_inc);
-		AE_S16_0_XP(sample, out, buf_inc);
-	}
-}
-
 #ifdef MFCC_NORMALIZE_FFT
 int mfcc_normalize_fft_buffer(struct mfcc_state *state)
 {
@@ -217,7 +176,8 @@ void mfcc_apply_window(struct mfcc_state *state, int input_shift)
 	for (j = 0; j < fft->fft_size; j++) {
 		AE_L32_IP(sample, fft_in, 0);
 		AE_L16_XP(win, win_in, win_inc);
-		temp = AE_MULFP32X16X2RS_H(sample, win);
+		/* Data is 16-bit in 32-bit container, shift to Q1.31 for fractional multiply */
+		sample = AE_SLAI32S(sample, 16);
 		temp = AE_MULFP32X16X2RS_L(sample, win);
 		temp = AE_SLAA32S(temp, input_shift);
 		AE_S32_L_XP(temp, fft_in, fft_inc);
@@ -225,65 +185,111 @@ void mfcc_apply_window(struct mfcc_state *state, int input_shift)
 #endif
 }
 
-#if CONFIG_FORMAT_S16LE
-
-int16_t *mfcc_sink_copy_zero_s16(const struct audio_stream *sink,
-				 int16_t *w_ptr, int samples)
+#if CONFIG_FORMAT_S24LE
+void mfcc_source_copy_s24(struct input_stream_buffer *bsource, struct mfcc_buffer *buf,
+			  struct mfcc_pre_emph *emph, int frames, int source_channel)
 {
+	struct audio_stream *source = bsource->data;
+	int num_channels = audio_stream_get_channels(source);
+	ae_int32 *in = (ae_int32 *)source->r_ptr + source_channel;
+	ae_int16 *out = (ae_int16 *)buf->w_ptr;
+	ae_int32x2 sample32;
+	ae_int16x4 sample;
+	ae_int32x2 temp;
+	ae_int16x4 coef;
+	ae_int16x4 delay;
+	const int in_inc = sizeof(ae_int32) * num_channels;
+	const int out_inc = sizeof(ae_int16);
 	int i;
-	int n = samples >> 2;
-	int m = samples & 0x03;
-	ae_int16x4 *out = (ae_int16x4 *)w_ptr;
-	const int inc = sizeof(ae_int16);
-	ae_valign outu = AE_ZALIGN64();
-	ae_int16x4 zero = AE_ZERO16();
-
-	set_circular_buf0(sink->addr, sink->end_addr);
 
-	for (i = 0; i < n; i++)
-		AE_SA16X4_IC(zero, outu, out);
+	set_circular_buf1(buf->addr, buf->end_addr);
+	set_circular_buf0(source->addr, source->end_addr);
 
-	AE_SA64POS_FP(outu, out);
-	/* process the left samples that less than 4
-	 * one by one to avoid memory access overrun
-	 */
-	for (i = 0; i < m ; i++)
-		AE_S16_0_XC(zero, (ae_int16 *)out, inc);
+	if (emph->enable) {
+		delay = emph->delay;
+		coef = emph->coef;
+		for (i = 0; i < frames; i++) {
+			AE_L32_XC(sample32, in, in_inc);
+			/* Shift left by 8 to sign-extend to Q1.31 */
+			sample32 = AE_SLAI32(sample32, 8);
+			/* Then shift right by 16 to get 16-bit */
+			sample32 = AE_SRAI32(sample32, 16);
+			sample = AE_SAT16X4(sample32, sample32);
+			/* Q1.15 -> Q1.31 */
+			temp = AE_CVT32X2F16_10(sample);
+			AE_MULAF16SS_00(temp, delay, coef);
+			delay = sample;
+			sample = AE_ROUND16X4F32SSYM(temp, temp);
+			AE_S16_0_XC1(sample, out, out_inc);
+		}
+		emph->delay = delay;
+	} else {
+		for (i = 0; i < frames; i++) {
+			AE_L32_XC(sample32, in, in_inc);
+			/* Shift left by 8 to sign-extend to Q1.31 */
+			sample32 = AE_SLAI32(sample32, 8);
+			/* Then shift right by 16 to get 16-bit */
+			sample32 = AE_SRAI32(sample32, 16);
+			sample = AE_SAT16X4(sample32, sample32);
+			AE_S16_0_XC1(sample, out, out_inc);
+		}
+	}
 
-	return (int16_t *)out;
+	buf->s_avail += frames;
+	buf->s_free -= frames;
+	buf->w_ptr = (int16_t *)out;
 }
+#endif /* CONFIG_FORMAT_S24LE */
 
-int16_t *mfcc_sink_copy_data_s16(const struct audio_stream *sink, int16_t *w_ptr,
-				 int samples, int16_t *r_ptr)
+#if CONFIG_FORMAT_S32LE
+void mfcc_source_copy_s32(struct input_stream_buffer *bsource, struct mfcc_buffer *buf,
+			  struct mfcc_pre_emph *emph, int frames, int source_channel)
 {
+	struct audio_stream *source = bsource->data;
+	int num_channels = audio_stream_get_channels(source);
+	ae_int32 *in = (ae_int32 *)source->r_ptr + source_channel;
+	ae_int16 *out = (ae_int16 *)buf->w_ptr;
+	ae_int32x2 sample32;
+	ae_int16x4 sample;
+	ae_int32x2 temp;
+	ae_int16x4 coef;
+	ae_int16x4 delay;
+	const int in_inc = sizeof(ae_int32) * num_channels;
+	const int out_inc = sizeof(ae_int16);
 	int i;
-	int n = samples >> 2;
-	int m = samples & 0x03;
-	ae_int16x4 *out = (ae_int16x4 *)w_ptr;
-	ae_int16x4 *in = (ae_int16x4 *)r_ptr;
-	ae_valign outu = AE_ZALIGN64();
-	ae_valign inu = AE_ZALIGN64();
-	const int inc = sizeof(ae_int16);
-	ae_int16x4 in_sample;
 
-	set_circular_buf0(sink->addr, sink->end_addr);
+	set_circular_buf1(buf->addr, buf->end_addr);
+	set_circular_buf0(source->addr, source->end_addr);
 
-	inu = AE_LA64_PP(in);
-	for (i = 0; i < n; i++) {
-		AE_LA16X4_IP(in_sample, inu, in);
-		AE_SA16X4_IC(in_sample, outu, out);
-	}
-	AE_SA64POS_FP(outu, out);
-	/* process the left samples that less than 4
-	 * one by one to avoid memory access overrun
-	 */
-	for (i = 0; i < m ; i++) {
-		AE_L16_XP(in_sample, (ae_int16 *)in, inc);
-		AE_S16_0_XC(in_sample, (ae_int16 *)out, inc);
+	if (emph->enable) {
+		delay = emph->delay;
+		coef = emph->coef;
+		for (i = 0; i < frames; i++) {
+			AE_L32_XC(sample32, in, in_inc);
+			/* S32: shift right by 16 to get 16-bit */
+			sample32 = AE_SRAI32(sample32, 16);
+			sample = AE_SAT16X4(sample32, sample32);
+			/* Q1.15 -> Q1.31 */
+			temp = AE_CVT32X2F16_10(sample);
+			AE_MULAF16SS_00(temp, delay, coef);
+			delay = sample;
+			sample = AE_ROUND16X4F32SSYM(temp, temp);
+			AE_S16_0_XC1(sample, out, out_inc);
+		}
+		emph->delay = delay;
+	} else {
+		for (i = 0; i < frames; i++) {
+			AE_L32_XC(sample32, in, in_inc);
+			sample32 = AE_SRAI32(sample32, 16);
+			sample = AE_SAT16X4(sample32, sample32);
+			AE_S16_0_XC1(sample, out, out_inc);
+		}
 	}
 
-	return (int16_t *)out;
+	buf->s_avail += frames;
+	buf->s_free -= frames;
+	buf->w_ptr = (int16_t *)out;
 }
+#endif /* CONFIG_FORMAT_S32LE */
 
-#endif /* CONFIG_FORMAT_S16LE */
-#endif
+#endif /* MFCC_HIFI4 */
diff --git a/src/audio/mfcc/mfcc_setup.c b/src/audio/mfcc/mfcc_setup.c
index dded450673ad..a936371ccd1c 100644
--- a/src/audio/mfcc/mfcc_setup.c
+++ b/src/audio/mfcc/mfcc_setup.c
@@ -50,10 +50,12 @@ static int mfcc_get_window(struct mfcc_state *state, enum sof_mfcc_fft_window_ty
 	case MFCC_HAMMING_WINDOW:
 		win_hamming_16b(state->window, fft->fft_size);
 		return 0;
+	case MFCC_HANN_WINDOW:
+		win_hann_16b(state->window, fft->fft_size);
+		return 0;
 	case MFCC_POVEY_WINDOW:
 		win_povey_16b(state->window, fft->fft_size);
 		return 0;
-
 	default:
 		return -EINVAL;
 	}
@@ -139,10 +141,9 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i
 		return -EINVAL;
 	}
 
-	comp_info(dev, "source_channel = %d, stream_channels = %d",
-		  config->channel, channels);
-	if (config->channel >= channels) {
-		comp_err(dev, "Illegal channel");
+	if (config->channel >= channels || (config->channel < 0 && channels != 1)) {
+		comp_err(dev, "Illegal source_channel %d for stream channels %d", config->channel,
+			 channels);
 		return -EINVAL;
 	}
 
@@ -151,6 +152,7 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i
 	else
 		state->source_channel = config->channel;
 
+	state->mmax = (int32_t)config->mmax_init << 16; /* Q9.7 -> Q9.23 */
 	state->emph.enable = config->preemphasis_coefficient > 0;
 	state->emph.coef = -config->preemphasis_coefficient; /* Negate config parameter */
 	fft->fft_size = config->frame_length;
@@ -224,7 +226,7 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i
 	ret = mfcc_get_window(state, config->window);
 	if (ret < 0) {
 		comp_err(dev, "Failed Window function");
-		goto free_fft_out;
+		goto free_fft_plan;
 	}
 
 	/* Setup Mel auditory filterbank. FFT input and output buffers are used
@@ -246,39 +248,53 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i
 	ret = mod_psy_get_mel_filterbank(mod, fb);
 	if (ret < 0) {
 		comp_err(dev, "Failed Mel filterbank");
-		goto free_fft_out;
-	}
-
-	/* Setup DCT */
-	dct->num_in = config->num_mel_bins;
-	dct->num_out = config->num_ceps;
-	dct->type = (enum dct_type)config->dct;
-	dct->ortho = true;
-	ret = mod_dct_initialize_16(mod, dct);
-	if (ret < 0) {
-		comp_err(dev, "Failed DCT init");
-		goto free_melfb_data;
+		goto free_fft_plan;
 	}
 
-	state->lifter.num_ceps = config->num_ceps;
-	state->lifter.cepstral_lifter = config->cepstral_lifter; /* Q7.9 max 64.0*/
-	ret = mfcc_get_cepstral_lifter(mod, &state->lifter);
-	if (ret < 0) {
-		comp_err(dev, "Failed cepstral lifter");
-		goto free_dct_matrix;
+	/* Setup DCT and cepstral lifter only when num_ceps > 0.
+	 * When num_ceps is zero, skip DCT/lifter and output Mel
+	 * log spectra directly.
+	 */
+	if (config->num_ceps > 0) {
+		dct->num_in = config->num_mel_bins;
+		dct->num_out = config->num_ceps;
+		dct->type = (enum dct_type)config->dct;
+		dct->ortho = true;
+		ret = mod_dct_initialize_16(mod, dct);
+		if (ret < 0) {
+			comp_err(dev, "Failed DCT init");
+			goto free_melfb_data;
+		}
+
+		state->lifter.num_ceps = config->num_ceps;
+		state->lifter.cepstral_lifter = config->cepstral_lifter; /* Q7.9 max 64.0*/
+		ret = mfcc_get_cepstral_lifter(mod, &state->lifter);
+		if (ret < 0) {
+			comp_err(dev, "Failed cepstral lifter");
+			goto free_dct_matrix;
+		}
+
+		state->mel_only = false;
+	} else {
+		comp_info(dev, "num_ceps is 0, Mel log spectra output mode");
+		dct->num_in = config->num_mel_bins;
+		dct->num_out = 0;
+		dct->matrix = NULL;
+		state->lifter.matrix = NULL;
+		state->mel_only = true;
 	}
 
 	/* Scratch overlay during runtime
 	 *
 	 *  +--------------------------------------------------------+
-	 *  | 1. fft_buf[], 16 bits,size x 4, e.g. 512 -> 2048 bytes |
+	 *  | 1. fft_buf[], 32 bits, size x 8, e.g. 512 -> 4096 bytes|
+	 *  +-------------------------------------+------------------+
+	 *  | 3. power_spectra[],                 | 6. mel_log_32[], |
+	 *  |    32 bits, e.g. x257 -> 1028 bytes |    32b, 92 bytes |
 	 *  +-------------------------------------+------------------+
-	 *  | 3. power_spectra[],                 |
-	 *  |    32 bits, e.g. x257 -> 1028 bytes |
-	 *  +-------------------------------------+
 	 *
 	 *  +---------------------------------------------------------------------------------+
-	 *  | 2. fft_out[], 16 bits,size x 4, e.g. 512 -> 2048 bytes                          |
+	 *  | 2. fft_out[], 32 bits, size x 8, e.g. 512 -> 4096 bytes                         |
 	 *  +----------------------------------+----------------------------------+-----------+
 	 *  | 4. mel_spectra[],                | 5. cepstral_coef[],              |
 	 *  |    16 bits, e.g. x23 -> 46 bytes |    16 bits, e.g. 13x -> 26 bytes |
@@ -288,31 +304,78 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i
 
 	/* Use FFT buffer as scratch for later computed data */
 	state->power_spectra = (int32_t *)&fft->fft_buf[0];
+	state->mel_log_32 = &state->power_spectra[fft->half_fft_size];
+
+	/* Check that mel_log_32 fits in the remaining fft_buf scratch space */
+	int mel_log_32_space = (int)(fft->fft_buffer_size / sizeof(int32_t)) - fft->half_fft_size;
+
+	if (config->num_mel_bins > mel_log_32_space) {
+		comp_err(dev, "num_mel_bins %d exceeds mel_log_32 scratch space %d",
+			 config->num_mel_bins, mel_log_32_space);
+		ret = -EINVAL;
+		goto free_lifter;
+	}
+
 	state->mel_spectra = (struct mat_matrix_16b *)&fft->fft_out[0];
-	state->cepstral_coef = (struct mat_matrix_16b *)
-		&state->mel_spectra->data[state->dct.num_in];
+	if (!state->mel_only) {
+		state->cepstral_coef =
+			(struct mat_matrix_16b *)&state->mel_spectra->data[state->dct.num_in];
+	} else {
+		state->cepstral_coef = NULL;
+	}
+
+	/* Number of values output per FFT hop: the Mel band count in Mel-only
+	 * mode, otherwise the cepstral coefficient count.
+	 */
+	int max_out_per_hop = state->mel_only ? dct->num_in : dct->num_out;
+
+	/* Check that output data can be drained within the periods spanned by one
+	 * FFT hop. Each hop consumes fft_hop_size input samples and produces
+	 * max_out_per_hop + 2 (magic) int16_t output values. The sink provides at
+	 * least fft_hop_size * channels int16_t samples per hop (worst case s16).
+	 * If output exceeds this, data accumulates and will eventually overflow.
+	 */
+	int out_per_hop = max_out_per_hop + 2;
+	int sink_per_hop = fft->fft_hop_size * channels;
+
+	if (out_per_hop > sink_per_hop) {
+		comp_err(dev, "Output %d int16 per hop exceeds sink capacity %d (hop %d x ch %d)",
+			 out_per_hop, sink_per_hop, fft->fft_hop_size, channels);
+		ret = -EINVAL;
+		goto free_lifter;
+	}
 
 	/* Set initial state for STFT */
 	state->waiting_fill = true;
 	state->prev_samples_valid = false;
+	state->magic_pending = false;
+	state->out_data_ptr = NULL;
+	state->out_data_ptr_32 = NULL;
+	state->out_remain = 0;
 
 	comp_dbg(dev, "done");
 	return 0;
 
+free_lifter:
+	mod_free(mod, state->lifter.matrix);
+
 free_dct_matrix:
-	rfree(state->dct.matrix);
+	mod_free(mod, state->dct.matrix);
 
 free_melfb_data:
-	rfree(fb->data);
+	mod_free(mod, fb->data);
+
+free_fft_plan:
+	mod_fft_plan_free(mod, fft->fft_plan);
 
 free_fft_out:
-	rfree(fft->fft_out);
+	mod_free(mod, fft->fft_out);
 
 free_fft_buf:
-	rfree(fft->fft_buf);
+	mod_free(mod, fft->fft_buf);
 
 free_buffers:
-	rfree(state->buffers);
+	mod_free(mod, state->buffers);
 
 exit:
 	return ret;
diff --git a/src/audio/mfcc/tune/README.txt b/src/audio/mfcc/tune/README.txt
index fb8208992ed4..a0c3189e81a3 100644
--- a/src/audio/mfcc/tune/README.txt
+++ b/src/audio/mfcc/tune/README.txt
@@ -8,20 +8,43 @@ need to be created with "scripts/build-tools.sh -t". Next the testbench
 is build with "scripts/rebuild-testbench.sh".
 
 Once the previous steps are done, a sample wav file can be processed
-into stream of cepstral coefficients with script run_mfcc.sh. E.g.
-next command processes an ALSA test file with speech clip "front center".
-The output file is hard-coded to mfcc.raw.
+with script run_mfcc.sh. The script converts the input to raw 16 kHz
+stereo format and runs the testbench for S16, S24, and S32 bit depths,
+producing both cepstral coefficient (MFCC) and Mel spectrogram outputs.
 
 ./run_mfcc.sh /usr/share/sounds/alsa/Front_Center.wav
 
-The output can be plotted and retrieved with Matlab or Octave command:
+Output files from host testbench:
+  mfcc_s16.raw, mfcc_s24.raw, mfcc_s32.raw   - cepstral coefficients
+  mel_s16.raw, mel_s24.raw, mel_s32.raw       - Mel spectrogram
 
-[ceps, t, n] = decode_ceps('mfcc.raw', 13);
+If the XTENSA_PATH environment variable is set, the script also runs
+the Xtensa build of the testbench (via xt-run) and produces additional
+output files prefixed with "xt_":
+  xt_mfcc_s16.raw, xt_mfcc_s24.raw, xt_mfcc_s32.raw
+  xt_mel_s16.raw, xt_mel_s24.raw, xt_mel_s32.raw
+
+All output files can be decoded and plotted at once in Matlab or Octave
+with the decode_all.m script:
+
+decode_all
+
+This calls decode_ceps for each MFCC file (13 cepstral coefficients) and
+decode_mel for each Mel file (80 Mel bins), plotting spectrograms for all
+files that exist including the Xtensa variants.
+
+Individual files can also be decoded manually:
+
+[ceps, t, n] = decode_ceps('mfcc_s16.raw', 13);
 
 In the above it's known from configuration script that MFCC was set up to
 output 13 cepstral coefficients from each FFT -> Mel -> DCT -> Cepstral
 coefficients computation run.
 
+The 80-band Mel output can be visualized with the command:
+
+[mel, t, n] = decode_mel('mel_s16.raw', 80);
+
 Other kind of signals have quite big visual difference in audio features. Try
 e.g. other sound files found in computer.
 
diff --git a/src/audio/mfcc/tune/decode_all.m b/src/audio/mfcc/tune/decode_all.m
new file mode 100644
index 000000000000..d5b60289b4cf
--- /dev/null
+++ b/src/audio/mfcc/tune/decode_all.m
@@ -0,0 +1,39 @@
+% decode_all.m - Decode all MFCC and Mel raw output files from run_mfcc.sh
+%
+% SPDX-License-Identifier: BSD-3-Clause
+% Copyright(c) 2026 Intel Corporation.
+
+num_ceps = 13; % cepstral coefficients per frame passed to decode_ceps
+num_mel = 80; % Mel bands per frame passed to decode_mel
+
+% MFCC cepstral output files from the host testbench run
+ceps_files = {'mfcc_s16.raw', 'mfcc_s24.raw', 'mfcc_s32.raw'};
+
+% Mel output files with the sample format of each file, in the same order
+mel_files = {'mel_s16.raw', 'mel_s24.raw', 'mel_s32.raw'};
+mel_fmts  = {'s16',         's24',          's32'};
+
+% Xtensa prefixed variants, same order as the host files above
+xt_ceps_files = {'xt_mfcc_s16.raw', 'xt_mfcc_s24.raw', 'xt_mfcc_s32.raw'};
+xt_mel_files  = {'xt_mel_s16.raw',  'xt_mel_s24.raw',  'xt_mel_s32.raw'};
+
+all_ceps_files = [ceps_files, xt_ceps_files];
+all_mel_files  = [mel_files, xt_mel_files];
+all_mel_fmts   = [mel_fmts, mel_fmts]; % formats repeat for the xt_ files
+
+for i = 1:length(all_ceps_files)
+	fn = all_ceps_files{i};
+	if exist(fn, 'file') % files that are not present are skipped silently
+		fprintf('Decoding MFCC ceps: %s\n', fn);
+		[ceps, t, n] = decode_ceps(fn, num_ceps); % NOTE(review): no format argument is passed here - confirm for s24/s32
+	end
+end
+
+for i = 1:length(all_mel_files)
+	fn = all_mel_files{i};
+	fmt = all_mel_fmts{i};
+	if exist(fn, 'file') % files that are not present are skipped silently
+		fprintf('Decoding Mel: %s\n', fn);
+		[mel, t, n] = decode_mel(fn, num_mel, fmt);
+	end
+end
diff --git a/src/audio/mfcc/tune/decode_mel.m b/src/audio/mfcc/tune/decode_mel.m
new file mode 100644
index 000000000000..899d17ac72bd
--- /dev/null
+++ b/src/audio/mfcc/tune/decode_mel.m
@@ -0,0 +1,138 @@
+% [mel, t, n] = decode_mel(fn, num_mel, fmt, num_channels)
+%
+% Input
+%   fn - File with MFCC data in .raw or .wav format
+%   num_mel - number of Mel coefficients per frame
+%   fmt - format of the MFCC data ('s16', 's24', 's32'), default 's16'
+%   num_channels - channels in .raw data, default 1; read from the file for .wav
+%
+% Outputs
+%   mel - Mel coefficients
+%   t - time vector for plotting
+%   n - mel 1..num_mel vector for plotting
+
+% SPDX-License-Identifier: BSD-3-Clause
+% Copyright(c) 2026 Intel Corporation.
+
+function [mel, t, n] = decode_mel(fn, num_mel, fmt, num_channels)
+
+if nargin < 3
+	fmt = 's16';
+end
+if nargin < 4
+	num_channels = 1;
+end
+
+% Sample rate assumed for the time axis (Hz)
+fs = 16e3;
+
+switch fmt
+  case 's16'
+    qformat = 7; % decoded values are divided by 2^qformat
+    magic = [25443 28006]; % ASCII 'mfcc' as two int16
+    num_magic = 2; % the marker occupies two int16 words
+  case 's24'
+    qformat = 15;
+    magic = int32(1835426659); % 0x6D666363 as int32
+    num_magic = 1;
+  case 's32'
+    qformat = 23;
+    magic = int32(1835426659); % 0x6D666363 as int32
+    num_magic = 1;
+end
+
+% Load output data
+[data, num_channels] = get_file(fn, num_channels, fmt);
+
+if strcmp(fmt, 's16')
+	idx1 = find(data == magic(1));
+	idx = [];
+	for i = 1:length(idx1)
+		if idx1(i) < length(data) && data(idx1(i) + 1) == magic(2) % no look-ahead past the end
+			idx = [idx idx1(i)];
+		end
+	end
+else
+	idx = find(data == magic);
+end
+
+if numel(idx) < 2 % the frame period below is measured between the first two markers
+	error('Need at least two magic value markers to decode the stream, found %d', numel(idx));
+end
+
+period_mel = idx(2)-idx(1);
+num_frames = length(idx);
+
+% Last frame can be incomplete if the data ends inside it, marker words included
+last = idx(end) + num_magic + num_mel - 1;
+if (last > length(data))
+    num_frames = num_frames - 1;
+end
+
+t_mel = period_mel / num_channels / fs;
+t = (0:num_frames -1) * t_mel;
+n = 1:num_mel;
+
+mel = zeros(num_mel, num_frames);
+for i = 1:num_frames
+	i1 = idx(i) + num_magic; % payload starts right after the marker
+	i2 = i1 + num_mel - 1;
+	mel(:,i) = double(data(i1:i2)) / 2^qformat;
+end
+
+figure;
+imagesc(t, n, mel);
+axis xy;
+colormap(jet);
+colorbar;
+tstr = sprintf('SOF MFCC Mel coefficients (%s)', fn);
+title(tstr, 'Interpreter', 'None');
+xlabel('Time (s)');
+ylabel('Mel coef #');
+
+end
+
+function [data, num_channels] = get_file(fn, num_channels, fmt) % Load samples as one interleaved column
+[~, ~, ext] = fileparts(fn);
+switch fmt
+	case 's16'
+		read_fmt = 'int16';
+	case {'s24', 's32'}
+		read_fmt = 'int32';
+	otherwise
+		error('Unknown sample format %s', fmt);
+end
+
+switch lower(ext)
+	case '.raw'
+		[fh, msg] = fopen(fn, 'r');
+		if fh < 0, error('Could not open %s: %s', fn, msg); end
+		data = fread(fh, read_fmt);
+		fclose(fh);
+	case '.wav'
+		tmp = audioread(fn, 'native');
+		t = whos('tmp');
+		switch fmt
+			case 's16'
+				if ~strcmp(t.class, 'int16')
+					error('Expected 16-bit wav for s16 format');
+				end
+			case {'s24', 's32'}
+				if ~strcmp(t.class, 'int32')
+					error('Expected 32-bit wav for %s format', fmt);
+				end
+		end
+		s = size(tmp);
+		num_channels = s(2); % the channel count of the wav file overrides the argument
+		if num_channels > 1
+			data = zeros(prod(s), 1, t.class);
+			for i = 1:num_channels
+				data(i:num_channels:end) = tmp(:, i); % interleave channels sample by sample
+			end
+		else
+			data = tmp;
+		end
+	otherwise
+		error('Unknown audio format');
+end
+end
diff --git a/src/audio/mfcc/tune/run_mfcc.sh b/src/audio/mfcc/tune/run_mfcc.sh
index d531e4519755..e3c309fbc03e 100755
--- a/src/audio/mfcc/tune/run_mfcc.sh
+++ b/src/audio/mfcc/tune/run_mfcc.sh
@@ -4,19 +4,52 @@
 
 set -e
 
-RAW_INPUT=in.raw
-RAW_OUTPUT=mfcc.raw
+RAW_INPUT_S16=in_s16.raw
+RAW_INPUT_S24=in_s24.raw
+RAW_INPUT_S32=in_s32.raw
 
+VALGRIND="valgrind --leak-check=full"
+#VALGRIND=""
 TESTBENCH=$SOF_WORKSPACE/sof/tools/testbench/build_testbench/install/bin/sof-testbench4
-TOPOLOGY=$SOF_WORKSPACE/sof/tools/build_tools/topology/topology2/development/sof-hda-benchmark-mfcc16.tplg
-OPT="-r 16000 -c 2 -b S16_LE -p 3,4 -t $TOPOLOGY -i $RAW_INPUT -o $RAW_OUTPUT"
+TESTBENCH_RUN="$VALGRIND $TESTBENCH"
 
-# Convert input audio file raw 16 kHz 1 channel 16 bit
-sox --encoding signed-integer "$1" -L -r 16000 -c 1 -b 16 "$RAW_INPUT"
+convert_input() { # Convert audio file "$1" into the three raw testbench input files
+	sox -R --encoding signed-integer "$1" -L -r 16000 -c 2 -b 16 "$RAW_INPUT_S16" # 16 kHz, 2 channels, 16 bits
+	sox -R --no-dither --encoding signed-integer -L -r 16000 -c 2 -b 16 \
+		"$RAW_INPUT_S16" -b 32 "$RAW_INPUT_S32" # the 16-bit file re-encoded with 32-bit samples
+	sox -R --no-dither --encoding signed-integer -L -r 16000 -c 2 -b 16 \
+		"$RAW_INPUT_S16" -b 32 "$RAW_INPUT_S24" vol 0.003906250000 # 32-bit samples scaled by 1/256 (8 bits down)
+}
 
-# Run testbench
-$TESTBENCH $OPT -i "$RAW_INPUT" -o "$RAW_OUTPUT"
+run_testbench() { # Run the testbench once per sample format with the matching topology
+	local tplg_base="$1" # topology file name without the 16/24/32 suffix and .tplg
+	local out_s16="$2" # output file for the S16_LE run
+	local out_s24="$3" # output file for the S24_LE run
+	local out_s32="$4" # output file for the S32_LE run
+	local label="$5" # text used in the summary message
+	local tplg_s16="${SOF_WORKSPACE}/sof/tools/build_tools/topology/topology2/development/${tplg_base}16.tplg"
+	local tplg_s24="${SOF_WORKSPACE}/sof/tools/build_tools/topology/topology2/development/${tplg_base}24.tplg"
+	local tplg_s32="${SOF_WORKSPACE}/sof/tools/build_tools/topology/topology2/development/${tplg_base}32.tplg"
+	# TESTBENCH_RUN must stay unquoted, it holds a command followed by its arguments
+	$TESTBENCH_RUN -r 16000 -c 2 -b S16_LE -p 3,4 -t "$tplg_s16" -i "$RAW_INPUT_S16" -o "$out_s16"
+	$TESTBENCH_RUN -r 16000 -c 2 -b S24_LE -p 3,4 -t "$tplg_s24" -i "$RAW_INPUT_S24" -o "$out_s24"
+	$TESTBENCH_RUN -r 16000 -c 2 -b S32_LE -p 3,4 -t "$tplg_s32" -i "$RAW_INPUT_S32" -o "$out_s32"
+
+	echo ----------------------------------------------------------------------------------
+	echo "The ${label} data was output to file ${out_s16}, ${out_s24}, ${out_s32}"
+	echo ----------------------------------------------------------------------------------
+}
+
+main() { # $1: input audio file in a format sox can read
+	convert_input "${1:?Usage: $0 <input audio file>}"
+	run_testbench "sof-hda-benchmark-mfcc" mfcc_s16.raw mfcc_s24.raw mfcc_s32.raw "MFCC"
+	run_testbench "sof-hda-benchmark-mfccmel" mel_s16.raw mel_s24.raw mel_s32.raw "MFCC Mel"
+
+	if [ -n "$XTENSA_PATH" ]; then # repeat both runs with the Xtensa build when xt-run is available
+		TESTBENCH_RUN="$XTENSA_PATH/xt-run $SOF_WORKSPACE/sof/tools/testbench/build_xt_testbench/sof-testbench4"
+		run_testbench "sof-hda-benchmark-mfcc" xt_mfcc_s16.raw xt_mfcc_s24.raw xt_mfcc_s32.raw "Xtensa MFCC"
+		run_testbench "sof-hda-benchmark-mfccmel" xt_mel_s16.raw xt_mel_s24.raw xt_mel_s32.raw "Xtensa MFCC Mel"
+	fi
+}
+
+main "$@"
diff --git a/src/audio/mfcc/tune/setup_mfcc.m b/src/audio/mfcc/tune/setup_mfcc.m
index e0d42e1e034d..bd2b3f11e60b 100644
--- a/src/audio/mfcc/tune/setup_mfcc.m
+++ b/src/audio/mfcc/tune/setup_mfcc.m
@@ -1,23 +1,36 @@
-% setup_mfcc(cfg)
+% setup_mfcc()
 %
-% Input
-%   cfg - optional MFCC configuration parameters struct, see
-%         below from code
-%
-% Create binary configuration blob for MFCC component. The hex data
-% is written to tools/topology/topology2/include/components/mfcc and
-% tools/topology/topology1/m4/mfcc.
+% Create binary configuration blobs for the MFCC component.
+% The hex data is written to files in directory
+% tools/topology/topology2/include/components/mfcc.
 
 % SPDX-License-Identifier: BSD-3-Clause
 %
-% Copyright (c) 2018-2026, Intel Corporation. All rights reserved.
+% Copyright (c) 2018-2026, Intel Corporation.
+
+function setup_mfcc()
+
+	gen_cfg.tplg_ver = 2;
+	gen_cfg.ipc_ver = 4;
+	gen_cfg.tools_path = '../../../../tools/';
+	gen_cfg.mfcc_conf_path = [gen_cfg.tools_path 'topology/topology2/include/components/mfcc/'];
+
+	% Default blob
+	setup = get_mfcc_default_config();
+	setup.tplg_fn = 'default.conf';
+	export_mfcc_setup(gen_cfg, setup);
 
-function setup_mfcc(cfg)
+	% Blob for mel spectrogram data
+	setup = get_mel_spectrogram_config();
+	setup.tplg_fn = 'mel80.conf';
+	export_mfcc_setup(gen_cfg, setup);
 
-if nargin < 1
+end
+
+function cfg = get_mfcc_default_config()
 	cfg.blackman_coef = 0.42;
 	cfg.cepstral_lifter = 22.0;
-	cfg.channel = -1; % -1 expect mono, 0 left, 1 right ...
+	cfg.channel = 0; % -1 expect mono, 0 left, 1 right ...
 	cfg.dither = 0.0; % no support
 	cfg.energy_floor = 1.0;
 	cfg.frame_length = 25.0; % ms
@@ -44,26 +57,54 @@ function setup_mfcc(cfg)
 	cfg.mel_log = 'log'; % Set to 'db' for librosa, set to 'log10' for matlab
 	cfg.pmin = 5e-10; % Set to 1e-10 for librosa
 	cfg.top_db = 200; % Set to 80 for librosa
+	cfg.mel_offset = 0; % For mel_only mode, no impact with num_ceps > 0
+	cfg.mel_scale = 0; % same
+	cfg.mmax_init = 0; % same
+	cfg.mmax_coef = 0; % same
+	cfg.dynamic_mmax = false; % same
 end
 
-cfg.tools = '../../../../tools/';
-
-cfg.tplg_fn = [cfg.tools 'topology/topology1/m4/mfcc/mfcc_config.m4'];
-cfg.tplg_ver = 1;
-cfg.ipc_ver = 3;
-export_mfcc_setup(cfg);
-
-cfg.tplg_fn = [cfg.tools 'topology/topology2/include/components/mfcc/default.conf'];
-cfg.tplg_ver = 2;
-cfg.ipc_ver = 4;
-export_mfcc_setup(cfg);
-
+function cfg = get_mel_spectrogram_config() % Settings exported to mel80.conf by setup_mfcc()
+	cfg.blackman_coef = 0;
+	cfg.cepstral_lifter = 0;
+	cfg.channel = 0; % 0 is left channel
+	cfg.dither = 0;
+	cfg.energy_floor = 1.0;
+	cfg.frame_length = 25.0; % 400 samples at 16 kHz
+	cfg.frame_shift = 10.0; % 160 samples at 16 kHz
+	cfg.high_freq = 8000; % Half of sample_frequency
+	cfg.htk_compat = false;
+	cfg.low_freq = 0;
+	cfg.num_ceps = 0; % Mel-only mode, no DCT
+	cfg.min_duration = 0;
+	cfg.norm = 'slaney'; % Exported as 1 by get_norm_value()
+	cfg.num_mel_bins = 80;
+	cfg.preemphasis_coefficient = 0; % 0 disables pre-emphasis
+	cfg.raw_energy = false;
+	cfg.remove_dc_offset = false;
+	cfg.round_to_power_of_two = true;
+	cfg.sample_frequency = 16000; % Hz
+	cfg.snip_edges = true; % Must be true per struct sof_mfcc_config
+	cfg.subtract_mean = false; % Must be false per struct sof_mfcc_config
+	cfg.use_energy = false; % Must be false per struct sof_mfcc_config
+	cfg.vtln_high = 0; % Reserved, no support
+	cfg.vtln_low = 0; % Reserved, no support
+	cfg.vtln_warp = 1.0; % Reserved, no support
+	cfg.window_type = 'hann';
+	cfg.mel_log = 'log10'; % Exported as 1 by get_mel_log_value()
+	cfg.pmin = 1e-10; % NOTE(review): below Q1.31 step 2^-31 (~4.7e-10), mel80.conf has 0 here - confirm
+	cfg.top_db = 8; % applied for log10, would be 80 dB clamp for decibels as 10*log10()
+	cfg.mel_offset = 4.0; % For whisper like Mel scale and normalize
+	cfg.mel_scale = 0.25; % For whisper like Mel scale and normalize
+	cfg.mmax_init = 0; % Initial max Mel value, data is clamped at mmax - top_db
+	cfg.mmax_coef = 0; % Decay coefficient for dynamic max Mel value (zero locks to found max)
+	cfg.dynamic_mmax = true; % Track max Mel value for clamp with top_db
 end
 
-function export_mfcc_setup(cfg)
+function export_mfcc_setup(gen_cfg, cfg)
 
 %% Use blob tool from EQ
-addpath([cfg.tools 'tune/common']);
+addpath([gen_cfg.tools_path 'tune/common']);
 
 %% Blob size, size plus reserved(8) + current parameters
 nbytes_data = 104;
@@ -73,7 +114,7 @@ function export_mfcc_setup(cfg)
 sh16 = [0 -8];
 
 %% Get ABI information
-[abi_bytes, nbytes_abi] = sof_get_abi(nbytes_data, cfg.ipc_ver);
+[abi_bytes, nbytes_abi] = sof_get_abi(nbytes_data, gen_cfg.ipc_ver);
 
 %% Initialize correct size uint8 array
 nbytes = nbytes_abi + nbytes_data;
@@ -86,14 +127,21 @@ function export_mfcc_setup(cfg)
 
 %% Apply default MFCC configuration, first struct header and reserved, then data
 [b8, j] = add_w32b(nbytes_data, b8, j);
-for i = 1:8
+
+v = q_convert(cfg.mel_offset, 7);                [b8, j] = add_w16b(v, b8, j);
+v = q_convert(cfg.mel_scale, 12);                [b8, j] = add_w16b(v, b8, j);
+v = q_convert(cfg.mmax_init, 7);                 [b8, j] = add_w16b(v, b8, j);
+v = q_convert(cfg.mmax_coef, 15);                [b8, j] = add_w16b(v, b8, j);
+
+% Reserved
+for i = 1:6
 	[b8, j] = add_w32b(0, b8, j);
 end
 
 v = q_convert(cfg.sample_frequency, 0);          [b8, j] = add_w32b(v, b8, j);
 v = q_convert(cfg.pmin, 31);                     [b8, j] = add_w32b(v, b8, j);
-v = 0;                                           [b8, j] = add_w32b(v, b8, j); % enum mel_log
-v = 0;                                           [b8, j] = add_w32b(v, b8, j); % enum norm
+v = get_mel_log_value(cfg.mel_log);              [b8, j] = add_w32b(v, b8, j); % enum mel_log
+v = get_norm_value(cfg.norm);                    [b8, j] = add_w32b(v, b8, j); % enum norm
 v = 0;                                           [b8, j] = add_w32b(v, b8, j); % enum pad
 v = get_window(cfg);                             [b8, j] = add_w32b(v, b8, j); % enum window
 v = 1;                                           [b8, j] = add_w32b(v, b8, j); % enum dct type
@@ -119,22 +167,24 @@ function export_mfcc_setup(cfg)
 v = cfg.snip_edges;                              [b8, j] = add_w8b(v, b8, j); % bool
 v = cfg.subtract_mean;                           [b8, j] = add_w8b(v, b8, j); % bool
 v = cfg.use_energy;                              [b8, j] = add_w8b(v, b8, j); % bool
+v = cfg.dynamic_mmax;                            [b8, j] = add_w8b(v, b8, j); % bool
 
 %% Export
-switch cfg.tplg_ver
+tplg_fn = [gen_cfg.mfcc_conf_path cfg.tplg_fn];
+switch gen_cfg.tplg_ver
        case 1
-	       sof_tplg_write(cfg.tplg_fn, b8, "DEF_MFCC_PRIV", ...
+	       sof_tplg_write(tplg_fn, b8, "DEF_MFCC_PRIV", ...
 			      "Exported with script setup_mfcc.m", ...
 			      "cd src/audio/mfcc/tune; octave setup_mfcc.m");
        case 2
-	       sof_tplg2_write(cfg.tplg_fn, b8, "mfcc_config", ...
+	       sof_tplg2_write(tplg_fn, b8, "mfcc_config", ...
 			       "Exported MFCC configuration", ...
 			       "cd src/audio/mfcc/tune; octave setup_mfcc.m");
        otherwise
-	       error("Illegal cfg.tplg_ver, use 1 for topology v1 or 2 topology v2.");
+	       error("Illegal tplg_ver, use 1 for topology v1 or 2 topology v2.");
 end
 
-rmpath([cfg.tools 'tune/common']);
+rmpath([gen_cfg.tools_path 'tune/common']);
 
 end
 
@@ -157,6 +207,30 @@ function export_mfcc_setup(cfg)
 	end
 end
 
+function n = get_mel_log_value(mel_log) % Map mel_log name to value for blob field "enum mel_log"
+	switch lower(mel_log) % Name match is case-insensitive
+		case 'log'
+			n = 0;
+		case 'log10'
+			n = 1;
+		case 'db'
+			n = 2;
+		otherwise
+			error('Unknown mel_log type: %s', mel_log); % Name the rejected value
+	end
+end
+
+function n = get_norm_value(norm_type) % Map norm name to value for blob field "enum norm"
+	switch lower(norm_type) % Not named norm, that would shadow built-in norm()
+		case 'none'
+			n = 0;
+		case 'slaney'
+			n = 1;
+		otherwise
+			error('Unknown norm type: %s', norm_type); % Name the rejected value
+	end
+end
+
 function bytes = w8b(word)
 bytes = uint8(zeros(1,1));
 bytes(1) = bitand(word, 255);
diff --git a/src/include/sof/audio/mfcc/mfcc_comp.h b/src/include/sof/audio/mfcc/mfcc_comp.h
index 7323428ec37d..abee71faf947 100644
--- a/src/include/sof/audio/mfcc/mfcc_comp.h
+++ b/src/include/sof/audio/mfcc/mfcc_comp.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  *
- * Copyright(c) 2022 Intel Corporation. All rights reserved.
+ * Copyright(c) 2022-2026 Intel Corporation.
  *
  * Author: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
  */
@@ -36,7 +36,7 @@
  * set to 32 the FFT and Mel filterbank are computed with better 32 bit precision. There
  * is also need to enable 32 bit FFT from Kconfig if set.
  */
-#define MFCC_FFT_BITS	16
+#define MFCC_FFT_BITS	32
 
 /* MFCC with 16 bit FFT benefits from data normalize, for 32 bits there's no
  * significant impact. The amount of left shifts for FFT input is limited to
@@ -114,6 +114,8 @@ struct mfcc_state {
 	struct mat_matrix_16b *mel_spectra; /**< Pointer to scratch */
 	struct mat_matrix_16b *cepstral_coef; /**< Pointer to scratch */
 	int32_t *power_spectra; /**< Pointer to scratch */
+	int32_t *mel_log_32; /**< Pointer to scratch for 32-bit Mel output Q9.23 */
+	int32_t mmax; /**< Maximum Mel value in Q9.23 */
 	int16_t buf_avail;
 	int16_t *buffers;
 	int16_t *prev_data; /**< prev_data_size */
@@ -125,9 +127,14 @@ struct mfcc_state {
 	int low_freq;
 	int high_freq;
 	int sample_rate;
+	bool mel_only; /**< When true, output Mel spectra instead of cepstral coefficients */
 	bool waiting_fill; /**< booleans */
 	bool prev_samples_valid;
+	bool magic_pending; /**< True when magic word not yet written for current output */
 	size_t sample_buffers_size; /**< bytes */
+	int16_t *out_data_ptr; /**< Read pointer into scratch data for multi-period output */
+	int32_t *out_data_ptr_32; /**< Read pointer for 32-bit mel-only output */
+	int out_remain; /**< Remaining int16_t samples to write to sink from scratch */
 };
 
 /* MFCC component private data */
@@ -156,12 +163,6 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int rate, int chan
 
 void mfcc_free_buffers(struct processing_module *mod);
 
-void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer *bsource,
-		      struct output_stream_buffer *bsink, int frames);
-
-void mfcc_source_copy_s16(struct input_stream_buffer *bsource, struct mfcc_buffer *buf,
-			  struct mfcc_pre_emph *emph, int frames, int source_channel);
-
 void mfcc_fill_prev_samples(struct mfcc_buffer *buf, int16_t *prev_data,
 			    int prev_data_length);
 
@@ -175,16 +176,31 @@ void mfcc_apply_window(struct mfcc_state *state, int input_shift);
 
 #if CONFIG_FORMAT_S16LE
 
-int16_t *mfcc_sink_copy_zero_s16(const struct audio_stream *sink,
-				 int16_t *w_ptr, int samples);
-
-int16_t *mfcc_sink_copy_data_s16(const struct audio_stream *sink, int16_t *w_ptr,
-				 int samples, int16_t *r_ptr);
+void mfcc_source_copy_s16(struct input_stream_buffer *bsource, struct mfcc_buffer *buf,
+			  struct mfcc_pre_emph *emph, int frames, int source_channel);
 
 void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer *bsource,
 		      struct output_stream_buffer *bsink, int frames);
 #endif
 
+#if CONFIG_FORMAT_S24LE
+
+void mfcc_source_copy_s24(struct input_stream_buffer *bsource, struct mfcc_buffer *buf,
+			  struct mfcc_pre_emph *emph, int frames, int source_channel);
+
+void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer *bsource,
+		      struct output_stream_buffer *bsink, int frames);
+#endif
+
+#if CONFIG_FORMAT_S32LE
+
+void mfcc_source_copy_s32(struct input_stream_buffer *bsource, struct mfcc_buffer *buf,
+			  struct mfcc_pre_emph *emph, int frames, int source_channel);
+
+void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer *bsource,
+		      struct output_stream_buffer *bsink, int frames);
+#endif
+
 #ifdef UNIT_TEST
 void sys_comp_module_mfcc_interface_init(void);
 #endif
diff --git a/src/include/sof/math/auditory.h b/src/include/sof/math/auditory.h
index b09017786e36..b3fd46dcf26f 100644
--- a/src/include/sof/math/auditory.h
+++ b/src/include/sof/math/auditory.h
@@ -108,6 +108,6 @@ void psy_apply_mel_filterbank_16(struct psy_mel_filterbank *mel_fb, struct icomp
  *                           be subtracted from the log or decibels notation.
  */
 void psy_apply_mel_filterbank_32(struct psy_mel_filterbank *mel_fb, struct icomplex32 *fft_out,
-				 int32_t *power_spectra, int16_t *mel_log, int bitshift);
+				 int32_t *power_spectra, int32_t *mel_log, int bitshift);
 
 #endif /* __SOF_MATH_AUDITORY_H__ */
diff --git a/src/include/sof/math/fft.h b/src/include/sof/math/fft.h
index df06baf47c81..f98cb724506a 100644
--- a/src/include/sof/math/fft.h
+++ b/src/include/sof/math/fft.h
@@ -11,6 +11,7 @@
 
 #include <sof/audio/module_adapter/module/generic.h>
 #include <sof/audio/format.h>
+#include <sof/math/icomplex16.h>
 #include <sof/math/icomplex32.h>
 #include <sof/common.h>
 #include <stdbool.h>
diff --git a/src/include/user/mfcc.h b/src/include/user/mfcc.h
index 7a5b7fcca98e..8a0defcd9883 100644
--- a/src/include/user/mfcc.h
+++ b/src/include/user/mfcc.h
@@ -50,7 +50,11 @@ enum sof_mfcc_dct_type {
  */
 struct sof_mfcc_config {
 	uint32_t size; /**< Size of this struct in bytes */
-	uint32_t reserved[8];
+	int16_t mel_offset; /**< Q8.7 default 0, use 4.0 for Whisper */
+	int16_t mel_scale; /**< Q4.12 default 1.0, use 0.25 for Whisper */
+	int16_t mmax_init; /**< Q8.7 default 0, with dynamic_mmax false, can sim. Whisper mmax */
+	int16_t mmax_coef; /**< Q1.15 decay coefficient for dynamic mmax, a small value for slow */
+	uint32_t reserved[6];
 	int32_t sample_frequency; /**< Hz. e.g. 16000 */
 	int32_t pmin; /**< Q1.31 linear power, limit minimum Mel energy, e.g. 1e-9 */
 	enum sof_mfcc_mel_log_type mel_log; /**< Use MEL_LOG_IS_LOG, LOG10 or DB*/
@@ -69,7 +73,7 @@ struct sof_mfcc_config {
 	int16_t num_ceps; /**< Number of cepstral coefficients, e.g. 13 */
 	int16_t num_mel_bins; /**< Number of internal Mel bands, e.g. 23 */
 	int16_t preemphasis_coefficient; /**< Q1.15, e.g. 0.97, or 0 for disable */
-	int16_t top_db; /**< Q8.7 dB, limit Mel energies to this value e.g. 200 */
+	int16_t top_db; /**< Q8.7 dB, limit min. Mel energies to chunk max - top_dB, e.g. 80 */
 	int16_t vtln_high; /**< Reserved, no support */
 	int16_t vtln_low; /**< Reserved, no support */
 	int16_t vtln_warp; /**< Reserved, no support */
@@ -80,7 +84,7 @@ struct sof_mfcc_config {
 	bool snip_edges; /**< Must be true (1) */
 	bool subtract_mean; /**< Must be false (0) */
 	bool use_energy; /**< Must be false (0) */
-	bool reserved_bool1;
+	bool dynamic_mmax; /**< Track max Mel value for clamp with top_db value */
 	bool reserved_bool2;
 	bool reserved_bool3;
 } __attribute__((packed));
diff --git a/src/math/auditory/mel_filterbank_32.c b/src/math/auditory/mel_filterbank_32.c
index a80d09ad624a..414ddf482f93 100644
--- a/src/math/auditory/mel_filterbank_32.c
+++ b/src/math/auditory/mel_filterbank_32.c
@@ -12,7 +12,7 @@
 #include <stdint.h>
 
 void psy_apply_mel_filterbank_32(struct psy_mel_filterbank *fb, struct icomplex32 *fft_out,
-				 int32_t *power_spectra, int16_t *mel_log, int bitshift)
+				 int32_t *power_spectra, int32_t *mel_log, int bitshift)
 {
 	int64_t pmax;
 	int64_t p;
@@ -79,8 +79,8 @@ void psy_apply_mel_filterbank_32(struct psy_mel_filterbank *fb, struct icomplex3
 		 */
 		log -= ((int32_t)lshift + 2 * bitshift) << 16;
 
-		/* Scale for desired log  */
-		log = Q_MULTSR_32X32((int64_t)log, fb->log_mult, 16, 29, 7);
-		mel_log[i] = sat_int16(log); /* Q8.7 */
+		/* Scale for desired log, output as Q9.23 */
+		log = Q_MULTSR_32X32((int64_t)log, fb->log_mult, 16, 29, 23);
+		mel_log[i] = log; /* Q9.23 */
 	}
 }
diff --git a/test/cmocka/src/math/auditory/auditory.c b/test/cmocka/src/math/auditory/auditory.c
index dc05c387cfae..ff222e52fadd 100644
--- a/test/cmocka/src/math/auditory/auditory.c
+++ b/test/cmocka/src/math/auditory/auditory.c
@@ -163,7 +163,8 @@ static void filterbank_32_test(const int32_t *fft_real, const int32_t *fft_imag,
 	float error_rms;
 	float delta_max = 0;
 	int32_t *power_spectra;
-	int16_t *mel_log;
+	int32_t *mel_log;
+	int16_t mel_log_16;
 	int i;
 	const int half_fft = num_fft_bins / 2 + 1;
 	const int fft_size = num_fft_bins * sizeof(struct icomplex32);
@@ -181,7 +182,7 @@ static void filterbank_32_test(const int32_t *fft_real, const int32_t *fft_imag,
 		goto err_out_alloc;
 	}
 
-	mel_log = malloc(MEL_FILTERBANK_32_TEST1_NUM_MEL_BINS * sizeof(int16_t));
+	mel_log = malloc(num_mel_bins * sizeof(int32_t));
 	if (!mel_log) {
 		fprintf(stderr, "Failed to allocate output vector\n");
 		goto err_mel_alloc;
@@ -215,9 +216,10 @@ static void filterbank_32_test(const int32_t *fft_real, const int32_t *fft_imag,
 	power_spectra = (int32_t *)&fft_buf[0];
 	psy_apply_mel_filterbank_32(&fb, fft_out, power_spectra, mel_log, shift);
 
-	/* Check */
+	/* Check: convert Q9.23 output to Q9.7 for comparison with reference */
 	for (i = 0; i < num_mel_bins; i++) {
-		delta = (float)ref_mel_log[i] - (float)mel_log[i];
+		mel_log_16 = (int16_t)(mel_log[i] >> 16);
+		delta = (float)ref_mel_log[i] - (float)mel_log_16;
 		sum_squares += delta * delta;
 		if (delta > delta_max)
 			delta_max = delta;
@@ -233,7 +235,7 @@ static void filterbank_32_test(const int32_t *fft_real, const int32_t *fft_imag,
 	FILE *fh = fopen("mel_filterbank_32.txt", "w");
 
 	for (i = 0; i < num_mel_bins; i++)
-		fprintf(fh, "%d %d\n", ref_mel_log[i], mel_log[i]);
+		fprintf(fh, "%d %d\n", ref_mel_log[i], (int16_t)(mel_log[i] >> 16));
 
 	fclose(fh);
 #endif
diff --git a/tools/topology/topology2/cavs-benchmark-hda.conf b/tools/topology/topology2/cavs-benchmark-hda.conf
index 62c0ad4f4fbc..95ab67431812 100644
--- a/tools/topology/topology2/cavs-benchmark-hda.conf
+++ b/tools/topology/topology2/cavs-benchmark-hda.conf
@@ -834,6 +834,16 @@ IncludeByKey.BENCH_CONFIG {
 		<include/bench/mfcc_s32.conf>
 	}
 
+	"mfccmel16" {
+		<include/bench/mfccmel_s16.conf>
+	}
+	"mfccmel24" {
+		<include/bench/mfccmel_s24.conf>
+	}
+	"mfccmel32" {
+		<include/bench/mfccmel_s32.conf>
+	}
+
 	#
 	# Micsel component
 	#
diff --git a/tools/topology/topology2/development/tplg-targets-bench.cmake b/tools/topology/topology2/development/tplg-targets-bench.cmake
index eff707d49aa9..5c0f82dc7dfc 100644
--- a/tools/topology/topology2/development/tplg-targets-bench.cmake
+++ b/tools/topology/topology2/development/tplg-targets-bench.cmake
@@ -19,6 +19,7 @@ set(components
 	"igo_nr"
 	"level_multiplier"
 	"mfcc"
+	"mfccmel"
 	"micsel"
 	"rtnr"
 	"sound_dose"
@@ -45,6 +46,7 @@ set(component_parameters
 	"BENCH_IGO_NR_PARAMS=default"
 	"BENCH_LEVEL_MULTIPLIER_PARAMS=default"
 	"BENCH_MFCC_PARAMS=default"
+	"BENCH_MFCC_PARAMS=mel80"
 	"BENCH_MICSEL_PARAMS=passthrough"
 	"BENCH_RTNR_PARAMS=default"
 	"BENCH_SOUND_DOSE_PARAMS=default"
diff --git a/tools/topology/topology2/include/bench/mfcc_controls_capture.conf b/tools/topology/topology2/include/bench/mfcc_controls_capture.conf
index 56a731b86687..d45baec1ee8f 100644
--- a/tools/topology/topology2/include/bench/mfcc_controls_capture.conf
+++ b/tools/topology/topology2/include/bench/mfcc_controls_capture.conf
@@ -6,6 +6,7 @@
 					name '$ANALOG_CAPTURE_PCM MFCC bytes'
 					IncludeByKey.BENCH_MFCC_PARAMS {
 						"default" "include/components/mfcc/default.conf"
+						"mel80" "include/components/mfcc/mel80.conf"
 					}
 				}
 				#mixer."1" {
diff --git a/tools/topology/topology2/include/bench/mfcc_controls_playback.conf b/tools/topology/topology2/include/bench/mfcc_controls_playback.conf
index 7649678c8468..cc2ada04b8d7 100644
--- a/tools/topology/topology2/include/bench/mfcc_controls_playback.conf
+++ b/tools/topology/topology2/include/bench/mfcc_controls_playback.conf
@@ -6,6 +6,7 @@
 					name '$ANALOG_PLAYBACK_PCM MFCC bytes'
 					IncludeByKey.BENCH_MFCC_PARAMS {
 						"default" "include/components/mfcc/default.conf"
+						"mel80" "include/components/mfcc/mel80.conf"
 					}
 				}
 				#mixer."1" {
diff --git a/tools/topology/topology2/include/bench/mfccmel_s16.conf b/tools/topology/topology2/include/bench/mfccmel_s16.conf
new file mode 100644
index 000000000000..ec89bffb90a1
--- /dev/null
+++ b/tools/topology/topology2/include/bench/mfccmel_s16.conf
@@ -0,0 +1,13 @@
+		# Created with script "./bench_comp_generate.sh mfcc"
+		Object.Widget.mfcc.1 {
+			index $BENCH_PLAYBACK_HOST_PIPELINE
+			<include/bench/one_input_output_format_s16_16k.conf>
+			<include/bench/mfcc_controls_playback.conf>
+		}
+		Object.Widget.mfcc.2 {
+			index $BENCH_CAPTURE_HOST_PIPELINE
+			<include/bench/one_input_output_format_s16_16k.conf>
+			<include/bench/mfcc_controls_capture.conf>
+		}
+		<include/bench/host_gateway_pipelines_s16_16k.conf>
+		<include/bench/mfcc_route.conf>
diff --git a/tools/topology/topology2/include/bench/mfccmel_s24.conf b/tools/topology/topology2/include/bench/mfccmel_s24.conf
new file mode 100644
index 000000000000..73571fabe5f2
--- /dev/null
+++ b/tools/topology/topology2/include/bench/mfccmel_s24.conf
@@ -0,0 +1,13 @@
+		# Created with script "./bench_comp_generate.sh mfcc"
+		Object.Widget.mfcc.1 {
+			index $BENCH_PLAYBACK_HOST_PIPELINE
+			<include/bench/one_input_output_format_s24_16k.conf>
+			<include/bench/mfcc_controls_playback.conf>
+		}
+		Object.Widget.mfcc.2 {
+			index $BENCH_CAPTURE_HOST_PIPELINE
+			<include/bench/one_input_output_format_s24_16k.conf>
+			<include/bench/mfcc_controls_capture.conf>
+		}
+		<include/bench/host_gateway_pipelines_s24_16k.conf>
+		<include/bench/mfcc_route.conf>
diff --git a/tools/topology/topology2/include/bench/mfccmel_s32.conf b/tools/topology/topology2/include/bench/mfccmel_s32.conf
new file mode 100644
index 000000000000..75c01eaf4a43
--- /dev/null
+++ b/tools/topology/topology2/include/bench/mfccmel_s32.conf
@@ -0,0 +1,13 @@
+		# Created with script "./bench_comp_generate.sh mfcc"
+		Object.Widget.mfcc.1 {
+			index $BENCH_PLAYBACK_HOST_PIPELINE
+			<include/bench/one_input_output_format_s32_16k.conf>
+			<include/bench/mfcc_controls_playback.conf>
+		}
+		Object.Widget.mfcc.2 {
+			index $BENCH_CAPTURE_HOST_PIPELINE
+			<include/bench/one_input_output_format_s32_16k.conf>
+			<include/bench/mfcc_controls_capture.conf>
+		}
+		<include/bench/host_gateway_pipelines_s32_16k.conf>
+		<include/bench/mfcc_route.conf>
diff --git a/tools/topology/topology2/include/components/mfcc/default.conf b/tools/topology/topology2/include/components/mfcc/default.conf
index 1f9141886de9..42a6d6608b8b 100644
--- a/tools/topology/topology2/include/components/mfcc/default.conf
+++ b/tools/topology/topology2/include/components/mfcc/default.conf
@@ -1,9 +1,9 @@
-# Exported MFCC configuration 24-Jul-2024
-# cd tools/tune/mfcc; octave setup_mfcc.m
+# Exported MFCC configuration 05-May-2026
+# cd src/audio/mfcc/tune; octave setup_mfcc.m
 Object.Base.data."mfcc_config" {
 	bytes "
 		0x53,0x4f,0x46,0x34,0x00,0x00,0x00,0x00,
-		0x68,0x00,0x00,0x00,0x00,0xa0,0x01,0x03,
+		0x68,0x00,0x00,0x00,0x01,0xd0,0x01,0x03,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 		0x68,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
@@ -14,7 +14,7 @@ Object.Base.data."mfcc_config" {
 		0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 		0x02,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-		0xc3,0x35,0x00,0x2c,0xff,0xff,0x00,0x00,
+		0xc3,0x35,0x00,0x2c,0x00,0x00,0x00,0x00,
 		0x90,0x01,0xa0,0x00,0x00,0x00,0x14,0x00,
 		0x0d,0x00,0x17,0x00,0x00,0x00,0x00,0x64,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01,
diff --git a/tools/topology/topology2/include/components/mfcc/mel80.conf b/tools/topology/topology2/include/components/mfcc/mel80.conf
new file mode 100644
index 000000000000..04aa2a15c660
--- /dev/null
+++ b/tools/topology/topology2/include/components/mfcc/mel80.conf
@@ -0,0 +1,22 @@
+# Exported MFCC configuration 05-May-2026
+# cd src/audio/mfcc/tune; octave setup_mfcc.m
+Object.Base.data."mfcc_config" {
+	bytes "
+		0x53,0x4f,0x46,0x34,0x00,0x00,0x00,0x00,
+		0x68,0x00,0x00,0x00,0x01,0xd0,0x01,0x03,
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x68,0x00,0x00,0x00,0x00,0x02,0x00,0x04,
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x80,0x3e,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
+		0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x03,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x90,0x01,0xa0,0x00,0x40,0x1f,0x00,0x00,
+		0x00,0x00,0x50,0x00,0x00,0x00,0x00,0x04,
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x00,0x01,0x01,0x00,0x00,0x01,0x00,0x00"
+}