From 6d13c9d7f9a11b3b70941ae162ba428ac239e452 Mon Sep 17 00:00:00 2001 From: Seppo Ingalsuo Date: Tue, 12 May 2026 14:13:08 +0300 Subject: [PATCH 1/8] Audio: MFCC: Add Voice Activity Detection based on Mel spectrum Add mfcc_vad module with A-weighted energy-based voice activity detection that operates on the Mel log spectrum produced by the MFCC component. The algorithm tracks a per-bin noise floor with instant-down and slow-rise behavior, then computes a weighted energy delta above the floor. Speech is declared when the delta exceeds a threshold (0.30 in Q9.23) with a 20-frame hangover to prevent rapid toggling. The VAD is gated on the new enable_vad flag in sof_mfcc_config. Add struct mfcc_data_header with six 32-bit fields (magic, frame_number, reserved, energy, noise_energy, vad_flag) prepended to every output frame in all format paths (S16, S24, S32). This replaces the previous magic-word-only header. The header carries the VAD decision and energy values from the DSP for downstream consumers. Extend sof_mfcc_config in user/mfcc.h with reserved16[3] padding for 32-bit alignment, and new boolean fields enable_vad, enable_dtx, update_controls, and reserved_bool[5]. The config blob size increases from 104 to 116 bytes. Update Matlab/Octave decode scripts (decode_mel.m, decode_ceps.m, decode_all.m) and setup_mfcc.m for the expanded header and config struct. Regenerate topology2 configuration blobs (default.conf, mel80.conf) with the new blob size. 
Signed-off-by: Seppo Ingalsuo --- src/audio/mfcc/CMakeLists.txt | 2 +- src/audio/mfcc/mfcc_common.c | 91 ++++--- src/audio/mfcc/mfcc_setup.c | 28 ++- src/audio/mfcc/mfcc_vad.c | 229 ++++++++++++++++++ src/audio/mfcc/tune/decode_all.m | 4 +- src/audio/mfcc/tune/decode_ceps.m | 42 +++- src/audio/mfcc/tune/decode_mel.m | 82 +++++-- src/audio/mfcc/tune/setup_mfcc.m | 19 +- src/include/sof/audio/mfcc/mfcc_comp.h | 22 +- src/include/sof/audio/mfcc/mfcc_vad.h | 92 +++++++ src/include/user/mfcc.h | 7 +- .../include/components/mfcc/default.conf | 12 +- .../include/components/mfcc/mel80.conf | 10 +- 13 files changed, 568 insertions(+), 72 deletions(-) create mode 100644 src/audio/mfcc/mfcc_vad.c create mode 100644 src/include/sof/audio/mfcc/mfcc_vad.h diff --git a/src/audio/mfcc/CMakeLists.txt b/src/audio/mfcc/CMakeLists.txt index f8af79d1ca8a..10daf78aa2a6 100644 --- a/src/audio/mfcc/CMakeLists.txt +++ b/src/audio/mfcc/CMakeLists.txt @@ -4,5 +4,5 @@ if(CONFIG_COMP_MFCC STREQUAL "m" AND DEFINED CONFIG_LLEXT) add_subdirectory(llext ${PROJECT_BINARY_DIR}/mfcc_llext) add_dependencies(app mfcc) else() - add_local_sources(sof mfcc.c mfcc_setup.c mfcc_common.c mfcc_generic.c mfcc_hifi4.c mfcc_hifi3.c) + add_local_sources(sof mfcc.c mfcc_setup.c mfcc_common.c mfcc_generic.c mfcc_hifi4.c mfcc_hifi3.c mfcc_vad.c) endif() diff --git a/src/audio/mfcc/mfcc_common.c b/src/audio/mfcc/mfcc_common.c index 1079864e9259..70ccdcfa55d6 100644 --- a/src/audio/mfcc/mfcc_common.c +++ b/src/audio/mfcc/mfcc_common.c @@ -21,6 +21,8 @@ #include #include +#include + LOG_MODULE_REGISTER(mfcc_common, CONFIG_SOF_LOG_LEVEL); /* @@ -169,6 +171,22 @@ static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_comp_data * cc_count += state->dct.num_out; } + + /* Use hop counter for frame numbering (independent of VAD enable) */ + state->header.frame_number = state->hop_count; + + /* Run VAD on the mel log spectrum (available in both modes) */ + if (config->enable_vad) { + mfcc_vad_update(&cd->vad, 
state->mel_log_32); + + /* Populate data header for this output frame */ + state->header.energy = cd->vad.energy; + state->header.noise_energy = cd->vad.noise_energy; + state->header.vad_flag = cd->vad.is_speech ? 1 : 0; + } + + /* Increment hop counter at end of hop processing */ + state->hop_count++; } return cc_count; @@ -267,9 +285,8 @@ void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer struct mfcc_comp_data *cd = module_get_private_data(mod); struct mfcc_state *state = &cd->state; struct mfcc_buffer *buf = &cd->state.buf; - uint32_t magic = MFCC_MAGIC; int16_t *w_ptr = audio_stream_get_wptr(sink); - const int num_magic = 2; + const int num_header_s16 = sizeof(state->header) / sizeof(int16_t); int num_ceps; int sink_samples; int to_copy; @@ -280,25 +297,33 @@ void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer /* Run STFT and processing after FFT: Mel auditory filter and DCT. */ num_ceps = mfcc_stft_process(mod->dev, cd); - /* If new output produced, set up pointer into scratch data and mark magic pending */ + /* If new output produced, set up pointer into scratch data and mark header pending */ if (num_ceps > 0) { - if (state->mel_only) + if (state->mel_only) { state->out_data_ptr = state->mel_spectra->data; - else + } else { state->out_data_ptr = state->cepstral_coef->data; + } state->out_remain = num_ceps; - state->magic_pending = true; + state->header_pending = true; } /* Write to sink, limited by period size */ sink_samples = frames * audio_stream_get_channels(sink); - /* Write magic word first if pending */ - if (state->magic_pending && sink_samples >= num_magic) { - w_ptr = mfcc_sink_copy_data_s16(sink, w_ptr, num_magic, (int16_t *)&magic); - sink_samples -= num_magic; - state->magic_pending = false; + /* Write data header first if pending */ + if (state->header_pending) { + if (sink_samples < num_header_s16) { + /* Not enough sink space for header, defer entire frame */ + 
mfcc_sink_copy_zero_s16(sink, w_ptr, sink_samples); + return; + } + + w_ptr = mfcc_sink_copy_data_s16(sink, w_ptr, num_header_s16, + (int16_t *)&state->header); + sink_samples -= num_header_s16; + state->header_pending = false; } /* Write cepstral/mel data from scratch buffer */ @@ -363,9 +388,8 @@ void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer struct mfcc_comp_data *cd = module_get_private_data(mod); struct mfcc_state *state = &cd->state; struct mfcc_buffer *buf = &cd->state.buf; - uint32_t magic = MFCC_MAGIC; int32_t *w_ptr = audio_stream_get_wptr(sink); - const int num_magic = 1; /* one int32_t word for magic */ + const int num_header_s32 = sizeof(state->header) / sizeof(int32_t); int num_ceps; int sink_samples; int remain_s32; @@ -391,17 +415,24 @@ void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer } state->out_remain = num_ceps; - state->magic_pending = true; + state->header_pending = true; } /* Write to sink, limited by period size */ sink_samples = frames * audio_stream_get_channels(sink); - /* Write magic word first if pending */ - if (state->magic_pending && sink_samples >= num_magic) { - w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_magic, (int32_t *)&magic); - sink_samples -= num_magic; - state->magic_pending = false; + /* Write data header first if pending */ + if (state->header_pending) { + if (sink_samples < num_header_s32) { + /* Not enough sink space for header, defer entire frame */ + mfcc_sink_copy_zero_s32(sink, w_ptr, sink_samples); + return; + } + + w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_header_s32, + (int32_t *)&state->header); + sink_samples -= num_header_s32; + state->header_pending = false; } if (state->mel_only) { @@ -443,9 +474,8 @@ void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer struct mfcc_comp_data *cd = module_get_private_data(mod); struct mfcc_state *state = &cd->state; struct mfcc_buffer *buf = &cd->state.buf; - uint32_t magic 
= MFCC_MAGIC; int32_t *w_ptr = audio_stream_get_wptr(sink); - const int num_magic = 1; /* one int32_t word for magic */ + const int num_header_s32 = sizeof(state->header) / sizeof(int32_t); int num_ceps; int sink_samples; int remain_s32; @@ -466,17 +496,24 @@ void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer } state->out_remain = num_ceps; - state->magic_pending = true; + state->header_pending = true; } /* Write to sink, limited by period size */ sink_samples = frames * audio_stream_get_channels(sink); - /* Write magic word first if pending */ - if (state->magic_pending && sink_samples >= num_magic) { - w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_magic, (int32_t *)&magic); - sink_samples -= num_magic; - state->magic_pending = false; + /* Write data header first if pending */ + if (state->header_pending) { + if (sink_samples < num_header_s32) { + /* Not enough sink space for header, defer entire frame */ + mfcc_sink_copy_zero_s32(sink, w_ptr, sink_samples); + return; + } + + w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_header_s32, + (int32_t *)&state->header); + sink_samples -= num_header_s32; + state->header_pending = false; } if (state->mel_only) { diff --git a/src/audio/mfcc/mfcc_setup.c b/src/audio/mfcc/mfcc_setup.c index 1cad4b2b984e..25c6876344af 100644 --- a/src/audio/mfcc/mfcc_setup.c +++ b/src/audio/mfcc/mfcc_setup.c @@ -18,6 +18,8 @@ #include #include +#include + /* Definitions for cepstral lifter */ #define PI_Q23 Q_CONVERT_FLOAT(3.1415926536, 23) #define TWO_PI_Q23 Q_CONVERT_FLOAT(6.2831853072, 23) @@ -127,6 +129,11 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i return -EINVAL; } + if (sample_rate > MFCC_MAX_SAMPLE_RATE) { + comp_err(dev, "Sample rate %d exceeds max %d Hz", sample_rate, MFCC_MAX_SAMPLE_RATE); + return -EINVAL; + } + if (config->sample_frequency != sample_rate) { comp_err(dev, "Config sample_frequency does not match stream"); return -EINVAL; @@ -328,11 +335,11 @@ 
int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i /* Check that output data can be drained within the periods spanned by one * FFT hop. Each hop consumes fft_hop_size input samples and produces - * max_out_per_hop + 2 (magic) int16_t output values. The sink provides at - * least fft_hop_size * channels int16_t samples per hop (worst case s16). + * max_out_per_hop + 12 (magic header) int16_t output values. The sink provides + * at least fft_hop_size * channels int16_t samples per hop (worst case s16). * If output exceeds this, data accumulates and will eventually overflow. */ - int out_per_hop = max_out_per_hop + 2; + int out_per_hop = max_out_per_hop + sizeof(state->header) / sizeof(int16_t); int sink_per_hop = fft->fft_hop_size * channels; if (out_per_hop > sink_per_hop) { @@ -345,11 +352,22 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i /* Set initial state for STFT */ state->waiting_fill = true; state->prev_samples_valid = false; - state->magic_pending = false; + state->header_pending = false; + state->hop_count = 0; + memset(&state->header, 0, sizeof(state->header)); + state->header.magic = MFCC_MAGIC; state->out_data_ptr = NULL; state->out_data_ptr_32 = NULL; state->out_remain = 0; + if (config->enable_vad) { + ret = mfcc_vad_init(&cd->vad, config->num_mel_bins, sample_rate, mod); + if (ret < 0) { + comp_err(dev, "Failed VAD init"); + goto free_lifter; + } + } + comp_dbg(dev, "done"); return 0; @@ -389,4 +407,6 @@ void mfcc_free_buffers(struct processing_module *mod) mod_free(mod, cd->state.melfb.data); mod_free(mod, cd->state.dct.matrix); mod_free(mod, cd->state.lifter.matrix); + mod_free(mod, cd->vad.noise_floor); + mod_free(mod, cd->vad.weights); } diff --git a/src/audio/mfcc/mfcc_vad.c b/src/audio/mfcc/mfcc_vad.c new file mode 100644 index 000000000000..f44a89a7dea3 --- /dev/null +++ b/src/audio/mfcc/mfcc_vad.c @@ -0,0 +1,229 @@ +// SPDX-License-Identifier: BSD-3-Clause +// +// 
Copyright(c) 2026 Intel Corporation. +// +// Author: Seppo Ingalsuo + +/** + * \file mfcc_vad.c + * \brief Voice Activity Detection based on Mel spectrum energy. + * + * Implements a VAD that tracks per-bin noise floor and computes a + * speech-frequency weighted energy above the floor. Speech is declared + * when the weighted delta exceeds a threshold, with hangover to prevent + * rapid toggling. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +LOG_MODULE_DECLARE(mfcc, CONFIG_SOF_LOG_LEVEL); + +/** + * \brief A-weighting table: 1/3 octave band center frequencies in Hz (Q16.0). + * + * From IEC 61672-1:2013, source: + * https://acousticalengineer.com/a-weighting-table/ + */ +#define A_WEIGHT_TABLE_SIZE 36 + +static const int16_t a_weight_hz[A_WEIGHT_TABLE_SIZE] = { + 6, 8, 10, 13, 16, 20, 25, 32, + 40, 50, 63, 80, 100, 125, 160, 200, + 250, 315, 400, 500, 630, 800, 1000, 1250, + 1600, 2000, 2500, 3150, 4000, 5000, 6300, 8000, + 10000, 12500, 16000, 20000, +}; + +/** + * \brief A-weighting linear amplitude, scaled so peak (at 2500 Hz) maps + * to INT16_MAX (32767). Original dB values converted via + * 10^(dB/20) then scaled by 32767 / max. + */ +static const int16_t a_weight_lin[A_WEIGHT_TABLE_SIZE] = { + 2, 4, 9, 19, 43, 85, 162, 299, + 531, 862, 1382, 2140, 3129, 4370, 6172, 8136, + 10362, 13196, 16234, 19518, 22669, 25730, 28212, 30230, + 31655, 32392, 32767, 32392, 31655, 30230, 27889, 24856, + 21156, 17196, 13045, 9670, +}; + +/** + * \brief Compute A-weighted speech-frequency emphasis weights for Mel bins. + * + * Weights are computed by linearly interpolating the A-weighting table + * at each Mel bin center frequency. Output weights are in Q1.15 and + * sum to approximately 2^15. + * + * \param[out] weights Output weight array. + * \param[in] num_mel Number of Mel bins. + * \param[in] sample_rate Sample rate in Hz. 
+ */ +static void mfcc_vad_build_weights(int16_t *weights, int num_mel, int32_t sample_rate) +{ + int32_t scaled, num; + int32_t sum = 0; + int16_t f_hz, f0, f1, w, w0, w1, den; + int16_t mel_end = psy_hz_to_mel((int16_t)(sample_rate / 2)); /* Nyquist (max 32767 Hz) in Mel */ + int16_t mel_step = mel_end / (num_mel + 1); + int i, j; + + if (!num_mel) + return; + + for (i = 0; i < num_mel; i++) { + f_hz = psy_mel_to_hz((int16_t)((i + 1) * mel_step)); + + /* Find the table interval containing f_hz and interpolate */ + if (f_hz <= a_weight_hz[0]) { + w = a_weight_lin[0]; + } else if (f_hz >= a_weight_hz[A_WEIGHT_TABLE_SIZE - 1]) { + w = a_weight_lin[A_WEIGHT_TABLE_SIZE - 1]; + } else { + /* Find j such that a_weight_hz[j] <= f_hz < a_weight_hz[j+1] */ + for (j = 0; j < A_WEIGHT_TABLE_SIZE - 2; j++) { + if (f_hz < a_weight_hz[j + 1]) + break; + } + + /* Linear interpolation: w = w0 + (w1 - w0) * (f - f0) / (f1 - f0) */ + f0 = a_weight_hz[j]; + f1 = a_weight_hz[j + 1]; + w0 = a_weight_lin[j]; + w1 = a_weight_lin[j + 1]; + num = (int32_t)(w1 - w0) * (f_hz - f0); + den = f1 - f0; + w = w0 + (int16_t)(num / den); + } + + weights[i] = w; + sum += w; + } + + /* Normalize weights so they sum to 1.0 */ + for (i = 0; i < num_mel; i++) { + scaled = ((int32_t)weights[i] << 16) / sum; /* Q1.16 */ + weights[i] = (int16_t)Q_SHIFT_RND(scaled, 16, 15); /* Round to Q1.15 */ + } +} + +int mfcc_vad_init(struct mfcc_vad_state *vad, int num_mel_bins, int32_t sample_rate, + struct processing_module *mod) +{ + if (!vad) + return -EINVAL; + + if (num_mel_bins <= 0) + return -EINVAL; + + vad->num_mel_bins = num_mel_bins; + vad->energy_threshold = MFCC_VAD_ENERGY_THRESHOLD; + vad->noise_rise_alpha_slow = MFCC_VAD_NOISE_RISE_ALPHA; + vad->noise_rise_alpha_fast = MFCC_VAD_NOISE_RISE_ALPHA_FAST; + vad->hangover_max = MFCC_VAD_HANGOVER_FRAMES; + vad->hangover_counter = 0; + vad->init_frames = MFCC_VAD_NOISE_INIT_FRAMES; + vad->frame_count = 0; + vad->is_speech = false; + vad->initialized = false; + 
+ /* Allocate per-bin noise floor */ + vad->noise_floor = mod_zalloc(mod, num_mel_bins * sizeof(int32_t)); + if (!vad->noise_floor) + return -ENOMEM; + + /* Allocate and compute per-bin weights */ + vad->weights = mod_zalloc(mod, num_mel_bins * sizeof(int16_t)); + if (!vad->weights) { + mod_free(mod, vad->noise_floor); + vad->noise_floor = NULL; + return -ENOMEM; + } + + mfcc_vad_build_weights(vad->weights, num_mel_bins, sample_rate); + return 0; +} + +int mfcc_vad_update(struct mfcc_vad_state *vad, const int32_t *mel_log) +{ + int64_t signal_energy = 0; + int64_t noise_energy = 0; + int64_t energy_delta = 0; + int32_t delta; + int32_t p; + int16_t alpha; + int i; + + if (!vad || !mel_log) + return 0; + + /* Stop incrementing after init phase to avoid wrap-around restarting fast alpha. + * Select rise alpha based on convergence phase. + */ + if (vad->frame_count < vad->init_frames) { + vad->frame_count++; + alpha = vad->noise_rise_alpha_fast; + } else { + alpha = vad->noise_rise_alpha_slow; + } + + /* Initialize noise floor to first frame */ + if (!vad->initialized) { + for (i = 0; i < vad->num_mel_bins; i++) + vad->noise_floor[i] = mel_log[i]; + + vad->initialized = true; + } + + /* Update noise floor: follow down instantly, rise slowly */ + for (i = 0; i < vad->num_mel_bins; i++) { + if (mel_log[i] < vad->noise_floor[i]) { + /* Instant follow-down */ + vad->noise_floor[i] = mel_log[i]; + } else { + /* Slow rise: floor += alpha * (mel - floor) + * Q9.23 + Q1.15 * Q9.23 => need Q9.23 result + * alpha is Q1.15, delta is Q9.23 + */ + delta = mel_log[i] - vad->noise_floor[i]; + p = (int32_t)Q_MULTSR_32X32((int64_t)alpha, delta, 15, 23, 23); + vad->noise_floor[i] += p; + } + } + + /* Compute weighted signal energy and noise floor energy. 
+ * weights are Q1.15, mel values are Q9.23 + * Products are Q10.38, accumulate in int64_t then shift to Q9.23 + */ + + for (i = 0; i < vad->num_mel_bins; i++) { + signal_energy += (int64_t)vad->weights[i] * mel_log[i]; + noise_energy += (int64_t)vad->weights[i] * vad->noise_floor[i]; + } + + vad->energy = sat_int32(Q_SHIFT_RND(signal_energy, 38, 23)); + vad->noise_energy = sat_int32(Q_SHIFT_RND(noise_energy, 38, 23)); + energy_delta = vad->energy - vad->noise_energy; + + /* Round accumulated energy from Q10.38 to Q9.23, saturate to int32 */ + if (energy_delta > vad->energy_threshold) { + vad->hangover_counter = vad->hangover_max; + vad->is_speech = true; + } else { + if (vad->hangover_counter > 0) { + vad->hangover_counter--; + vad->is_speech = true; + } else { + vad->is_speech = false; + } + } + + return vad->is_speech ? 1 : 0; +} diff --git a/src/audio/mfcc/tune/decode_all.m b/src/audio/mfcc/tune/decode_all.m index d5b60289b4cf..f5c7e1a06db4 100644 --- a/src/audio/mfcc/tune/decode_all.m +++ b/src/audio/mfcc/tune/decode_all.m @@ -25,7 +25,7 @@ fn = all_ceps_files{i}; if exist(fn, 'file') fprintf('Decoding MFCC ceps: %s\n', fn); - [ceps, t, n] = decode_ceps(fn, num_ceps); + [ceps, t, n, vad, energy, noise_energy, frame_num] = decode_ceps(fn, num_ceps); end end @@ -34,6 +34,6 @@ fmt = all_mel_fmts{i}; if exist(fn, 'file') fprintf('Decoding Mel: %s\n', fn); - [mel, t, n] = decode_mel(fn, num_mel, fmt); + [mel, t, n, vad, energy, noise_energy, frame_num] = decode_mel(fn, num_mel, fmt); end end diff --git a/src/audio/mfcc/tune/decode_ceps.m b/src/audio/mfcc/tune/decode_ceps.m index a63677fa3731..c094ced7c0e1 100644 --- a/src/audio/mfcc/tune/decode_ceps.m +++ b/src/audio/mfcc/tune/decode_ceps.m @@ -1,4 +1,4 @@ -% [ceps, t, n] = decode_ceps(fn, num_ceps, num_channels) +% [ceps, t, n, vad, energy, noise_energy, frame_number] = decode_ceps(fn, num_ceps, num_channels) % % Input % fn - File with MFCC data in .raw or .wav format @@ -9,11 +9,16 @@ % ceps - cepstral 
coefficients % t - time vector for plotting % n - ceps 1..num_ceps vector for plotting +% vad - VAD flag per frame from DSP +% energy - weighted signal energy per frame from DSP +% noise_energy - weighted noise floor energy per frame from DSP +% frame_number - frame number from DSP % SPDX-License-Identifier: BSD-3-Clause -% Copyright(c) 2022 Intel Corporation. All rights reserved. +% Copyright(c) 2022-2026 Intel Corporation. All rights reserved. -function [ceps, t, n] = decode_ceps(fn, num_ceps, num_channels) +function [ceps, t, n, vad, energy, noise_energy, frame_number] = ... + decode_ceps(fn, num_ceps, num_channels) if nargin < 3 num_channels = 1; @@ -23,6 +28,7 @@ fs = 16e3; qformat = 7; magic = [25443 28006]; % ASCII 'mfcc' as int16 +num_magic = 2; % magic word is 2 x int16 % Load output data [data, num_channels] = get_file(fn, num_channels); @@ -41,17 +47,37 @@ period_ceps = idx(2)-idx(1); num_frames = length(idx); + +% Header after magic is [frame_number, reserved, energy, noise_energy, vad_flag] +% as int32 (10 int16 slots), followed by num_ceps coefficients. +payload_len = 10 + num_ceps; % 5 int32 = 10 int16, then ceps data + +% Last frame can be incomplete due to span over multiple periods +last = idx(end) + num_magic + payload_len - 1; +if (last > length(data)) + num_frames = num_frames - 1; +end + t_ceps = period_ceps / num_channels / fs; t = (0:num_frames -1) * t_ceps; n = 1:num_ceps; -ceps = zeros(num_ceps, num_frames); +payload = zeros(payload_len, num_frames); for i = 1:num_frames - i1 = idx(i) + 2; - i2 = i1 + num_ceps - 1; - ceps(:,i) = data(i1:i2) / 2^qformat; + i1 = idx(i) + num_magic; + i2 = i1 + payload_len - 1; + payload(:,i) = double(data(i1:i2)); end +% Reassemble int32 from pairs of int16 (little-endian). +% Low half must be treated as unsigned with mod() to handle negative int16. 
+frame_number = mod(payload(1,:), 65536) + payload(2,:) * 65536; +% payload(3:4,:) is reserved, skip +energy = (mod(payload(5,:), 65536) + payload(6,:) * 65536) / 2^23; +noise_energy = (mod(payload(7,:), 65536) + payload(8,:) * 65536) / 2^23; +vad = mod(payload(9,:), 65536) + payload(10,:) * 65536; +ceps = payload(11:payload_len, :) / 2^qformat; + figure; surf(t, n, ceps, 'EdgeColor', 'none'); colormap(jet); @@ -75,7 +101,7 @@ case '.wav' tmp = audioread(fn, 'native'); t = whos('tmp'); - if ~strcmp(t.class, 'int16'); + if ~strcmp(t.class, 'int16') error('Only 16-bit wav file format is supported'); end s = size(tmp); diff --git a/src/audio/mfcc/tune/decode_mel.m b/src/audio/mfcc/tune/decode_mel.m index f6a723aa2040..0b9b8d09c5a8 100644 --- a/src/audio/mfcc/tune/decode_mel.m +++ b/src/audio/mfcc/tune/decode_mel.m @@ -1,26 +1,32 @@ -% [mel, t, n] = decode_mel(fn, num_mel, fmt, num_channels) +% [mel, t, n, vad, energy, noise_energy, frame_number] = decode_mel(fn, num_mel, fmt, num_channels) % % Input % fn - File with Mel data in .raw or .wav format % num_mel - number of Mel coefficients per frame % fmt - format of the Mel data ('s16', 's24', 's32') -% num_channels - needed for .raw format, omit for .wav +% num_channels - needed for .raw format, omit for .wav, default 2 % % Outputs % mel - Mel coefficients % t - time vector for plotting % n - mel 1..num_mel vector for plotting +% vad - VAD flag per frame from DSP +% energy - weighted signal energy per frame from DSP +% noise_energy - weighted noise floor energy per frame from DSP +% frame_number - frame number from DSP % SPDX-License-Identifier: BSD-3-Clause % Copyright(c) 2026 Intel Corporation. -function [mel, t, n] = decode_mel(fn, num_mel, fmt, num_channels) +function [mel, t, n, vad, energy, noise_energy, frame_number] = ... 
+ decode_mel(fn, num_mel, fmt, num_channels) if nargin < 3 fmt = 's16'; end + if nargin < 4 - num_channels = 1; + num_channels = 2; end % MFCC stream @@ -30,19 +36,22 @@ case 's16' qformat = 7; magic = [25443 28006]; % ASCII 'mfcc' as two int16 - num_magic = 2; + word_size_multiplier = 2; case 's24' qformat = 15; magic = int32(1835426659); % 0x6D666363 as int32 - num_magic = 1; + word_size_multiplier = 1; case 's32' qformat = 23; magic = int32(1835426659); % 0x6D666363 as int32 - num_magic = 1; + word_size_multiplier = 1; otherwise error("Use 's16', 's24', or 's32' as format."); end +num_magic = word_size_multiplier; % magic word is 2 x int16 or 1 x int32 +num_other_header = 5 * word_size_multiplier; % frame_number, reserved, energy, noise, vad + % Load output data [data, num_channels] = get_file(fn, num_channels, fmt); @@ -68,33 +77,72 @@ period_mel = idx(2)-idx(1); num_frames = length(idx); +% Header after magic is [frame_number, reserved, energy, noise_energy, vad_flag] +% as int32, followed by num_mel coefficients. +% For s16 each int32 occupies 2 int16 slots. +payload_len = num_other_header + num_mel; + % Last frame can be incomplete due to span over multiple periods -last = idx(end) + num_mel - 1; +last = idx(end) + num_magic + payload_len - 1; if (last > length(data)) num_frames = num_frames - 1; end -t_mel = period_mel / num_channels / fs; -t = (0:num_frames -1) * t_mel; -n = 1:num_mel; - -mel = zeros(num_mel, num_frames); +payload = zeros(payload_len, num_frames); for i = 1:num_frames i1 = idx(i) + num_magic; - i2 = i1 + num_mel - 1; - mel(:,i) = double(data(i1:i2)) / 2^qformat; + i2 = i1 + payload_len - 1; + payload(:,i) = double(data(i1:i2)); +end + +if strcmp(fmt, 's16') + % Reassemble int32 from pairs of int16 (little-endian). + % Low half must be treated as unsigned with mod() to handle negative int16. 
+ frame_number = mod(payload(1,:), 65536) + payload(2,:) * 65536; + % payload(3:4,:) is reserved, skip + energy = (mod(payload(5,:), 65536) + payload(6,:) * 65536) / 2^23; + noise_energy = (mod(payload(7,:), 65536) + payload(8,:) * 65536) / 2^23; + vad = mod(payload(9,:), 65536) + payload(10,:) * 65536; + mel = payload(11:payload_len, :) / 2^qformat; +else + frame_number = payload(1, :); + % payload(2,:) is reserved, skip + energy = payload(3, :) / 2^23; + noise_energy = payload(4, :) / 2^23; + vad = payload(5, :); + mel = payload(6:payload_len, :) / 2^qformat; end -figure; +t_mel = period_mel / num_channels / fs; +t = (0:num_frames -1) * t_mel; +n = 1:num_mel; + +figure imagesc(t, n, mel); axis xy; colormap(jet); colorbar; tstr = sprintf('SOF MFCC Mel coefficients (%s)', fn); title(tstr, 'Interpreter', 'None'); -xlabel('Time (s)'); ylabel('Mel coef #'); +figure +subplot(2,1,1); +plot(t, vad) +ax = axis(); +axis([ax(1:2) -0.1 1.1]); +grid on; +title(tstr, 'Interpreter', 'None'); +xlabel('Time (s)'); +ylabel('VAD flag'); + +subplot(2,1,2); +plot(t, energy, t, noise_energy); +grid on; +legend('Energy', 'Noise Energy'); +xlabel('Time (s)'); +ylabel('Energy'); + end function [data, num_channels] = get_file(fn, num_channels, fmt) diff --git a/src/audio/mfcc/tune/setup_mfcc.m b/src/audio/mfcc/tune/setup_mfcc.m index bd2b3f11e60b..3cda3221a4fc 100644 --- a/src/audio/mfcc/tune/setup_mfcc.m +++ b/src/audio/mfcc/tune/setup_mfcc.m @@ -62,6 +62,9 @@ function setup_mfcc() cfg.mmax_init = 0; % same cfg.mmax_coef = 0; % same cfg.dynamic_mmax = false; % same + cfg.enable_vad = false; + cfg.enable_dtx = false; + cfg.update_controls = false; end function cfg = get_mel_spectrogram_config() @@ -99,6 +102,9 @@ function setup_mfcc() cfg.mmax_init = 0; % Initial value max Mel value, data clamp is mmax - top_db cfg.mmax_coef = 0; % Dynamic max Mel value decay coefficient (zero lock to found max) cfg.dynamic_mmax = true; + cfg.enable_vad = true; + cfg.enable_dtx = false; + 
cfg.update_controls = true; end function export_mfcc_setup(gen_cfg, cfg) @@ -107,7 +113,7 @@ function export_mfcc_setup(gen_cfg, cfg) addpath([gen_cfg.tools_path 'tune/common']); %% Blob size, size plus reserved(8) + current parameters -nbytes_data = 104; +nbytes_data = 116; %% Little endian sh32 = [0 -8 -16 -24]; @@ -160,6 +166,10 @@ function export_mfcc_setup(gen_cfg, cfg) v = 0; [b8, j] = add_w16b(v, b8, j); % vtln_high Qx.y TBD v = 0; [b8, j] = add_w16b(v, b8, j); % vtln_low Qx.y TBD v = 0; [b8, j] = add_w16b(v, b8, j); % vtln_warp Qx.y TBD +% reserved16[3] +for i = 1:3 + [b8, j] = add_w16b(0, b8, j); +end v = cfg.htk_compat; [b8, j] = add_w8b(v, b8, j); % bool v = cfg.raw_energy; [b8, j] = add_w8b(v, b8, j); % bool v = cfg.remove_dc_offset; [b8, j] = add_w8b(v, b8, j); % bool @@ -168,6 +178,13 @@ function export_mfcc_setup(gen_cfg, cfg) v = cfg.subtract_mean; [b8, j] = add_w8b(v, b8, j); % bool v = cfg.use_energy; [b8, j] = add_w8b(v, b8, j); % bool v = cfg.dynamic_mmax; [b8, j] = add_w8b(v, b8, j); % bool +v = cfg.enable_vad; [b8, j] = add_w8b(v, b8, j); % bool +v = cfg.enable_dtx; [b8, j] = add_w8b(v, b8, j); % bool +v = cfg.update_controls; [b8, j] = add_w8b(v, b8, j); % bool +% reserved_bool[5] +for i = 1:5 + [b8, j] = add_w8b(0, b8, j); +end %% Export tplg_fn = [gen_cfg.mfcc_conf_path cfg.tplg_fn]; diff --git a/src/include/sof/audio/mfcc/mfcc_comp.h b/src/include/sof/audio/mfcc/mfcc_comp.h index 025eef116752..ed91238b5f7a 100644 --- a/src/include/sof/audio/mfcc/mfcc_comp.h +++ b/src/include/sof/audio/mfcc/mfcc_comp.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -31,6 +32,22 @@ #define MFCC_MAGIC 0x6d666363 /* ASCII for "mfcc" */ #define MFCC_FFT_BITS 32 +#define MFCC_MAX_SAMPLE_RATE 64000 /* Max sample rate in Hz, limited by int16_t Mel scale */ + +/** + * \brief Data header prepended to every MFCC output frame. + * + * Written before the Mel spectrum or cepstral coefficient data in each + * output frame. 
+ */ +struct mfcc_data_header { + uint32_t magic; /**< Magic word MFCC_MAGIC (0x6d666363) */ + uint32_t frame_number; /**< Frame number, counting calculated frames starting from 0 */ + int32_t reserved; /**< Reserved for future use, set to 0 */ + int32_t energy; /**< Weighted signal energy in Q9.23 */ + int32_t noise_energy; /**< Weighted noise floor energy in Q9.23 */ + int32_t vad_flag; /**< VAD decision: 1 = speech, 0 = silence */ +}; /** \brief Type definition for processing function select return value. */ typedef void (*mfcc_func)(struct processing_module *mod, @@ -105,16 +122,19 @@ struct mfcc_state { bool mel_only; /**< When true, output Mel spectra instead of cepstral coefficients */ bool waiting_fill; /**< booleans */ bool prev_samples_valid; - bool magic_pending; /**< True when magic word not yet written for current output */ + bool header_pending; /**< True when data header not yet written for current output */ + struct mfcc_data_header header; /**< Data header for current output frame */ size_t sample_buffers_size; /**< bytes */ int16_t *out_data_ptr; /**< Read pointer into scratch data for multi-period output */ int32_t *out_data_ptr_32; /**< Read pointer for 32-bit mel-only output */ int out_remain; /**< Remaining int16_t samples to write to sink from scratch */ + uint32_t hop_count; /**< FFT hop counter, increments every processed hop */ }; /* MFCC component private data */ struct mfcc_comp_data { struct mfcc_state state; + struct mfcc_vad_state vad; struct comp_data_blob_handler *model_handler; struct sof_mfcc_config *config; int max_frames; diff --git a/src/include/sof/audio/mfcc/mfcc_vad.h b/src/include/sof/audio/mfcc/mfcc_vad.h new file mode 100644 index 000000000000..6873343d334e --- /dev/null +++ b/src/include/sof/audio/mfcc/mfcc_vad.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * + * Copyright(c) 2026 Intel Corporation. 
+ * + * Author: Seppo Ingalsuo + */ + +/** + * \file mfcc_vad.h + * \brief Voice Activity Detection based on Mel spectrum energy. + * + * This VAD operates on the Q9.23 Mel log spectrum values produced by + * the MFCC component. It tracks a per-bin noise floor that follows + * the signal downward instantly and rises slowly, then computes a + * speech-weighted energy delta above the floor. + */ + +#ifndef __SOF_AUDIO_MFCC_MFCC_VAD_H__ +#define __SOF_AUDIO_MFCC_MFCC_VAD_H__ + +#include +#include + +struct processing_module; + +/** + * \brief Number of frames for fast noise floor convergence at startup (~1 s at 10 ms/frame). + */ +#define MFCC_VAD_NOISE_INIT_FRAMES 100 + +/** + * \brief Slow noise floor rise coefficient in Q1.15 (0.003 * 2^15). + */ +#define MFCC_VAD_NOISE_RISE_ALPHA 98 + +/** + * \brief Fast noise floor rise coefficient in Q1.15 (0.020 * 2^15). + */ +#define MFCC_VAD_NOISE_RISE_ALPHA_FAST 655 + +/** + * \brief Energy threshold for speech detection in Q9.23 (0.30 * 2^23). + */ +#define MFCC_VAD_ENERGY_THRESHOLD 2516582 + +/** + * \brief Hangover frame count to keep VAD active after last speech detection. + */ +#define MFCC_VAD_HANGOVER_FRAMES 20 + +/** + * \brief VAD state structure. 
+ */ +struct mfcc_vad_state { + int32_t *noise_floor; /**< Per-bin noise floor in Q9.23 */ + int16_t *weights; /**< Speech-frequency emphasis weights Q1.15 */ + int32_t energy; /**< Weighted signal energy in Q9.23 */ + int32_t energy_threshold; /**< Energy threshold Q9.23 */ + int32_t noise_energy; /**< Weighted noise floor energy in Q9.23 */ + int16_t frame_count; /**< Initial convergence frames processed */ + int16_t hangover_counter; /**< Current hangover counter */ + int16_t hangover_max; /**< Maximum hangover frames */ + int16_t init_frames; /**< Number of initial frames for fast convergence */ + int16_t noise_rise_alpha_fast; /**< Fast rise alpha Q1.15 */ + int16_t noise_rise_alpha_slow; /**< Slow rise alpha Q1.15 */ + int16_t num_mel_bins; /**< Number of Mel bins in use */ + bool initialized; /**< True after first frame processed */ + bool is_speech; /**< Current VAD decision */ +}; + +/** + * \brief Initialize VAD state. + * + * \param[out] vad Pointer to VAD state to initialize. + * \param[in] num_mel_bins Number of Mel bins. + * \param[in] sample_rate Audio sample rate in Hz. + * \param[in] mod Processing module for memory allocation. + * \return 0 on success, negative error code on failure. + */ +int mfcc_vad_init(struct mfcc_vad_state *vad, int num_mel_bins, int32_t sample_rate, + struct processing_module *mod); + +/** + * \brief Process one Mel spectrum frame and update VAD decision. + * + * \param[in,out] vad Pointer to VAD state. + * \param[in] mel_log Mel log spectrum in Q9.23, array of num_mel_bins values. + * \return 1 if speech detected, 0 if silence. 
+ */ +int mfcc_vad_update(struct mfcc_vad_state *vad, const int32_t *mel_log); + +#endif /* __SOF_AUDIO_MFCC_MFCC_VAD_H__ */ diff --git a/src/include/user/mfcc.h b/src/include/user/mfcc.h index 8a0defcd9883..a2f3717daa52 100644 --- a/src/include/user/mfcc.h +++ b/src/include/user/mfcc.h @@ -77,6 +77,7 @@ struct sof_mfcc_config { int16_t vtln_high; /**< Reserved, no support */ int16_t vtln_low; /**< Reserved, no support */ int16_t vtln_warp; /**< Reserved, no support */ + int16_t reserved16[3]; /**< Reserved for future 16-bit fields, set to 0 */ bool htk_compat; /**< Must be false */ bool raw_energy; /**< Reserved, no support */ bool remove_dc_offset; /**< Reserved, no support */ @@ -85,8 +86,10 @@ struct sof_mfcc_config { bool subtract_mean; /**< Must be false (0) */ bool use_energy; /**< Must be false (0) */ bool dynamic_mmax; /**< Track max Mel value for clamp with top_db value */ - bool reserved_bool2; - bool reserved_bool3; + bool enable_vad; /**< Run VAD algorithm */ + bool enable_dtx; /**< Reserved (stream once per second non-speech frames) */ + bool update_controls; /**< Update controls with VAD decision */ + bool reserved_bool[5]; /**< Reserved for future boolean flags, set to false (0) */ } __attribute__((packed)); #endif /* __USER_MFCC_H__ */ diff --git a/tools/topology/topology2/include/components/mfcc/default.conf b/tools/topology/topology2/include/components/mfcc/default.conf index 42a6d6608b8b..3bbd72696806 100644 --- a/tools/topology/topology2/include/components/mfcc/default.conf +++ b/tools/topology/topology2/include/components/mfcc/default.conf @@ -1,12 +1,12 @@ -# Exported MFCC configuration 05-May-2026 +# Exported MFCC configuration 19-May-2026 # cd src/audio/mfcc/tune; octave setup_mfcc.m Object.Base.data."mfcc_config" { bytes " 0x53,0x4f,0x46,0x34,0x00,0x00,0x00,0x00, - 0x68,0x00,0x00,0x00,0x01,0xd0,0x01,0x03, + 0x74,0x00,0x00,0x00,0x01,0xd0,0x01,0x03, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 
0x68,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x74,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, @@ -17,6 +17,8 @@ Object.Base.data."mfcc_config" { 0xc3,0x35,0x00,0x2c,0x00,0x00,0x00,0x00, 0x90,0x01,0xa0,0x00,0x00,0x00,0x14,0x00, 0x0d,0x00,0x17,0x00,0x00,0x00,0x00,0x64, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01, - 0x01,0x01,0x01,0x00,0x00,0x00,0x00,0x00" + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x01,0x01,0x01, + 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00" } diff --git a/tools/topology/topology2/include/components/mfcc/mel80.conf b/tools/topology/topology2/include/components/mfcc/mel80.conf index 04aa2a15c660..480725c2d24f 100644 --- a/tools/topology/topology2/include/components/mfcc/mel80.conf +++ b/tools/topology/topology2/include/components/mfcc/mel80.conf @@ -1,12 +1,12 @@ -# Exported MFCC configuration 05-May-2026 +# Exported MFCC configuration 19-May-2026 # cd src/audio/mfcc/tune; octave setup_mfcc.m Object.Base.data."mfcc_config" { bytes " 0x53,0x4f,0x46,0x34,0x00,0x00,0x00,0x00, - 0x68,0x00,0x00,0x00,0x01,0xd0,0x01,0x03, + 0x74,0x00,0x00,0x00,0x01,0xd0,0x01,0x03, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x68,0x00,0x00,0x00,0x00,0x02,0x00,0x04, + 0x74,0x00,0x00,0x00,0x00,0x02,0x00,0x04, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, @@ -18,5 +18,7 @@ Object.Base.data."mfcc_config" { 0x90,0x01,0xa0,0x00,0x40,0x1f,0x00,0x00, 0x00,0x00,0x50,0x00,0x00,0x00,0x00,0x04, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x01,0x01,0x00,0x00,0x01,0x00,0x00" + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01, + 0x01,0x00,0x00,0x01,0x01,0x00,0x01,0x00, + 0x00,0x00,0x00,0x00" } From 76b5e3e44d7b34665b062aff53b3cf10e9590846 Mon Sep 17 00:00:00 2001 From: Seppo Ingalsuo Date: Tue, 19 May 2026 18:46:55 +0300 
Subject: [PATCH 2/8] Audio: MFCC: Add Python script for speech to text with Whisper model Add sof_mel_to_text_live_dsp_vad.py that captures mel spectrogram frames from ALSA with embedded DSP VAD flag and performs live speech-to-text transcription using OpenVINO Whisper. The script buffers mel frames during speech and triggers Whisper inference when silence is detected after speech. Capture runs continuously in a separate thread during inference to avoid frame drops. Replace the old README.txt with a comprehensive README.md that documents the MFCC tuning tools, testbench usage with run_mfcc.sh, output file formats, Matlab/Octave decode and plotting scripts, and the new live transcription workflow. Signed-off-by: Seppo Ingalsuo --- src/audio/mfcc/tune/README.md | 152 ++++++ src/audio/mfcc/tune/README.txt | 52 -- .../mfcc/tune/sof_mel_to_text_live_dsp_vad.py | 467 ++++++++++++++++++ 3 files changed, 619 insertions(+), 52 deletions(-) create mode 100644 src/audio/mfcc/tune/README.md delete mode 100644 src/audio/mfcc/tune/README.txt create mode 100644 src/audio/mfcc/tune/sof_mel_to_text_live_dsp_vad.py diff --git a/src/audio/mfcc/tune/README.md b/src/audio/mfcc/tune/README.md new file mode 100644 index 000000000000..e50b74d84785 --- /dev/null +++ b/src/audio/mfcc/tune/README.md @@ -0,0 +1,152 @@ +# SOF MFCC Tuning Tools + +This directory contains a tool to create configuration blob for SOF +MFCC component. It's simply run in Matlab or Octave with command +`setup_mfcc`. The MFCC configuration parameters can be edited from the +script. + +## Testbench + +The configuration can be test run with testbench. First the test topologies +need to be created with `scripts/build-tools.sh -t`. Next the testbench +is built with `scripts/rebuild-testbench.sh`. + +Once the previous steps are done, a sample wav file can be processed +with script `run_mfcc.sh`. 
The script converts the input to raw 16 kHz +stereo format and runs the testbench for S16, S24, and S32 bit depths, +producing both cepstral coefficient (MFCC) and Mel spectrogram outputs. + +``` +./run_mfcc.sh /usr/share/sounds/alsa/Front_Center.wav +``` + +Output files from host testbench: + +| File | Content | +|------|---------| +| `mfcc_s16.raw`, `mfcc_s24.raw`, `mfcc_s32.raw` | Cepstral coefficients | +| `mel_s16.raw`, `mel_s24.raw`, `mel_s32.raw` | Mel spectrogram | + +If the `XTENSA_PATH` environment variable is set, the script also runs +the Xtensa build of the testbench (via `xt-run`) and produces additional +output files prefixed with `xt_`: + +| File | Content | +|------|---------| +| `xt_mfcc_s16.raw`, `xt_mfcc_s24.raw`, `xt_mfcc_s32.raw` | Cepstral coefficients | +| `xt_mel_s16.raw`, `xt_mel_s24.raw`, `xt_mel_s32.raw` | Mel spectrogram | + +## Decoding and Plotting + +All output files can be decoded and plotted at once in Matlab or Octave +with the `decode_all.m` script: + +```matlab +decode_all +``` + +This calls `decode_ceps` for each MFCC file (13 cepstral coefficients) and +`decode_mel` for each Mel file (80 Mel bins), plotting spectrograms for all +files that exist, including the Xtensa variants. + +Individual files can also be decoded manually: + +```matlab +[ceps, t, n] = decode_ceps('mfcc_s16.raw', 13); +``` + +In the above, it is known from the configuration script that MFCC was set up to +output 13 cepstral coefficients from each FFT → Mel → DCT → Cepstral +coefficients computation run. + +The 80-band Mel output can be visualized with the command: + +```matlab +[mel, t, n] = decode_mel('mel_s16.raw', 80); +``` + +## Live Whisper Transcription with DSP VAD + +The directory contains a Python script `sof_mel_to_text_live_dsp_vad.py`. +It can be used with the development topologies +`sof-arl-cs42l43-l0-cs35l56-l23-mfcc.tplg` and +`sof-mtl-rt713-l0-rt1316-l12-mfcc.tplg`. 
It captures Mel audio features and +VAD flags from the default audio device `hw:0,47` (headset microphone). +The captured frames with detected speech are sent to the Whisper speech +recognizer model for conversion to text. + +### Prerequisites + +The script needs OpenVINO. Please follow the install procedure from +<https://docs.openvino.ai>. + +The following Python pip installs are needed in the same OpenVINO venv: + +```bash +pip install openvino openvino-tokenizers openvino-genai +pip install optimum[intel] +pip install transformers +pip install huggingface_hub +``` + +### NPU / GPU Support + +The script by default runs the Whisper encoder model on the NPU. To +use the NPU, install the driver from +<https://github.com/intel/linux-npu-driver>. If the NPU is not +available, change the encoder to CPU with run option `--encoder-device CPU`. +With a GPU both `--encoder-device GPU` and `--decoder-device GPU` can be set. + +### Example run + +Check which capture devices are available. + +```bash +arecord -l +``` + +In this example the devices hw:0,47 and hw:0,48 support the audio +features stream. 
+ +```bash +**** List of CAPTURE Hardware Devices **** +card 0: sofsoundwire [sof-soundwire], device 1: Jack In (*) [] + Subdevices: 1/1 + Subdevice #0: subdevice #0 +card 0: sofsoundwire [sof-soundwire], device 4: Microphone (*) [] + Subdevices: 1/1 + Subdevice #0: subdevice #0 +card 0: sofsoundwire [sof-soundwire], device 47: Jack In Audio Features (*) [] + Subdevices: 1/1 + Subdevice #0: subdevice #0 +card 0: sofsoundwire [sof-soundwire], device 48: Microphone Audio Features (*) [] + Subdevices: 1/1 + Subdevice #0: subdevice #0 +``` + +With the Whisper model run on the CPU and with the internal microphones of a laptop, +the run command is: + +```bash +python3 sof_mel_to_text_live_dsp_vad.py --encoder-device CPU --device hw:0,48 +``` + +The script run output is shown below: + +```bash +=== Live SOF Mel → Whisper Transcription (DSP VAD) === + +Starting capture: arecord -D hw:0,48 -f S32_LE -c 2 -r 16000 -t raw --buffer-size 8192 +VAD source: DSP (embedded in stream) +Silence trigger: 100ms (10 frames) +Whisper model: whisper-medium-int4-ov (encoder: CPU, decoder: CPU) + + [ 0.01s] SILENCE + [ 1.39s] SPEECH + [ 2.57s] SILENCE + [ 2.66s] Transcribing 118 frames (1.2s)... + [Whisper] encoder: 1.30s + [Whisper] decoder: 0.59s (3 tokens) + + >> "Hello computer" +``` diff --git a/src/audio/mfcc/tune/README.txt b/src/audio/mfcc/tune/README.txt deleted file mode 100644 index a0c3189e81a3..000000000000 --- a/src/audio/mfcc/tune/README.txt +++ /dev/null @@ -1,52 +0,0 @@ -This directory contains a tool to create configuration blob for SOF -MFCC component. It's simply run in Matlab or Octave with command -"setup_mfcc". The MFCC configuration parameters can be edited from the -script. - -The configuration can be test run with testbench. First the test topologies -need to be created with "scripts/build-tools.sh -t". Next the testbench -is build with "scripts/rebuild-testbench.sh". - -Once the previous steps are done, a sample wav file can be processed -with script run_mfcc.sh. 
The script converts the input to raw 16 kHz -stereo format and runs the testbench for S16, S24, and S32 bit depths, -producing both cepstral coefficient (MFCC) and Mel spectrogram outputs. - -./run_mfcc.sh /usr/share/sounds/alsa/Front_Center.wav - -Output files from host testbench: - mfcc_s16.raw, mfcc_s24.raw, mfcc_s32.raw - cepstral coefficients - mel_s16.raw, mel_s24.raw, mel_s32.raw - Mel spectrogram - -If the XTENSA_PATH environment variable is set, the script also runs -the Xtensa build of the testbench (via xt-run) and produces additional -output files prefixed with "xt_": - xt_mfcc_s16.raw, xt_mfcc_s24.raw, xt_mfcc_s32.raw - xt_mel_s16.raw, xt_mel_s24.raw, xt_mel_s32.raw - -All output files can be decoded and plotted at once in Matlab or Octave -with the decode_all.m script: - -decode_all - -This calls decode_ceps for each MFCC file (13 cepstral coefficients) and -decode_mel for each Mel file (80 Mel bins), plotting spectrograms for all -files that exist including the Xtensa variants. - -Individual files can also be decoded manually: - -[ceps, t, n] = decode_ceps('mfcc_s16.raw', 13); - -In the above it's known from configuration script that MFCC was set up to -output 13 cepstral coefficients from each FFT -> Mel -> DCT -> Cepstral -coefficients computation run. - -The 80 bands Mel output can be visualized with command: - -[mel, t, n] = decode_mel('mel_s16.raw', 80); - -Other kind of signals have quite big visual difference in audio features. Try -e.g. other sound files found in computer. - -./run_mfcc.sh /usr/share/sounds/gnome/default/alerts/bark.ogg -./run_mfcc.sh /usr/share/sounds/gnome/default/alerts/sonar.ogg diff --git a/src/audio/mfcc/tune/sof_mel_to_text_live_dsp_vad.py b/src/audio/mfcc/tune/sof_mel_to_text_live_dsp_vad.py new file mode 100644 index 000000000000..5c267a57ca79 --- /dev/null +++ b/src/audio/mfcc/tune/sof_mel_to_text_live_dsp_vad.py @@ -0,0 +1,467 @@ +"""Live SOF mel capture with DSP VAD-triggered Whisper transcription. 
+ +Captures mel frames from ALSA with embedded VAD flag from the DSP. +Frame format: [magic(int32), frame_number(uint32), reserved(int32), energy(int32), noise_energy(int32), vad_flag(int32), mel[0..79](int32)] +When silence of 100ms is detected after speech, sends the buffered mel +features to Whisper (OpenVINO encoder+decoder) for transcription. +Capture continues running during Whisper inference. + +Usage: + python sof_mel_to_text_live_dsp_vad.py [--device hw:0,47] [--model whisper-medium-int4-ov] + python sof_mel_to_text_live_dsp_vad.py --plot # with live spectrogram +""" + +import argparse +import os +import struct +import subprocess +import threading +import time +import numpy as np +import openvino as ov +import huggingface_hub as hf_hub +from pathlib import Path + +# Graphics imports deferred until --plot is used +matplotlib = None +plt = None + +# SOF mel_s32.raw format constants (with DSP data header) +SOF_MAGIC_BYTES = struct.pack(' 3: + del buf[:-3] + return None, None + end = idx + SOF_FRAME_BYTES + if end > len(buf): + del buf[:idx] + return None, None + # Parse vad_flag at offset 20 (after magic + frame_number + reserved + energy + noise_energy) + vad_flag = struct.unpack_from('> \"{text}\"\n", flush=True) + else: + print(" [Whisper] empty result", flush=True) + + try: + while True: + data = proc.stdout.read(read_chunk) + if not data: + rc = proc.poll() + if rc is not None: + stderr_out = proc.stderr.read().decode(errors='replace') + print(f"\narecord exited with code {rc}") + if stderr_out: + print(f"stderr: {stderr_out}") + break + continue + + buf.extend(data) + + while True: + vad_flag, frame_ints = find_frame_in_buffer(buf) + if frame_ints is None: + break + + frame_num += 1 + mel = decode_mel_frame(frame_ints) + speech = vad_flag != 0 + + # Print VAD transitions when not plotting + if plotter is None and speech != prev_speech: + t = frame_num * 0.01 + tag = "SPEECH" if speech else "SILENCE" + print(f" [{t:7.2f}s] {tag}", flush=True) + 
prev_speech = speech + + # Update plot + if plotter is not None: + plotter.update(mel, speech) + + # --- Speech buffering logic --- + if speech: + if len(speech_buffer) >= MAX_SPEECH_FRAMES: + n = len(speech_buffer) + duration = n * 0.01 + t = frame_num * 0.01 + print(f" [{t:7.2f}s] Buffer full ({duration:.1f}s), " + f"forcing transcription of {n} frames", + flush=True) + if not transcriber.is_busy(): + frames_copy = list(speech_buffer) + transcriber.transcribe_async( + frames_copy, on_transcription) + else: + print(f" [{t:7.2f}s] (Whisper busy, " + f"dropping {n} frames)", flush=True) + speech_buffer.clear() + speech_buffer.append(mel.copy()) + silence_counter = 0 + was_speaking = True + else: + if was_speaking: + silence_counter += 1 + if silence_counter >= SILENCE_TRIGGER_FRAMES: + n = len(speech_buffer) + duration = n * 0.01 + t = frame_num * 0.01 + + if n < MIN_SPEECH_FRAMES: + # Too short — discard + speech_buffer.clear() + silence_counter = 0 + was_speaking = False + continue + + # Silence threshold reached — send to Whisper + print(f" [{t:7.2f}s] Transcribing {n} frames " + f"({duration:.1f}s)...", flush=True) + + if not transcriber.is_busy(): + frames_copy = list(speech_buffer) + transcriber.transcribe_async( + frames_copy, on_transcription) + else: + print(f" [{t:7.2f}s] (Whisper busy, " + f"dropping {n} frames)", flush=True) + + speech_buffer.clear() + silence_counter = 0 + was_speaking = False + + except (KeyboardInterrupt, BrokenPipeError): + pass + finally: + if proc.poll() is None: + proc.terminate() + try: + proc.wait(timeout=3) + except subprocess.TimeoutExpired: + proc.kill() + proc.wait() + if plotter is not None: + try: + plt.close(plotter.fig) + except Exception: + pass + print("\n\nCapture stopped.") + + +def main(): + parser = argparse.ArgumentParser( + description="Live SOF mel capture with DSP VAD-triggered Whisper transcription") + parser.add_argument('--device', '-D', default='hw:0,47', + help='ALSA capture device (default: hw:0,47)') + 
parser.add_argument('--rate', '-r', type=int, default=16000, + help='Sample rate for arecord (default: 16000)') + parser.add_argument('--model', '-m', default='whisper-medium-int4-ov', + help='Path to Whisper OpenVINO model directory') + parser.add_argument('--encoder-device', default='NPU', + help='OpenVINO device for encoder (default: NPU)') + parser.add_argument('--decoder-device', default='CPU', + help='OpenVINO device for decoder (default: CPU)') + parser.add_argument('--plot', action='store_true', + help='Show live scrolling mel spectrogram and VAD plot') + args = parser.parse_args() + model_id = "OpenVINO/" + os.path.basename(args.model) + if not os.path.isdir(args.model): + print(f"Downloading model {model_id} ...") + hf_hub.snapshot_download(model_id, local_dir=args.model) + + print("=== Live SOF Mel → Whisper Transcription (DSP VAD) ===\n") + run_capture(args.device, args.rate, args.model, args.encoder_device, + args.decoder_device, enable_plot=args.plot) + + +if __name__ == '__main__': + main() From 475346a12d3fad63f8249045adc2a303bd5023c8 Mon Sep 17 00:00:00 2001 From: Seppo Ingalsuo Date: Tue, 19 May 2026 19:42:36 +0300 Subject: [PATCH 3/8] Audio: MFCC: Add VAD switch control notification to user space Add IPC4 notification that sends the VAD state to user space via a switch control whenever the VAD decision changes between speech and silence. The notification is initialized during prepare and sent from the audio processing path on VAD state transitions. The implementation follows the TDFB/sound_dose notification pattern: mfcc_ipc4.c contains the IPC4-specific notification init and send functions, while mfcc_comp.h provides static inline stubs so IPC3 builds compile and link without the IPC4 dependencies. Add handling for SOF_IPC4_SWITCH_CONTROL_PARAM_ID in mfcc_get_config and mfcc_set_config so the kernel driver can read back the current VAD state after receiving a notification. The switch control is read-only from the DSP side. 
Both the notification init and the VAD state change detection are gated on the update_controls flag in the configuration blob struct. Add a switch control (mixer) to the MFCC topology2 widget definition for the VAD notification. Signed-off-by: Seppo Ingalsuo --- src/audio/mfcc/CMakeLists.txt | 3 + src/audio/mfcc/mfcc.c | 48 +++--- src/audio/mfcc/mfcc_common.c | 19 ++- src/audio/mfcc/mfcc_ipc4.c | 140 ++++++++++++++++++ src/include/sof/audio/mfcc/mfcc_comp.h | 53 +++++++ .../include/bench/mfcc_controls_capture.conf | 6 +- .../include/bench/mfcc_controls_playback.conf | 6 +- .../topology2/include/components/mfcc.conf | 22 +++ .../topology2/platform/intel/dmic1-mfcc.conf | 5 +- .../intel/sdw-dmic-audio-feature.conf | 3 + .../intel/sdw-jack-audio-feature.conf | 3 + 11 files changed, 266 insertions(+), 42 deletions(-) create mode 100644 src/audio/mfcc/mfcc_ipc4.c diff --git a/src/audio/mfcc/CMakeLists.txt b/src/audio/mfcc/CMakeLists.txt index 10daf78aa2a6..274c7aa05eb8 100644 --- a/src/audio/mfcc/CMakeLists.txt +++ b/src/audio/mfcc/CMakeLists.txt @@ -5,4 +5,7 @@ if(CONFIG_COMP_MFCC STREQUAL "m" AND DEFINED CONFIG_LLEXT) add_dependencies(app mfcc) else() add_local_sources(sof mfcc.c mfcc_setup.c mfcc_common.c mfcc_generic.c mfcc_hifi4.c mfcc_hifi3.c mfcc_vad.c) + if(CONFIG_IPC_MAJOR_4) + add_local_sources(sof mfcc_ipc4.c) + endif() endif() diff --git a/src/audio/mfcc/mfcc.c b/src/audio/mfcc/mfcc.c index ea09d919009b..e56e3b04a7cc 100644 --- a/src/audio/mfcc/mfcc.c +++ b/src/audio/mfcc/mfcc.c @@ -97,36 +97,14 @@ static int mfcc_free(struct processing_module *mod) struct mfcc_comp_data *cd = module_get_private_data(mod); comp_info(mod->dev, "entry"); + ipc_msg_free(cd->msg); + cd->msg = NULL; mod_data_blob_handler_free(mod, cd->model_handler); mfcc_free_buffers(mod); mod_free(mod, cd); return 0; } -static int mfcc_get_config(struct processing_module *mod, - uint32_t config_id, uint32_t *data_offset_size, - uint8_t *fragment, size_t fragment_size) -{ - struct 
sof_ipc_ctrl_data *cdata = (struct sof_ipc_ctrl_data *)fragment; - struct mfcc_comp_data *cd = module_get_private_data(mod); - - comp_info(mod->dev, "entry"); - - return comp_data_blob_get_cmd(cd->model_handler, cdata, fragment_size); -} - -static int mfcc_set_config(struct processing_module *mod, uint32_t config_id, - enum module_cfg_fragment_position pos, uint32_t data_offset_size, - const uint8_t *fragment, size_t fragment_size, uint8_t *response, - size_t response_size) -{ - struct mfcc_comp_data *cd = module_get_private_data(mod); - - comp_info(mod->dev, "entry"); - - return comp_data_blob_set(cd->model_handler, pos, data_offset_size, - fragment, fragment_size); -} static int mfcc_process(struct processing_module *mod, struct input_stream_buffer *input_buffers, int num_input_buffers, @@ -187,22 +165,30 @@ static int mfcc_prepare(struct processing_module *mod, audio_stream_get_channels(&sourceb->stream)); if (ret < 0) { comp_err(dev, "setup failed."); - goto err; + return ret; } + } else { + comp_err(dev, "configuration is missing."); + return -EINVAL; } cd->mfcc_func = mfcc_find_func(source_format, sink_format, mfcc_fm, ARRAY_SIZE(mfcc_fm)); if (!cd->mfcc_func) { comp_err(dev, "No proc func"); - ret = -EINVAL; - goto err; + return -EINVAL; } - return 0; + /* Initialize VAD switch control notification if enabled */ + if (cd->config->enable_vad && cd->config->update_controls) { + if (!cd->msg) { + ret = mfcc_ipc_notification_init(mod); + if (ret < 0) + return ret; + } + } -err: - comp_set_state(dev, COMP_TRIGGER_RESET); - return ret; + cd->vad_prev = false; + return 0; } static int mfcc_reset(struct processing_module *mod) diff --git a/src/audio/mfcc/mfcc_common.c b/src/audio/mfcc/mfcc_common.c index 70ccdcfa55d6..0e6efc2e8ba4 100644 --- a/src/audio/mfcc/mfcc_common.c +++ b/src/audio/mfcc/mfcc_common.c @@ -29,8 +29,9 @@ LOG_MODULE_REGISTER(mfcc_common, CONFIG_SOF_LOG_LEVEL); * The main processing function for MFCC */ -static int mfcc_stft_process(const struct 
comp_dev *dev, struct mfcc_comp_data *cd) +static int mfcc_stft_process(struct processing_module *mod, struct mfcc_comp_data *cd) { + const struct comp_dev *dev = mod->dev; struct sof_mfcc_config *config = cd->config; struct mfcc_state *state = &cd->state; struct mfcc_buffer *buf = &state->buf; @@ -187,6 +188,16 @@ static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_comp_data * /* Increment hop counter at end of hop processing */ state->hop_count++; + + /* Send notification when VAD state changes */ + if (config->enable_vad && config->update_controls) { + bool vad_now = cd->vad.is_speech; + + if (vad_now != cd->vad_prev) { + mfcc_send_vad_notification(mod, vad_now ? 1 : 0); + cd->vad_prev = vad_now; + } + } } return cc_count; @@ -295,7 +306,7 @@ void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer mfcc_source_copy_s16(bsource, buf, &state->emph, frames, state->source_channel); /* Run STFT and processing after FFT: Mel auditory filter and DCT. */ - num_ceps = mfcc_stft_process(mod->dev, cd); + num_ceps = mfcc_stft_process(mod, cd); /* If new output produced, set up pointer into scratch data and mark header pending */ if (num_ceps > 0) { @@ -400,7 +411,7 @@ void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer mfcc_source_copy_s24(bsource, buf, &state->emph, frames, state->source_channel); /* Run STFT and processing after FFT */ - num_ceps = mfcc_stft_process(mod->dev, cd); + num_ceps = mfcc_stft_process(mod, cd); /* If new output produced, set up pointer into scratch data */ if (num_ceps > 0) { @@ -485,7 +496,7 @@ void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer mfcc_source_copy_s32(bsource, buf, &state->emph, frames, state->source_channel); /* Run STFT and processing after FFT */ - num_ceps = mfcc_stft_process(mod->dev, cd); + num_ceps = mfcc_stft_process(mod, cd); /* If new output produced, set up pointer into scratch data */ if (num_ceps > 0) { diff --git 
a/src/audio/mfcc/mfcc_ipc4.c b/src/audio/mfcc/mfcc_ipc4.c new file mode 100644 index 000000000000..bb20d85e413b --- /dev/null +++ b/src/audio/mfcc/mfcc_ipc4.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: BSD-3-Clause +// +// Copyright(c) 2026 Intel Corporation. +// +// Author: Seppo Ingalsuo + +/** + * \file mfcc_ipc4.c + * \brief IPC4-specific functions for MFCC component. + * + * Provides VAD switch control notification to user space via the + * IPC4 module notification mechanism. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +LOG_MODULE_DECLARE(mfcc, CONFIG_SOF_LOG_LEVEL); + +/** + * \brief Initialize IPC notification message for VAD switch control. + * + * Allocates and configures the IPC message used to send VAD state + * change notifications to user space via a switch control. + */ +int mfcc_ipc_notification_init(struct processing_module *mod) +{ + struct mfcc_comp_data *cd = module_get_private_data(mod); + struct ipc_msg msg_proto; + struct comp_dev *dev = mod->dev; + struct comp_ipc_config *ipc_config = &dev->ipc_config; + union ipc4_notification_header *primary = + (union ipc4_notification_header *)&msg_proto.header; + struct sof_ipc4_notify_module_data *msg_module_data; + struct sof_ipc4_control_msg_payload *msg_payload; + + memset_s(&msg_proto, sizeof(msg_proto), 0, sizeof(msg_proto)); + primary->r.notif_type = SOF_IPC4_MODULE_NOTIFICATION; + primary->r.type = SOF_IPC4_GLB_NOTIFICATION; + primary->r.rsp = SOF_IPC4_MESSAGE_DIR_MSG_REQUEST; + primary->r.msg_tgt = SOF_IPC4_MESSAGE_TARGET_FW_GEN_MSG; + cd->msg = ipc_msg_w_ext_init(msg_proto.header, msg_proto.extension, + sizeof(struct sof_ipc4_notify_module_data) + + sizeof(struct sof_ipc4_control_msg_payload) + + sizeof(struct sof_ipc4_ctrl_value_chan)); + if (!cd->msg) { + comp_err(dev, "Failed to initialize VAD notification"); + return -ENOMEM; + } + + msg_module_data = (struct sof_ipc4_notify_module_data 
*)cd->msg->tx_data; + msg_module_data->instance_id = IPC4_INST_ID(ipc_config->id); + msg_module_data->module_id = IPC4_MOD_ID(ipc_config->id); + msg_module_data->event_id = SOF_IPC4_NOTIFY_MODULE_EVENTID_ALSA_MAGIC_VAL | + SOF_IPC4_SWITCH_CONTROL_PARAM_ID; + msg_module_data->event_data_size = sizeof(struct sof_ipc4_control_msg_payload) + + sizeof(struct sof_ipc4_ctrl_value_chan); + + msg_payload = (struct sof_ipc4_control_msg_payload *)msg_module_data->event_data; + msg_payload->id = MFCC_CTRL_INDEX_VAD; + msg_payload->num_elems = 1; + msg_payload->chanv[0].channel = 0; + + comp_dbg(dev, "VAD notification init: instance_id = 0x%08x, module_id = 0x%08x", + msg_module_data->instance_id, msg_module_data->module_id); + return 0; +} + +/** + * \brief Send VAD switch control notification to user space. + * \param mod Processing module. + * \param val VAD value: 1 = speech, 0 = silence. + */ +void mfcc_send_vad_notification(struct processing_module *mod, uint32_t val) +{ + struct mfcc_comp_data *cd = module_get_private_data(mod); + struct sof_ipc4_notify_module_data *msg_module_data; + struct sof_ipc4_control_msg_payload *msg_payload; + + if (!cd->msg) + return; + + msg_module_data = (struct sof_ipc4_notify_module_data *)cd->msg->tx_data; + msg_payload = (struct sof_ipc4_control_msg_payload *)msg_module_data->event_data; + msg_payload->chanv[0].value = val; + ipc_msg_send(cd->msg, NULL, false); +} + +int mfcc_get_config(struct processing_module *mod, + uint32_t config_id, uint32_t *data_offset_size, + uint8_t *fragment, size_t fragment_size) +{ + struct sof_ipc_ctrl_data *cdata = (struct sof_ipc_ctrl_data *)fragment; + struct mfcc_comp_data *cd = module_get_private_data(mod); + struct sof_ipc4_control_msg_payload *ctl; + + comp_info(mod->dev, "entry"); + + switch (config_id) { + case SOF_IPC4_SWITCH_CONTROL_PARAM_ID: + ctl = (struct sof_ipc4_control_msg_payload *)fragment; + if (ctl->id == MFCC_CTRL_INDEX_VAD && ctl->num_elems == 1) { + ctl->chanv[0].value = cd->vad_prev 
? 1 : 0; + *data_offset_size = sizeof(*ctl) + sizeof(ctl->chanv[0]); + return 0; + } + return -EINVAL; + default: + return comp_data_blob_get_cmd(cd->model_handler, cdata, fragment_size); + } +} + +int mfcc_set_config(struct processing_module *mod, uint32_t config_id, + enum module_cfg_fragment_position pos, uint32_t data_offset_size, + const uint8_t *fragment, size_t fragment_size, uint8_t *response, + size_t response_size) +{ + struct mfcc_comp_data *cd = module_get_private_data(mod); + + comp_info(mod->dev, "entry"); + + switch (config_id) { + case SOF_IPC4_SWITCH_CONTROL_PARAM_ID: + /* VAD switch is read-only, ignore set requests */ + return 0; + default: + return comp_data_blob_set(cd->model_handler, pos, data_offset_size, + fragment, fragment_size); + } +} diff --git a/src/include/sof/audio/mfcc/mfcc_comp.h b/src/include/sof/audio/mfcc/mfcc_comp.h index ed91238b5f7a..4e41c8a4df08 100644 --- a/src/include/sof/audio/mfcc/mfcc_comp.h +++ b/src/include/sof/audio/mfcc/mfcc_comp.h @@ -9,10 +9,12 @@ #define __SOF_AUDIO_MFCC_MFCC_COMP_H__ #include +#include #include #include #include #include +#include #include #include @@ -34,6 +36,9 @@ #define MFCC_FFT_BITS 32 #define MFCC_MAX_SAMPLE_RATE 64000 /* Max sample rate in Hz, limited by int16_t Mel scale */ +/** \brief Switch control index for VAD notification to user space */ +#define MFCC_CTRL_INDEX_VAD 0 + /** * \brief Data header prepended to every MFCC output frame. 
* @@ -137,7 +142,9 @@ struct mfcc_comp_data { struct mfcc_vad_state vad; struct comp_data_blob_handler *model_handler; struct sof_mfcc_config *config; + struct ipc_msg *msg; /**< IPC notification for VAD switch control */ int max_frames; + bool vad_prev; /**< Previous VAD state for edge detection */ mfcc_func mfcc_func; /**< processing function */ }; @@ -192,6 +199,52 @@ void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer struct output_stream_buffer *bsink, int frames); #endif +#if CONFIG_IPC_MAJOR_4 +int mfcc_ipc_notification_init(struct processing_module *mod); + +void mfcc_send_vad_notification(struct processing_module *mod, uint32_t val); + +int mfcc_get_config(struct processing_module *mod, + uint32_t config_id, uint32_t *data_offset_size, + uint8_t *fragment, size_t fragment_size); + +int mfcc_set_config(struct processing_module *mod, uint32_t config_id, + enum module_cfg_fragment_position pos, uint32_t data_offset_size, + const uint8_t *fragment, size_t fragment_size, uint8_t *response, + size_t response_size); + +#else +static inline int mfcc_ipc_notification_init(struct processing_module *mod) +{ + return 0; +} + +static inline void mfcc_send_vad_notification(struct processing_module *mod, uint32_t val) +{ +} + +static inline int mfcc_get_config(struct processing_module *mod, + uint32_t config_id, uint32_t *data_offset_size, + uint8_t *fragment, size_t fragment_size) +{ + struct sof_ipc_ctrl_data *cdata = (struct sof_ipc_ctrl_data *)fragment; + struct mfcc_comp_data *cd = module_get_private_data(mod); + + return comp_data_blob_get_cmd(cd->model_handler, cdata, fragment_size); +} + +static inline int mfcc_set_config(struct processing_module *mod, uint32_t config_id, + enum module_cfg_fragment_position pos, uint32_t data_offset_size, + const uint8_t *fragment, size_t fragment_size, uint8_t *response, + size_t response_size) +{ + struct mfcc_comp_data *cd = module_get_private_data(mod); + + return 
comp_data_blob_set(cd->model_handler, pos, data_offset_size, + fragment, fragment_size); +} +#endif + #ifdef UNIT_TEST void sys_comp_module_mfcc_interface_init(void); #endif diff --git a/tools/topology/topology2/include/bench/mfcc_controls_capture.conf b/tools/topology/topology2/include/bench/mfcc_controls_capture.conf index d45baec1ee8f..8788387ec8c7 100644 --- a/tools/topology/topology2/include/bench/mfcc_controls_capture.conf +++ b/tools/topology/topology2/include/bench/mfcc_controls_capture.conf @@ -9,9 +9,9 @@ "mel80" "include/components/mfcc/mel80.conf" } } - #mixer."1" { - # name '$ANALOG_CAPTURE_PCM MFCC switch or volume' - #} + mixer."1" { + name '$ANALOG_CAPTURE_PCM MFCC switch' + } #enum."1" { # name '$ANALOG_CAPTURE_PCM MFCC enum' #} diff --git a/tools/topology/topology2/include/bench/mfcc_controls_playback.conf b/tools/topology/topology2/include/bench/mfcc_controls_playback.conf index cc2ada04b8d7..007dbb91cd4f 100644 --- a/tools/topology/topology2/include/bench/mfcc_controls_playback.conf +++ b/tools/topology/topology2/include/bench/mfcc_controls_playback.conf @@ -9,9 +9,9 @@ "mel80" "include/components/mfcc/mel80.conf" } } - #mixer."1" { - # name '$ANALOG_PLAYBACK_PCM MFCC switch or volume' - #} + mixer."1" { + name '$ANALOG_PLAYBACK_PCM MFCC switch' + } #enum."1" { # name '$ANALOG_PLAYBACK_PCM MFCC enum' #} diff --git a/tools/topology/topology2/include/components/mfcc.conf b/tools/topology/topology2/include/components/mfcc.conf index 221df8f2d437..775f0d79b1f5 100644 --- a/tools/topology/topology2/include/components/mfcc.conf +++ b/tools/topology/topology2/include/components/mfcc.conf @@ -13,6 +13,8 @@ # # Where M is pipeline ID and N is a unique integer in the parent object. 
+ + Class.Widget."mfcc" { # # Pipeline ID @@ -53,6 +55,26 @@ Class.Widget."mfcc" { unique "instance" } + # + # MFCC Widget switch control to optionally notify VAD state changes + # + Object.Control { + mixer."1" { + Object.Base.channel.1 { + name "fc" + shift 0 + } + Object.Base.ops.1 { + name "ctl" + info "volsw" + #259 binds the mixer control to switch get/put handlers + get 259 + put 259 + } + max 1 + } + } + # # Default attributes for mfcc # diff --git a/tools/topology/topology2/platform/intel/dmic1-mfcc.conf b/tools/topology/topology2/platform/intel/dmic1-mfcc.conf index f3926a283a8b..3aad756a85f5 100644 --- a/tools/topology/topology2/platform/intel/dmic1-mfcc.conf +++ b/tools/topology/topology2/platform/intel/dmic1-mfcc.conf @@ -454,11 +454,14 @@ Object.Widget.mfcc.1 { index $DMIC1_HOST_PIPELINE_ID Object.Control { bytes."1" { - name 'Analog Capture TDFB bytes' + name "Dmic1 Capture MFCC bytes" IncludeByKey.DMIC1_MFCC_PARAMS { "default" "include/components/mfcc/default.conf" } } + mixer."1" { + name "Dmic1 Capture MFCC VAD" + } } IncludeByKey.NUM_DMICS { "1" { diff --git a/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature.conf b/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature.conf index 87039b261597..623574db1784 100644 --- a/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature.conf +++ b/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature.conf @@ -21,6 +21,9 @@ Object.Pipeline.host-gateway-src-mfcc-capture [ name "$SDW_DMIC_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC bytes" } + mixer."1" { + name "$SDW_DMIC_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC VAD" + } } } } diff --git a/tools/topology/topology2/platform/intel/sdw-jack-audio-feature.conf b/tools/topology/topology2/platform/intel/sdw-jack-audio-feature.conf index 9645199d6907..e9652f49c17f 100644 --- a/tools/topology/topology2/platform/intel/sdw-jack-audio-feature.conf +++ b/tools/topology/topology2/platform/intel/sdw-jack-audio-feature.conf @@ -21,6 +21,9 @@ 
Object.Pipeline.host-gateway-src-mfcc-capture [ name "$SDW_JACK_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC bytes" } + mixer."1" { + name "$SDW_JACK_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC VAD" + } } } } From cc94891e8f1b2394d7e72599278998c77d8782ff Mon Sep 17 00:00:00 2001 From: Seppo Ingalsuo Date: Fri, 22 May 2026 10:26:43 +0300 Subject: [PATCH 4/8] Audio: MFCC: Fix memory leak on reset-prepare cycles mfcc_reset() did not free buffers allocated by mfcc_setup(), so a stop->reset->prepare->start cycle would leak all MFCC allocations (FFT buffers, mel filterbank, DCT matrix, lifter, VAD buffers). This patch fixes the issue by calling mfcc_free_buffers() from mfcc_reset(). The pointers are set to NULL after free via a helper function mfcc_free_and_null(), so mfcc_free() won't double-free when it calls mfcc_free_buffers() again later. Signed-off-by: Seppo Ingalsuo --- src/audio/mfcc/mfcc.c | 10 +++++++++- src/audio/mfcc/mfcc_setup.c | 26 ++++++++++++++++++-------- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/src/audio/mfcc/mfcc.c b/src/audio/mfcc/mfcc.c index e56e3b04a7cc..fc81f69c3a10 100644 --- a/src/audio/mfcc/mfcc.c +++ b/src/audio/mfcc/mfcc.c @@ -175,6 +175,7 @@ static int mfcc_prepare(struct processing_module *mod, cd->mfcc_func = mfcc_find_func(source_format, sink_format, mfcc_fm, ARRAY_SIZE(mfcc_fm)); if (!cd->mfcc_func) { comp_err(dev, "No proc func"); + mfcc_free_buffers(mod); return -EINVAL; } @@ -182,8 +183,10 @@ static int mfcc_prepare(struct processing_module *mod, if (cd->config->enable_vad && cd->config->update_controls) { if (!cd->msg) { ret = mfcc_ipc_notification_init(mod); - if (ret < 0) + if (ret < 0) { + mfcc_free_buffers(mod); return ret; + } } } @@ -197,6 +200,11 @@ static int mfcc_reset(struct processing_module *mod) comp_info(mod->dev, "entry"); + /* Free MFCC buffers to prevent leaks on reset->prepare cycles. + * mfcc_free_buffers() NULLs the pointers after free. 
+ */ + mfcc_free_buffers(mod); + /* Reset to similar state as init() */ cd->mfcc_func = NULL; return 0; diff --git a/src/audio/mfcc/mfcc_setup.c b/src/audio/mfcc/mfcc_setup.c index 25c6876344af..73e69f6408a8 100644 --- a/src/audio/mfcc/mfcc_setup.c +++ b/src/audio/mfcc/mfcc_setup.c @@ -396,17 +396,27 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i return ret; } +static void mfcc_free_and_null(struct processing_module *mod, void **ptr) +{ + mod_free(mod, *ptr); + *ptr = NULL; +} + +/* Free the buffers allocated by mfcc_setup(). Each pointer is set to NULL + * after free, so a later call from mfcc_free() does not double-free. + */ void mfcc_free_buffers(struct processing_module *mod) { struct mfcc_comp_data *cd = module_get_private_data(mod); mod_fft_plan_free(mod, cd->state.fft.fft_plan); - mod_free(mod, cd->state.fft.fft_buf); - mod_free(mod, cd->state.fft.fft_out); - mod_free(mod, cd->state.buffers); - mod_free(mod, cd->state.melfb.data); - mod_free(mod, cd->state.dct.matrix); - mod_free(mod, cd->state.lifter.matrix); - mod_free(mod, cd->vad.noise_floor); - mod_free(mod, cd->vad.weights); + cd->state.fft.fft_plan = NULL; + mfcc_free_and_null(mod, (void **)&cd->state.fft.fft_buf); + mfcc_free_and_null(mod, (void **)&cd->state.fft.fft_out); + mfcc_free_and_null(mod, (void **)&cd->state.buffers); + mfcc_free_and_null(mod, (void **)&cd->state.melfb.data); + mfcc_free_and_null(mod, (void **)&cd->state.dct.matrix); + mfcc_free_and_null(mod, (void **)&cd->state.lifter.matrix); + mfcc_free_and_null(mod, (void **)&cd->vad.noise_floor); + mfcc_free_and_null(mod, (void **)&cd->vad.weights); } From 21e7cf381ada78160a8b5e64c9fc67cee758bce2 Mon Sep 17 00:00:00 2001 From: Seppo Ingalsuo Date: Fri, 22 May 2026 17:41:39 +0300 Subject: [PATCH 5/8] audio: mfcc: switch to source/sink API, int32 output, and DTX Switch from process_audio_stream to source/sink API.
Add compress PCM output mode (variable-size frames, no zero padding) alongside legacy mode (full period with zero-fill). Unify all output to int32 Q9.23 regardless of source format. Remove out_data_ptr_32, mel_spectra int16 copy, mfcc_func typedef, and per-format output functions from mfcc_common/hifi3/hifi4. Add DTX for compress mode: suppress silence frames after configurable trailing count, with optional periodic keepalive. Signed-off-by: Seppo Ingalsuo --- src/audio/mfcc/mfcc.c | 97 ++-- src/audio/mfcc/mfcc_common.c | 668 +++++++++++++------------ src/audio/mfcc/mfcc_generic.c | 158 ------ src/audio/mfcc/mfcc_hifi3.c | 186 ------- src/audio/mfcc/mfcc_hifi4.c | 164 ------ src/audio/mfcc/mfcc_setup.c | 18 +- src/include/sof/audio/mfcc/mfcc_comp.h | 64 +-- src/include/user/mfcc.h | 9 +- 8 files changed, 473 insertions(+), 891 deletions(-) diff --git a/src/audio/mfcc/mfcc.c b/src/audio/mfcc/mfcc.c index fc81f69c3a10..971e088cc2cf 100644 --- a/src/audio/mfcc/mfcc.c +++ b/src/audio/mfcc/mfcc.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include #include @@ -36,29 +38,31 @@ LOG_MODULE_REGISTER(mfcc, CONFIG_SOF_LOG_LEVEL); SOF_DEFINE_REG_UUID(mfcc); -__cold_rodata const struct mfcc_func_map mfcc_fm[] = { +/** \brief Source/sink API based source copy function map. 
*/ +struct mfcc_source_func_map { + uint8_t source; + mfcc_source_func func; +}; + +__cold_rodata static const struct mfcc_source_func_map mfcc_sfm[] = { #if CONFIG_FORMAT_S16LE - {SOF_IPC_FRAME_S16_LE, mfcc_s16_default}, -#endif /* CONFIG_FORMAT_S16LE */ + {SOF_IPC_FRAME_S16_LE, mfcc_source_copy_s16}, +#endif #if CONFIG_FORMAT_S24LE - {SOF_IPC_FRAME_S24_4LE, mfcc_s24_default}, -#endif /* CONFIG_FORMAT_S24LE */ + {SOF_IPC_FRAME_S24_4LE, mfcc_source_copy_s24}, +#endif #if CONFIG_FORMAT_S32LE - {SOF_IPC_FRAME_S32_LE, mfcc_s32_default}, -#endif /* CONFIG_FORMAT_S32LE */ + {SOF_IPC_FRAME_S32_LE, mfcc_source_copy_s32}, +#endif }; -static mfcc_func mfcc_find_func(enum sof_ipc_frame source_format, - enum sof_ipc_frame sink_format, - const struct mfcc_func_map *map, - int n) +static mfcc_source_func mfcc_find_source_func(enum sof_ipc_frame source_format) { int i; - /* Find suitable processing function from map. */ - for (i = 0; i < n; i++) { - if (source_format == map[i].source) - return map[i].func; + for (i = 0; i < ARRAY_SIZE(mfcc_sfm); i++) { + if (source_format == mfcc_sfm[i].source) + return mfcc_sfm[i].func; } return NULL; @@ -106,25 +110,38 @@ static int mfcc_free(struct processing_module *mod) } +/** + * \brief Source/sink API based process function for MFCC. + * + * Reads input audio from sof_source, runs the STFT/Mel/DCT stage, and + * delegates output formatting and commit handling to mfcc_common.c. 
+ */ static int mfcc_process(struct processing_module *mod, - struct input_stream_buffer *input_buffers, int num_input_buffers, - struct output_stream_buffer *output_buffers, int num_output_buffers) + struct sof_source **sources, int num_of_sources, + struct sof_sink **sinks, int num_of_sinks) { struct mfcc_comp_data *cd = module_get_private_data(mod); - struct audio_stream *source = input_buffers->data; - struct audio_stream *sink = output_buffers->data; - int frames = input_buffers->size; - - comp_dbg(mod->dev, "start"); - - frames = MIN(frames, cd->max_frames); - cd->mfcc_func(mod, input_buffers, output_buffers, frames); - - /* TODO: use module_update_buffer_position() from #6194 */ - input_buffers->consumed += audio_stream_frame_bytes(source) * frames; - output_buffers->size += audio_stream_frame_bytes(sink) * frames; - comp_dbg(mod->dev, "done"); - return 0; + struct comp_dev *dev = mod->dev; + struct mfcc_state *state = &cd->state; + size_t source_avail; + int frames; + int num_ceps; + + comp_dbg(dev, "start"); + source_avail = source_get_data_frames_available(sources[0]); + frames = MIN(source_avail, cd->max_frames); + if (frames == 0) + return -ENODATA; + + /* Copy input audio from source to MFCC internal circular buffer */ + cd->source_func(sources[0], &state->buf, &state->emph, frames, state->source_channel); + + /* Run STFT and Mel/DCT processing */ + num_ceps = mfcc_stft_process(mod, cd); + if (num_ceps < 0) + return num_ceps; + + return mfcc_process_output(mod, cd, sources, sinks, num_ceps, frames); } static int mfcc_prepare(struct processing_module *mod, @@ -172,13 +189,21 @@ static int mfcc_prepare(struct processing_module *mod, return -EINVAL; } - cd->mfcc_func = mfcc_find_func(source_format, sink_format, mfcc_fm, ARRAY_SIZE(mfcc_fm)); - if (!cd->mfcc_func) { - comp_err(dev, "No proc func"); + cd->source_func = mfcc_find_source_func(source_format); + if (!cd->source_func) { + comp_err(dev, "No source func"); mfcc_free_buffers(mod); return -EINVAL; } 
+ cd->source_format = source_format; + + if (cd->config->compress_output) + comp_info(dev, "compress PCM output mode enabled"); + + if (cd->config->enable_dtx && !cd->config->compress_output) + comp_warn(dev, "enable_dtx ignored in normal PCM mode, only applies to compress"); + /* Initialize VAD switch control notification if enabled */ if (cd->config->enable_vad && cd->config->update_controls) { if (!cd->msg) { @@ -206,7 +231,7 @@ static int mfcc_reset(struct processing_module *mod) mfcc_free_buffers(mod); /* Reset to similar state as init() */ - cd->mfcc_func = NULL; + cd->source_func = NULL; return 0; } @@ -215,7 +240,7 @@ static const struct module_interface mfcc_interface = { .free = mfcc_free, .set_configuration = mfcc_set_config, .get_configuration = mfcc_get_config, - .process_audio_stream = mfcc_process, + .process = mfcc_process, .prepare = mfcc_prepare, .reset = mfcc_reset, }; diff --git a/src/audio/mfcc/mfcc_common.c b/src/audio/mfcc/mfcc_common.c index 0e6efc2e8ba4..4713df3d2566 100644 --- a/src/audio/mfcc/mfcc_common.c +++ b/src/audio/mfcc/mfcc_common.c @@ -7,7 +7,8 @@ #include #include -#include +#include +#include #include #include #include @@ -20,16 +21,154 @@ #include #include #include +#include #include LOG_MODULE_REGISTER(mfcc_common, CONFIG_SOF_LOG_LEVEL); +/* + * Source/sink API based source copy functions. + * These use sof_source API and are compiled on all platforms (generic, HiFi3, HiFi4). 
+ */ + +#if CONFIG_FORMAT_S16LE +void mfcc_source_copy_s16(struct sof_source *source, struct mfcc_buffer *buf, + struct mfcc_pre_emph *emph, int frames, int source_channel) +{ + int16_t const *src_ptr; + int16_t const *src_start; + int src_samples; + int num_channels = source_get_channels(source); + size_t req_bytes = frames * num_channels * sizeof(int16_t); + int16_t *w = buf->w_ptr; + int16_t const *x; + int32_t s; + int ret; + int i; + + ret = source_get_data_s16(source, req_bytes, &src_ptr, &src_start, &src_samples); + if (ret) + return; + + x = src_ptr + source_channel; + for (i = 0; i < frames; i++) { + if (emph->enable) { + s = (int32_t)emph->delay * emph->coef + Q_SHIFT_LEFT(*x, 15, 30); + *w = sat_int16(Q_SHIFT_RND(s, 30, 15)); + emph->delay = *x; + } else { + *w = *x; + } + x += num_channels; + /* Wrap source pointer */ + if (x >= src_start + src_samples) + x -= src_samples; + + w++; + w = mfcc_buffer_wrap(buf, w); + } + + buf->s_avail += frames; + buf->s_free -= frames; + buf->w_ptr = w; + source_release_data(source, req_bytes); +} +#endif /* CONFIG_FORMAT_S16LE */ + +#if CONFIG_FORMAT_S24LE +void mfcc_source_copy_s24(struct sof_source *source, struct mfcc_buffer *buf, + struct mfcc_pre_emph *emph, int frames, int source_channel) +{ + int32_t const *src_ptr; + int32_t const *src_start; + int src_samples; + int num_channels = source_get_channels(source); + size_t req_bytes = frames * num_channels * sizeof(int32_t); + int16_t *w = buf->w_ptr; + int32_t const *x; + int32_t s, tmp; + int ret; + int i; + + ret = source_get_data_s32(source, req_bytes, &src_ptr, &src_start, &src_samples); + if (ret) + return; + + x = src_ptr + source_channel; + for (i = 0; i < frames; i++) { + if (emph->enable) { + s = (int32_t)((uint32_t)*x << 8); + tmp = (int32_t)emph->delay * emph->coef + Q_SHIFT(s, 31, 30); + *w = sat_int16(Q_SHIFT_RND(tmp, 30, 15)); + emph->delay = sat_int16(Q_SHIFT_RND(s, 31, 15)); + } else { + s = (int32_t)((uint32_t)*x << 8); + *w = 
sat_int16(Q_SHIFT_RND(s, 31, 15)); + } + x += num_channels; + if (x >= src_start + src_samples) + x -= src_samples; + + w++; + w = mfcc_buffer_wrap(buf, w); + } + + buf->s_avail += frames; + buf->s_free -= frames; + buf->w_ptr = w; + source_release_data(source, req_bytes); +} +#endif /* CONFIG_FORMAT_S24LE */ + +#if CONFIG_FORMAT_S32LE +void mfcc_source_copy_s32(struct sof_source *source, struct mfcc_buffer *buf, + struct mfcc_pre_emph *emph, int frames, int source_channel) +{ + int32_t const *src_ptr; + int32_t const *src_start; + int src_samples; + int num_channels = source_get_channels(source); + size_t req_bytes = frames * num_channels * sizeof(int32_t); + int16_t *w = buf->w_ptr; + int32_t const *x; + int32_t s; + int ret; + int i; + + ret = source_get_data_s32(source, req_bytes, &src_ptr, &src_start, &src_samples); + if (ret) + return; + + x = src_ptr + source_channel; + for (i = 0; i < frames; i++) { + if (emph->enable) { + s = (int32_t)emph->delay * emph->coef + Q_SHIFT(*x, 31, 30); + *w = sat_int16(Q_SHIFT_RND(s, 30, 15)); + emph->delay = sat_int16(Q_SHIFT_RND(*x, 31, 15)); + } else { + *w = sat_int16(Q_SHIFT_RND(*x, 31, 15)); + } + x += num_channels; + if (x >= src_start + src_samples) + x -= src_samples; + + w++; + w = mfcc_buffer_wrap(buf, w); + } + + buf->s_avail += frames; + buf->s_free -= frames; + buf->w_ptr = w; + source_release_data(source, req_bytes); +} +#endif /* CONFIG_FORMAT_S32LE */ + /* * The main processing function for MFCC */ -static int mfcc_stft_process(struct processing_module *mod, struct mfcc_comp_data *cd) +int mfcc_stft_process(struct processing_module *mod, struct mfcc_comp_data *cd) { const struct comp_dev *dev = mod->dev; struct sof_mfcc_config *config = cd->config; @@ -147,11 +286,6 @@ static int mfcc_stft_process(struct processing_module *mod, struct mfcc_comp_dat sat_int32(Q_MULTSR_32X32(s, config->mel_scale, 23, 12, 23)); } - /* Store Q9.7 version in mel_spectra for s16 output mode */ - for (j = 0; j < state->dct.num_in; 
j++) - state->mel_spectra->data[j] = - sat_int16(state->mel_log_32[j] >> 16); - /* Enable this to check mmax decay */ comp_dbg(dev, "state->mmax = %d", state->mmax); } else { @@ -203,357 +337,277 @@ static int mfcc_stft_process(struct processing_module *mod, struct mfcc_comp_dat return cc_count; } -void mfcc_fill_fft_buffer(struct mfcc_state *state) +/** + * \brief Write bytes into a possibly wrapped sink buffer. + */ +static size_t mfcc_sink_write_bytes(uint8_t **dst, uint8_t *buf_start, + size_t buf_size, const uint8_t *src, + size_t max_bytes) { - struct mfcc_buffer *buf = &state->buf; - struct mfcc_fft *fft = &state->fft; - int32_t *d = &fft->fft_buf[fft->fft_fill_start_idx].real; - const int fft_elem_inc = sizeof(fft->fft_buf[0]) / sizeof(int32_t); - int16_t *prev = state->prev_data; - int16_t *prev_end = prev + state->prev_data_size; - int16_t *r = buf->r_ptr; - int copied; - int nmax; - int n; - int j; + uint8_t *buf_end = buf_start + buf_size; + size_t chunk; - /* Copy overlapped samples from state buffer. The fft_buf has been - * cleared by caller so imaginary part remains zero. 
- */ - while (prev < prev_end) { - *d = *prev++; - d += fft_elem_inc; - } + if (max_bytes == 0) + return 0; - /* Copy hop size of new data from circular buffer */ - for (copied = 0; copied < fft->fft_hop_size; copied += n) { - nmax = fft->fft_hop_size - copied; - n = mfcc_buffer_samples_without_wrap(buf, r); - n = MIN(n, nmax); - for (j = 0; j < n; j++) { - *d = *r++; - d += fft_elem_inc; - } - r = mfcc_buffer_wrap(buf, r); + chunk = MIN(max_bytes, (size_t)(buf_end - *dst)); + memcpy(*dst, src, chunk); + if (chunk < max_bytes) { + memcpy(buf_start, src + chunk, max_bytes - chunk); + *dst = buf_start + (max_bytes - chunk); + } else { + *dst += chunk; + if (*dst >= buf_end) + *dst = buf_start; } - buf->s_avail -= copied; - buf->s_free += copied; - buf->r_ptr = r; - - /* Copy for next time data back to overlap buffer */ - d = (int32_t *)&fft->fft_buf[fft->fft_fill_start_idx + fft->fft_hop_size].real; - prev = state->prev_data; - while (prev < prev_end) { - *prev++ = *d; - d += fft_elem_inc; - } + return max_bytes; } -#if CONFIG_FORMAT_S16LE -static int16_t *mfcc_sink_copy_zero_s16(const struct audio_stream *sink, int16_t *w_ptr, - int samples) +/** + * \brief Prepare the next MFCC output frame after STFT processing. + */ +static void mfcc_prepare_output(struct mfcc_state *state, int num_ceps) { - int copied; - int nmax; - int n; - - for (copied = 0; copied < samples; copied += n) { - nmax = samples - copied; - n = audio_stream_samples_without_wrap_s16(sink, w_ptr); - n = MIN(n, nmax); - memset(w_ptr, 0, n * sizeof(int16_t)); - w_ptr = audio_stream_wrap(sink, w_ptr + n); - } + int k; - return w_ptr; -} + if (num_ceps <= 0) + return; -static int16_t *mfcc_sink_copy_data_s16(const struct audio_stream *sink, int16_t *w_ptr, - int samples, int16_t *r_ptr) -{ - int copied; - int nmax; - int n; + if (state->mel_only) { + state->out_data_ptr = state->mel_log_32; + } else { + /* Widen int16 Q9.7 cepstral coefficients to int32 Q9.23. 
+ * Safe to copy forward: cepstral_coef is in fft_out while + * mel_log_32 is in fft_buf (separate scratch buffers). + */ + for (k = 0; k < num_ceps; k++) + state->mel_log_32[k] = (int32_t)state->cepstral_coef->data[k] << 16; - for (copied = 0; copied < samples; copied += n) { - nmax = samples - copied; - n = audio_stream_samples_without_wrap_s16(sink, w_ptr); - n = MIN(n, nmax); - /* Not using memcpy_s() due to speed need */ - memcpy(w_ptr, r_ptr, n * sizeof(int16_t)); - w_ptr = audio_stream_wrap(sink, w_ptr + n); - r_ptr += n; + state->out_data_ptr = state->mel_log_32; } - return w_ptr; + state->out_remain = num_ceps; + state->header_pending = true; } -void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer *bsource, - struct output_stream_buffer *bsink, int frames) +/** + * \brief Commit MFCC output in compress mode. + */ +static int mfcc_output_compress(struct processing_module *mod, struct mfcc_comp_data *cd, + struct sof_sink **sinks, int num_ceps) { - struct audio_stream *sink = bsink->data; - struct mfcc_comp_data *cd = module_get_private_data(mod); + struct comp_dev *dev = mod->dev; struct mfcc_state *state = &cd->state; - struct mfcc_buffer *buf = &cd->state.buf; - int16_t *w_ptr = audio_stream_get_wptr(sink); - const int num_header_s16 = sizeof(state->header) / sizeof(int16_t); - int num_ceps; - int sink_samples; - int to_copy; - - /* Get samples from source buffer */ - mfcc_source_copy_s16(bsource, buf, &state->emph, frames, state->source_channel); - - /* Run STFT and processing after FFT: Mel auditory filter and DCT. 
*/ - num_ceps = mfcc_stft_process(mod, cd); - - /* If new output produced, set up pointer into scratch data and mark header pending */ - if (num_ceps > 0) { - if (state->mel_only) { - state->out_data_ptr = state->mel_spectra->data; - } else { - state->out_data_ptr = state->cepstral_coef->data; + size_t out_bytes; + size_t commit_bytes; + void *sink_ptr; + void *sink_start; + size_t sink_buf_size; + int ret; + + if (num_ceps <= 0) + return 0; + + out_bytes = sizeof(state->header) + num_ceps * sizeof(int32_t); + + if (cd->config->enable_vad && !cd->vad.is_speech) { + state->vad_silence_count++; + /* With DTX enabled, send trailing silence frames + * (configurable count) then suppress. After trailing + * frames, optionally send periodic silence updates + * at the configured interval. This gives the host + * enough silence to detect end-of-speech while + * still sending keep-alive frames in long silence. + * Without DTX, output every frame regardless of VAD. + */ + if (cd->config->enable_dtx) { + if (state->vad_silence_count > state->dtx_trailing_silence) { + /* Check periodic silence frame send */ + if (state->dtx_silence_interval > 0) { + state->dtx_silence_counter++; + if (state->dtx_silence_counter >= state->dtx_silence_interval) { + state->dtx_silence_counter = 0; + goto send_frame; + } + } + state->header_pending = false; + state->out_remain = 0; + return 0; + } } - - state->out_remain = num_ceps; - state->header_pending = true; + } else { + state->vad_silence_count = 0; + state->dtx_silence_counter = 0; } - /* Write to sink, limited by period size */ - sink_samples = frames * audio_stream_get_channels(sink); +send_frame: + commit_bytes = out_bytes; - /* Write data header first if pending */ - if (state->header_pending) { - if (sink_samples < num_header_s16) { - /* Not enough sink space for header, defer entire frame */ - mfcc_sink_copy_zero_s16(sink, w_ptr, sink_samples); - return; - } - - w_ptr = mfcc_sink_copy_data_s16(sink, w_ptr, num_header_s16, - (int16_t
*)&state->header); - sink_samples -= num_header_s16; - state->header_pending = false; - } + if (sink_get_free_size(sinks[0]) < commit_bytes) + return -ENOSPC; - /* Write cepstral/mel data from scratch buffer */ - to_copy = MIN(state->out_remain, sink_samples); - if (to_copy > 0) { - w_ptr = mfcc_sink_copy_data_s16(sink, w_ptr, to_copy, state->out_data_ptr); - state->out_data_ptr += to_copy; - state->out_remain -= to_copy; - sink_samples -= to_copy; - } - - /* Zero-fill remaining sink samples */ - w_ptr = mfcc_sink_copy_zero_s16(sink, w_ptr, sink_samples); -} -#endif /* CONFIG_FORMAT_S16LE */ + ret = sink_get_buffer(sinks[0], commit_bytes, &sink_ptr, + &sink_start, &sink_buf_size); + if (ret) + return ret; -#if CONFIG_FORMAT_S24LE || CONFIG_FORMAT_S32LE -static int32_t *mfcc_sink_copy_zero_s32(const struct audio_stream *sink, int32_t *w_ptr, - int samples) -{ - int copied; - int nmax; - int n; + { + uint8_t *dst = sink_ptr; - for (copied = 0; copied < samples; copied += n) { - nmax = samples - copied; - n = audio_stream_samples_without_wrap_s32(sink, w_ptr); - n = MIN(n, nmax); - memset(w_ptr, 0, n * sizeof(int32_t)); - w_ptr = audio_stream_wrap(sink, w_ptr + n); + mfcc_sink_write_bytes(&dst, sink_start, sink_buf_size, + (uint8_t *)&state->header, sizeof(state->header)); + mfcc_sink_write_bytes(&dst, sink_start, sink_buf_size, + (uint8_t *)state->out_data_ptr, + num_ceps * sizeof(int32_t)); } - return w_ptr; -} + state->header_pending = false; + state->out_remain = 0; -static int32_t *mfcc_sink_copy_data_s32(const struct audio_stream *sink, int32_t *w_ptr, - int samples, int32_t *r_ptr) -{ - int copied; - int nmax; - int n; - - for (copied = 0; copied < samples; copied += n) { - nmax = samples - copied; - n = audio_stream_samples_without_wrap_s32(sink, w_ptr); - n = MIN(n, nmax); - /* Not using memcpy_s() due to speed need */ - memcpy(w_ptr, r_ptr, n * sizeof(int32_t)); - w_ptr = audio_stream_wrap(sink, w_ptr + n); - r_ptr += n; - } - - return w_ptr; + 
sink_commit_buffer(sinks[0], commit_bytes); + comp_dbg(dev, "done, produced %zu bytes", commit_bytes); + return 0; } -#endif /* CONFIG_FORMAT_S24LE || CONFIG_FORMAT_S32LE */ -#if CONFIG_FORMAT_S24LE -void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer *bsource, - struct output_stream_buffer *bsink, int frames) +/** + * \brief Commit MFCC output in legacy PCM mode. + */ +static int mfcc_output_legacy(struct processing_module *mod, struct mfcc_comp_data *cd, + struct sof_source **sources, struct sof_sink **sinks, + int frames) { - struct audio_stream *sink = bsink->data; - struct mfcc_comp_data *cd = module_get_private_data(mod); + struct comp_dev *dev = mod->dev; struct mfcc_state *state = &cd->state; - struct mfcc_buffer *buf = &cd->state.buf; - int32_t *w_ptr = audio_stream_get_wptr(sink); - const int num_header_s32 = sizeof(state->header) / sizeof(int32_t); - int num_ceps; - int sink_samples; - int remain_s32; - int to_copy; - int k; - - /* Get samples from source buffer */ - mfcc_source_copy_s24(bsource, buf, &state->emph, frames, state->source_channel); - - /* Run STFT and processing after FFT */ - num_ceps = mfcc_stft_process(mod, cd); - - /* If new output produced, set up pointer into scratch data */ - if (num_ceps > 0) { - if (state->mel_only) { - /* Convert mel_log_32 from Q9.23 to Q9.15 in-place */ - for (k = 0; k < num_ceps; k++) - state->mel_log_32[k] >>= 8; - - state->out_data_ptr_32 = state->mel_log_32; - } else { - state->out_data_ptr = state->cepstral_coef->data; + size_t commit_bytes; + void *sink_ptr; + void *sink_start; + size_t sink_buf_size; + int ret; + + commit_bytes = sink_get_frame_bytes(sinks[0]); + commit_bytes *= frames; + + if (sink_get_free_size(sinks[0]) < commit_bytes) + return -ENOSPC; + + ret = sink_get_buffer(sinks[0], commit_bytes, &sink_ptr, + &sink_start, &sink_buf_size); + if (ret) + return ret; + + /* Zero-fill entire period first */ + { + size_t bytes_to_end = (size_t)((uint8_t *)sink_start + 
sink_buf_size - + (uint8_t *)sink_ptr); + + if (bytes_to_end >= commit_bytes) + memset(sink_ptr, 0, commit_bytes); + else { + memset(sink_ptr, 0, bytes_to_end); + memset(sink_start, 0, commit_bytes - bytes_to_end); } - - state->out_remain = num_ceps; - state->header_pending = true; } - /* Write to sink, limited by period size */ - sink_samples = frames * audio_stream_get_channels(sink); + { + uint8_t *dst = sink_ptr; + size_t avail = commit_bytes; - /* Write data header first if pending */ - if (state->header_pending) { - if (sink_samples < num_header_s32) { - /* Not enough sink space for header, defer entire frame */ - mfcc_sink_copy_zero_s32(sink, w_ptr, sink_samples); - return; - } + /* Write pending header */ + if (state->header_pending && avail > 0) { + size_t hdr_size = sizeof(state->header); - w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_header_s32, - (int32_t *)&state->header); - sink_samples -= num_header_s32; - state->header_pending = false; - } - - if (state->mel_only) { - /* Write 32-bit mel data Q9.15, one value per int32_t */ - to_copy = MIN(state->out_remain, sink_samples); - if (to_copy > 0) { - w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, to_copy, - state->out_data_ptr_32); - state->out_data_ptr_32 += to_copy; - state->out_remain -= to_copy; - sink_samples -= to_copy; + if (avail >= hdr_size) { + mfcc_sink_write_bytes(&dst, sink_start, sink_buf_size, + (uint8_t *)&state->header, hdr_size); + avail -= hdr_size; + state->header_pending = false; + } } - } else { - /* Write cepstral data packed as int32_t from scratch buffer */ - remain_s32 = (state->out_remain + 1) / 2; - to_copy = MIN(remain_s32, sink_samples); - if (to_copy > 0) { - w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, to_copy, - (int32_t *)state->out_data_ptr); - state->out_data_ptr += to_copy * 2; - state->out_remain -= to_copy * 2; - if (state->out_remain < 0) - state->out_remain = 0; - sink_samples -= to_copy; + /* Write pending feature data (always int32) */ + if (state->out_remain 
> 0 && avail > 0) { + size_t data_bytes; + size_t to_write; + + data_bytes = state->out_remain * sizeof(int32_t); + to_write = MIN(data_bytes, avail) & ~(size_t)3; + if (to_write > 0) { + int n32; + + mfcc_sink_write_bytes(&dst, sink_start, sink_buf_size, + (uint8_t *)state->out_data_ptr, + to_write); + n32 = to_write / sizeof(int32_t); + state->out_data_ptr += n32; + state->out_remain -= n32; + } } } - /* Zero-fill remaining sink samples */ - w_ptr = mfcc_sink_copy_zero_s32(sink, w_ptr, sink_samples); + sink_commit_buffer(sinks[0], commit_bytes); + comp_dbg(dev, "done, produced %zu bytes", commit_bytes); + return 0; } -#endif /* CONFIG_FORMAT_S24LE */ -#if CONFIG_FORMAT_S32LE -void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer *bsource, - struct output_stream_buffer *bsink, int frames) +int mfcc_process_output(struct processing_module *mod, struct mfcc_comp_data *cd, + struct sof_source **sources, struct sof_sink **sinks, + int num_ceps, int frames) { - struct audio_stream *sink = bsink->data; - struct mfcc_comp_data *cd = module_get_private_data(mod); - struct mfcc_state *state = &cd->state; - struct mfcc_buffer *buf = &cd->state.buf; - int32_t *w_ptr = audio_stream_get_wptr(sink); - const int num_header_s32 = sizeof(state->header) / sizeof(int32_t); - int num_ceps; - int sink_samples; - int remain_s32; - int to_copy; - - /* Get samples from source buffer */ - mfcc_source_copy_s32(bsource, buf, &state->emph, frames, state->source_channel); - - /* Run STFT and processing after FFT */ - num_ceps = mfcc_stft_process(mod, cd); - - /* If new output produced, set up pointer into scratch data */ - if (num_ceps > 0) { - if (state->mel_only) { - state->out_data_ptr_32 = state->mel_log_32; - } else { - state->out_data_ptr = state->cepstral_coef->data; - } + if (num_ceps > 0) + mfcc_prepare_output(&cd->state, num_ceps); - state->out_remain = num_ceps; - state->header_pending = true; - } + if (cd->config->compress_output) + return 
mfcc_output_compress(mod, cd, sinks, num_ceps); - /* Write to sink, limited by period size */ - sink_samples = frames * audio_stream_get_channels(sink); + return mfcc_output_legacy(mod, cd, sources, sinks, frames); +} - /* Write data header first if pending */ - if (state->header_pending) { - if (sink_samples < num_header_s32) { - /* Not enough sink space for header, defer entire frame */ - mfcc_sink_copy_zero_s32(sink, w_ptr, sink_samples); - return; - } +void mfcc_fill_fft_buffer(struct mfcc_state *state) +{ + struct mfcc_buffer *buf = &state->buf; + struct mfcc_fft *fft = &state->fft; + int32_t *d = &fft->fft_buf[fft->fft_fill_start_idx].real; + const int fft_elem_inc = sizeof(fft->fft_buf[0]) / sizeof(int32_t); + int16_t *prev = state->prev_data; + int16_t *prev_end = prev + state->prev_data_size; + int16_t *r = buf->r_ptr; + int copied; + int nmax; + int n; + int j; - w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_header_s32, - (int32_t *)&state->header); - sink_samples -= num_header_s32; - state->header_pending = false; + /* Copy overlapped samples from state buffer. The fft_buf has been + * cleared by caller so imaginary part remains zero. 
+ */ + while (prev < prev_end) { + *d = *prev++; + d += fft_elem_inc; } - if (state->mel_only) { - /* Write 32-bit mel data Q9.23, one value per int32_t */ - to_copy = MIN(state->out_remain, sink_samples); - if (to_copy > 0) { - w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, to_copy, - state->out_data_ptr_32); - state->out_data_ptr_32 += to_copy; - state->out_remain -= to_copy; - sink_samples -= to_copy; - } - } else { - /* Write cepstral data packed as int32_t from scratch buffer */ - remain_s32 = (state->out_remain + 1) / 2; - to_copy = MIN(remain_s32, sink_samples); - if (to_copy > 0) { - w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, to_copy, - (int32_t *)state->out_data_ptr); - state->out_data_ptr += to_copy * 2; - state->out_remain -= to_copy * 2; - if (state->out_remain < 0) - state->out_remain = 0; - - sink_samples -= to_copy; + /* Copy hop size of new data from circular buffer */ + for (copied = 0; copied < fft->fft_hop_size; copied += n) { + nmax = fft->fft_hop_size - copied; + n = mfcc_buffer_samples_without_wrap(buf, r); + n = MIN(n, nmax); + for (j = 0; j < n; j++) { + *d = *r++; + d += fft_elem_inc; } + r = mfcc_buffer_wrap(buf, r); } - /* Zero-fill remaining sink samples */ - w_ptr = mfcc_sink_copy_zero_s32(sink, w_ptr, sink_samples); + buf->s_avail -= copied; + buf->s_free += copied; + buf->r_ptr = r; + + /* Copy for next time data back to overlap buffer */ + d = (int32_t *)&fft->fft_buf[fft->fft_fill_start_idx + fft->fft_hop_size].real; + prev = state->prev_data; + while (prev < prev_end) { + *prev++ = *d; + d += fft_elem_inc; + } } -#endif /* CONFIG_FORMAT_S32LE */ + + diff --git a/src/audio/mfcc/mfcc_generic.c b/src/audio/mfcc/mfcc_generic.c index 73ac49272ed4..d5eaf65ba091 100644 --- a/src/audio/mfcc/mfcc_generic.c +++ b/src/audio/mfcc/mfcc_generic.c @@ -8,7 +8,6 @@ #ifdef MFCC_GENERIC #include -#include #include #include #include @@ -64,161 +63,4 @@ void mfcc_apply_window(struct mfcc_state *state, int input_shift) fft->fft_buf[i + j].real = 
(fft->fft_buf[i + j].real * state->window[j]) << s; } -#if CONFIG_FORMAT_S16LE -void mfcc_source_copy_s16(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, - struct mfcc_pre_emph *emph, int frames, int source_channel) -{ - struct audio_stream *source = bsource->data; - int32_t s; - int16_t *x0; - int16_t *x = audio_stream_get_rptr(source); - int16_t *w = buf->w_ptr; - int copied; - int nmax; - int n1; - int n2; - int n; - int i; - int num_channels = audio_stream_get_channels(source); - - /* Copy from source to pre-buffer for FFT. - * The pre-emphasis filter is done in this step. - */ - for (copied = 0; copied < frames; copied += n) { - nmax = frames - copied; - n1 = audio_stream_frames_without_wrap(source, x); - n2 = mfcc_buffer_samples_without_wrap(buf, w); - n = MIN(n1, n2); - n = MIN(n, nmax); - x0 = x + source_channel; - for (i = 0; i < n; i++) { - if (emph->enable) { - /* Q1.15 x Q1.15 -> Q2.30 */ - s = (int32_t)emph->delay * emph->coef + Q_SHIFT_LEFT(*x0, 15, 30); - *w = sat_int16(Q_SHIFT_RND(s, 30, 15)); - emph->delay = *x0; - } else { - *w = *x0; - } - x0 += num_channels; - w++; - } - - x = audio_stream_wrap(source, x + n * audio_stream_get_channels(source)); - w = mfcc_buffer_wrap(buf, w); - } - buf->s_avail += copied; - buf->s_free -= copied; - buf->w_ptr = w; -} -#endif /* CONFIG_FORMAT_S16LE */ - -#if CONFIG_FORMAT_S24LE - -void mfcc_source_copy_s24(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, - struct mfcc_pre_emph *emph, int frames, int source_channel) -{ - struct audio_stream *source = bsource->data; - int32_t tmp, s; - int32_t *x0; - int32_t *x = audio_stream_get_rptr(source); - int16_t *w = buf->w_ptr; - int copied; - int nmax; - int n1; - int n2; - int n; - int i; - int num_channels = audio_stream_get_channels(source); - - /* Copy from source to pre-buffer for FFT. - * The pre-emphasis filter is done in this step. 
- * S24_4LE data is in 32-bit container, shift left by 8 to Q1.31, - * then convert to Q1.15 with rounding. - */ - for (copied = 0; copied < frames; copied += n) { - nmax = frames - copied; - n1 = audio_stream_frames_without_wrap(source, x); - n2 = mfcc_buffer_samples_without_wrap(buf, w); - n = MIN(n1, n2); - n = MIN(n, nmax); - x0 = x + source_channel; - for (i = 0; i < n; i++) { - if (emph->enable) { - /* Convert to Q1.31, ignore highest byte */ - s = (int32_t)((uint32_t)*x0 << 8); - /* Q1.15 x Q1.15 -> Q2.30 */ - tmp = (int32_t)emph->delay * emph->coef + Q_SHIFT(s, 31, 30); - *w = sat_int16(Q_SHIFT_RND(tmp, 30, 15)); - emph->delay = sat_int16(Q_SHIFT_RND(s, 31, 15)); - } else { - /* Convert to Q1.31, ignore highest byte */ - s = (int32_t)((uint32_t)*x0 << 8); - *w = sat_int16(Q_SHIFT_RND(s, 31, 15)); - } - x0 += num_channels; - w++; - } - - x = audio_stream_wrap(source, x + n * audio_stream_get_channels(source)); - w = mfcc_buffer_wrap(buf, w); - } - buf->s_avail += copied; - buf->s_free -= copied; - buf->w_ptr = w; -} - -#endif /* CONFIG_FORMAT_S24LE */ - -#if CONFIG_FORMAT_S32LE - -void mfcc_source_copy_s32(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, - struct mfcc_pre_emph *emph, int frames, int source_channel) -{ - struct audio_stream *source = bsource->data; - int32_t s; - int32_t *x0; - int32_t *x = audio_stream_get_rptr(source); - int16_t *w = buf->w_ptr; - int copied; - int nmax; - int n1; - int n2; - int n; - int i; - int num_channels = audio_stream_get_channels(source); - - /* Copy from source to pre-buffer for FFT. - * The pre-emphasis filter is done in this step. - * S32 data is in 32-bit container, shift right by 16 to get 16-bit. 
- */ - for (copied = 0; copied < frames; copied += n) { - nmax = frames - copied; - n1 = audio_stream_frames_without_wrap(source, x); - n2 = mfcc_buffer_samples_without_wrap(buf, w); - n = MIN(n1, n2); - n = MIN(n, nmax); - x0 = x + source_channel; - for (i = 0; i < n; i++) { - if (emph->enable) { - /* Q1.15 x Q1.15 -> Q2.30 */ - s = (int32_t)emph->delay * emph->coef + Q_SHIFT(*x0, 31, 30); - *w = sat_int16(Q_SHIFT_RND(s, 30, 15)); - emph->delay = sat_int16(Q_SHIFT_RND(*x0, 31, 15)); - } else { - *w = sat_int16(Q_SHIFT_RND(*x0, 31, 15)); - } - x0 += num_channels; - w++; - } - - x = audio_stream_wrap(source, x + n * audio_stream_get_channels(source)); - w = mfcc_buffer_wrap(buf, w); - } - buf->s_avail += copied; - buf->s_free -= copied; - buf->w_ptr = w; -} -#endif /* CONFIG_FORMAT_S32LE */ - #endif /* MFCC_GENERIC */ diff --git a/src/audio/mfcc/mfcc_hifi3.c b/src/audio/mfcc/mfcc_hifi3.c index 80c384ad6c64..8b6a01e1f40d 100644 --- a/src/audio/mfcc/mfcc_hifi3.c +++ b/src/audio/mfcc/mfcc_hifi3.c @@ -9,7 +9,6 @@ #ifdef MFCC_HIFI3 #include -#include #include #include #include @@ -35,66 +34,6 @@ static inline void set_circular_buf0(const void *start, const void *end) * MFCC algorithm code */ -#if CONFIG_FORMAT_S16LE -void mfcc_source_copy_s16(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, - struct mfcc_pre_emph *emph, int frames, int source_channel) -{ - struct audio_stream *source = bsource->data; - int copied; - int nmax; - int n; - int i; - int num_channels = audio_stream_get_channels(source); - ae_int16 *in; - ae_int16 *x = (ae_int16 *)audio_stream_get_rptr(source); - ae_int16 *out = (ae_int16 *)buf->w_ptr; - ae_int16x4 sample; - ae_int32x2 temp; - ae_int16x4 coef = emph->coef; - ae_int16x4 delay; - const int in_inc = sizeof(ae_int16) * num_channels; - - /* Copy from source to pre-buffer for FFT. - * The pre-emphasis filter is done in this step. 
- */ - for (copied = 0; copied < frames; copied += n) { - nmax = frames - copied; - n = audio_stream_frames_without_wrap(source, x); - n = MIN(n, nmax); - nmax = mfcc_buffer_samples_without_wrap(buf, (int16_t *)out); - n = MIN(n, nmax); - in = x + source_channel; - if (emph->enable) { - delay = emph->delay; - for (i = 0; i < n; i++) { - AE_L16_XP(sample, in, in_inc); - /* Q1.15 -> Q1.31 */ - temp = AE_CVT32X2F16_10(sample); - AE_MULAF16SS_00(temp, delay, coef); - delay = sample; - sample = AE_ROUND16X4F32SSYM(temp, temp); - /* 2 = sizeof(ae_int16)*/ - AE_S16_0_IP(sample, out, 2); - } - emph->delay = delay; - - } else { - for (i = 0; i < n; i++) { - AE_L16_XP(sample, in, in_inc); - /* 2 = sizeof(ae_int16)*/ - AE_S16_0_IP(sample, out, 2); - } - } - - x = audio_stream_wrap(source, x + n * num_channels); - out = (ae_int16 *)mfcc_buffer_wrap(buf, (int16_t *)out); - } - buf->s_avail += copied; - buf->s_free -= copied; - buf->w_ptr = (int16_t *)out; -} -#endif /* CONFIG_FORMAT_S16LE */ - void mfcc_fill_prev_samples(struct mfcc_buffer *buf, int16_t *prev_data, int prev_data_length) { @@ -152,129 +91,4 @@ void mfcc_apply_window(struct mfcc_state *state, int input_shift) } } -#if CONFIG_FORMAT_S24LE -void mfcc_source_copy_s24(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, - struct mfcc_pre_emph *emph, int frames, int source_channel) -{ - struct audio_stream *source = bsource->data; - int copied; - int nmax; - int n; - int i; - int num_channels = audio_stream_get_channels(source); - ae_int32 *in; - ae_int32 *x = (ae_int32 *)audio_stream_get_rptr(source); - ae_int16 *out = (ae_int16 *)buf->w_ptr; - ae_int32x2 sample32; - ae_int16x4 sample; - ae_int32x2 temp; - ae_int16x4 coef = emph->coef; - ae_int16x4 delay; - const int in_inc = sizeof(ae_int32) * num_channels; - - for (copied = 0; copied < frames; copied += n) { - nmax = frames - copied; - n = audio_stream_frames_without_wrap(source, x); - n = MIN(n, nmax); - nmax = mfcc_buffer_samples_without_wrap(buf, 
(int16_t *)out); - n = MIN(n, nmax); - in = x + source_channel; - if (emph->enable) { - delay = emph->delay; - for (i = 0; i < n; i++) { - AE_L32_XP(sample32, in, in_inc); - /* Shift left by 8 to sign-extend to Q1.31 */ - sample32 = AE_SLAI32(sample32, 8); - /* Then shift right by 16 to get 16-bit */ - sample32 = AE_SRAI32(sample32, 16); - sample = AE_SAT16X4(sample32, sample32); - /* Q1.15 -> Q1.31 */ - temp = AE_CVT32X2F16_10(sample); - AE_MULAF16SS_00(temp, delay, coef); - delay = sample; - sample = AE_ROUND16X4F32SSYM(temp, temp); - AE_S16_0_IP(sample, out, 2); - } - emph->delay = delay; - } else { - for (i = 0; i < n; i++) { - AE_L32_XP(sample32, in, in_inc); - /* Shift left by 8 to sign-extend to Q1.31 */ - sample32 = AE_SLAI32(sample32, 8); - /* Then shift right by 16 to get 16-bit */ - sample32 = AE_SRAI32(sample32, 16); - sample = AE_SAT16X4(sample32, sample32); - AE_S16_0_IP(sample, out, 2); - } - } - - x = audio_stream_wrap(source, x + n * num_channels); - out = (ae_int16 *)mfcc_buffer_wrap(buf, (int16_t *)out); - } - buf->s_avail += copied; - buf->s_free -= copied; - buf->w_ptr = (int16_t *)out; -} -#endif /* CONFIG_FORMAT_S24LE */ - -#if CONFIG_FORMAT_S32LE -void mfcc_source_copy_s32(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, - struct mfcc_pre_emph *emph, int frames, int source_channel) -{ - struct audio_stream *source = bsource->data; - int copied; - int nmax; - int n; - int i; - int num_channels = audio_stream_get_channels(source); - ae_int32 *in; - ae_int32 *x = (ae_int32 *)audio_stream_get_rptr(source); - ae_int16 *out = (ae_int16 *)buf->w_ptr; - ae_int32x2 sample32; - ae_int16x4 sample; - ae_int32x2 temp; - ae_int16x4 coef = emph->coef; - ae_int16x4 delay; - const int in_inc = sizeof(ae_int32) * num_channels; - - for (copied = 0; copied < frames; copied += n) { - nmax = frames - copied; - n = audio_stream_frames_without_wrap(source, x); - n = MIN(n, nmax); - nmax = mfcc_buffer_samples_without_wrap(buf, (int16_t *)out); - n = 
MIN(n, nmax); - in = x + source_channel; - if (emph->enable) { - delay = emph->delay; - for (i = 0; i < n; i++) { - AE_L32_XP(sample32, in, in_inc); - /* S32: shift right by 16 to get 16-bit */ - sample32 = AE_SRAI32(sample32, 16); - sample = AE_SAT16X4(sample32, sample32); - /* Q1.15 -> Q1.31 */ - temp = AE_CVT32X2F16_10(sample); - AE_MULAF16SS_00(temp, delay, coef); - delay = sample; - sample = AE_ROUND16X4F32SSYM(temp, temp); - AE_S16_0_IP(sample, out, 2); - } - emph->delay = delay; - } else { - for (i = 0; i < n; i++) { - AE_L32_XP(sample32, in, in_inc); - sample32 = AE_SRAI32(sample32, 16); - sample = AE_SAT16X4(sample32, sample32); - AE_S16_0_IP(sample, out, 2); - } - } - - x = audio_stream_wrap(source, x + n * num_channels); - out = (ae_int16 *)mfcc_buffer_wrap(buf, (int16_t *)out); - } - buf->s_avail += copied; - buf->s_free -= copied; - buf->w_ptr = (int16_t *)out; -} -#endif /* CONFIG_FORMAT_S32LE */ - #endif /* MFCC_HIFI3 */ diff --git a/src/audio/mfcc/mfcc_hifi4.c b/src/audio/mfcc/mfcc_hifi4.c index 63986870793b..8cd956fcb079 100644 --- a/src/audio/mfcc/mfcc_hifi4.c +++ b/src/audio/mfcc/mfcc_hifi4.c @@ -9,7 +9,6 @@ #ifdef MFCC_HIFI4 #include -#include #include #include #include @@ -31,66 +30,10 @@ static inline void set_circular_buf0(const void *start, const void *end) AE_SETCEND0(end); } -/* Setup circular for buffer 1 */ -static inline void set_circular_buf1(const void *start, const void *end) -{ - AE_SETCBEGIN1(start); - AE_SETCEND1(end); -} - /* * MFCC algorithm code */ -#if CONFIG_FORMAT_S16LE -void mfcc_source_copy_s16(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, - struct mfcc_pre_emph *emph, int frames, int source_channel) -{ - struct audio_stream *source = bsource->data; - int num_channels = audio_stream_get_channels(source); - ae_int16 *in = (ae_int16 *)source->r_ptr + source_channel; - ae_int16 *out = (ae_int16 *)buf->w_ptr; - ae_int16x4 sample; - ae_int32x2 temp; - ae_int16x4 coef; - ae_int16x4 delay; - const int in_inc = 
sizeof(ae_int16) * num_channels; - const int out_inc = sizeof(ae_int16); - int i; - - set_circular_buf1(buf->addr, buf->end_addr); - set_circular_buf0(source->addr, source->end_addr); - - /* Copy from source to pre-buffer for FFT. - * The pre-emphasis filter is done in this step. - */ - if (emph->enable) { - delay = emph->delay; - coef = emph->coef; - for (i = 0; i < frames; i++) { - AE_L16_XC(sample, in, in_inc); - - /* Q1.15 -> Q1.31 */ - temp = AE_CVT32X2F16_10(sample); - AE_MULAF16SS_00(temp, delay, coef); - delay = sample; - sample = AE_ROUND16X4F32SSYM(temp, temp); - AE_S16_0_XC1(sample, out, out_inc); - } - emph->delay = delay; - } else { - for (i = 0; i < frames; i++) { - AE_L16_XC(sample, in, in_inc); - AE_S16_0_XC1(sample, out, out_inc); - } - } - - buf->s_avail += frames; - buf->s_free -= frames; - buf->w_ptr = (int16_t *)out; -} -#endif /* CONFIG_FORMAT_S16LE */ - void mfcc_fill_prev_samples(struct mfcc_buffer *buf, int16_t *prev_data, int prev_data_length) { @@ -148,111 +91,4 @@ void mfcc_apply_window(struct mfcc_state *state, int input_shift) } } -#if CONFIG_FORMAT_S24LE -void mfcc_source_copy_s24(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, - struct mfcc_pre_emph *emph, int frames, int source_channel) -{ - struct audio_stream *source = bsource->data; - int num_channels = audio_stream_get_channels(source); - ae_int32 *in = (ae_int32 *)source->r_ptr + source_channel; - ae_int16 *out = (ae_int16 *)buf->w_ptr; - ae_int32x2 sample32; - ae_int16x4 sample; - ae_int32x2 temp; - ae_int16x4 coef; - ae_int16x4 delay; - const int in_inc = sizeof(ae_int32) * num_channels; - const int out_inc = sizeof(ae_int16); - int i; - - set_circular_buf1(buf->addr, buf->end_addr); - set_circular_buf0(source->addr, source->end_addr); - - if (emph->enable) { - delay = emph->delay; - coef = emph->coef; - for (i = 0; i < frames; i++) { - AE_L32_XC(sample32, in, in_inc); - /* Shift left by 8 to sign-extend to Q1.31 */ - sample32 = AE_SLAI32(sample32, 8); - /* Then 
shift right by 16 to get 16-bit */ - sample32 = AE_SRAI32(sample32, 16); - sample = AE_SAT16X4(sample32, sample32); - /* Q1.15 -> Q1.31 */ - temp = AE_CVT32X2F16_10(sample); - AE_MULAF16SS_00(temp, delay, coef); - delay = sample; - sample = AE_ROUND16X4F32SSYM(temp, temp); - AE_S16_0_XC1(sample, out, out_inc); - } - emph->delay = delay; - } else { - for (i = 0; i < frames; i++) { - AE_L32_XC(sample32, in, in_inc); - /* Shift left by 8 to sign-extend to Q1.31 */ - sample32 = AE_SLAI32(sample32, 8); - /* Then shift right by 16 to get 16-bit */ - sample32 = AE_SRAI32(sample32, 16); - sample = AE_SAT16X4(sample32, sample32); - AE_S16_0_XC1(sample, out, out_inc); - } - } - - buf->s_avail += frames; - buf->s_free -= frames; - buf->w_ptr = (int16_t *)out; -} -#endif /* CONFIG_FORMAT_S24LE */ - -#if CONFIG_FORMAT_S32LE -void mfcc_source_copy_s32(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, - struct mfcc_pre_emph *emph, int frames, int source_channel) -{ - struct audio_stream *source = bsource->data; - int num_channels = audio_stream_get_channels(source); - ae_int32 *in = (ae_int32 *)source->r_ptr + source_channel; - ae_int16 *out = (ae_int16 *)buf->w_ptr; - ae_int32x2 sample32; - ae_int16x4 sample; - ae_int32x2 temp; - ae_int16x4 coef; - ae_int16x4 delay; - const int in_inc = sizeof(ae_int32) * num_channels; - const int out_inc = sizeof(ae_int16); - int i; - - set_circular_buf1(buf->addr, buf->end_addr); - set_circular_buf0(source->addr, source->end_addr); - - if (emph->enable) { - delay = emph->delay; - coef = emph->coef; - for (i = 0; i < frames; i++) { - AE_L32_XC(sample32, in, in_inc); - /* S32: shift right by 16 to get 16-bit */ - sample32 = AE_SRAI32(sample32, 16); - sample = AE_SAT16X4(sample32, sample32); - /* Q1.15 -> Q1.31 */ - temp = AE_CVT32X2F16_10(sample); - AE_MULAF16SS_00(temp, delay, coef); - delay = sample; - sample = AE_ROUND16X4F32SSYM(temp, temp); - AE_S16_0_XC1(sample, out, out_inc); - } - emph->delay = delay; - } else { - for (i = 0; 
i < frames; i++) { - AE_L32_XC(sample32, in, in_inc); - sample32 = AE_SRAI32(sample32, 16); - sample = AE_SAT16X4(sample32, sample32); - AE_S16_0_XC1(sample, out, out_inc); - } - } - - buf->s_avail += frames; - buf->s_free -= frames; - buf->w_ptr = (int16_t *)out; -} -#endif /* CONFIG_FORMAT_S32LE */ - #endif /* MFCC_HIFI4 */ diff --git a/src/audio/mfcc/mfcc_setup.c b/src/audio/mfcc/mfcc_setup.c index 73e69f6408a8..cc673d29b0da 100644 --- a/src/audio/mfcc/mfcc_setup.c +++ b/src/audio/mfcc/mfcc_setup.c @@ -335,15 +335,18 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i /* Check that output data can be drained within the periods spanned by one * FFT hop. Each hop consumes fft_hop_size input samples and produces - * max_out_per_hop + 12 (magic header) int16_t output values. The sink provides - * at least fft_hop_size * channels int16_t samples per hop (worst case s16). + * max_out_per_hop + header int32_t output values. The sink provides + * at least fft_hop_size * channels int32_t samples per hop (worst case s32). * If output exceeds this, data accumulates and will eventually overflow. + * This check is not needed in compress output mode where only actual data + * bytes are committed without zero padding. 
*/ - int out_per_hop = max_out_per_hop + sizeof(state->header) / sizeof(int16_t); + int out_per_hop = max_out_per_hop + sizeof(state->header) / sizeof(int32_t); int sink_per_hop = fft->fft_hop_size * channels; + bool skip_size_check = config->compress_output; - if (out_per_hop > sink_per_hop) { - comp_err(dev, "Output %d int16 per hop exceeds sink capacity %d (hop %d x ch %d)", + if (!skip_size_check && out_per_hop > sink_per_hop) { + comp_err(dev, "Output %d int32 per hop exceeds sink capacity %d (hop %d x ch %d)", out_per_hop, sink_per_hop, fft->fft_hop_size, channels); ret = -EINVAL; goto free_lifter; @@ -357,8 +360,11 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i memset(&state->header, 0, sizeof(state->header)); state->header.magic = MFCC_MAGIC; state->out_data_ptr = NULL; - state->out_data_ptr_32 = NULL; state->out_remain = 0; + state->vad_silence_count = 0; + state->dtx_trailing_silence = config->dtx_trailing_silence_hops; + state->dtx_silence_interval = config->dtx_silence_hops_interval; + state->dtx_silence_counter = 0; if (config->enable_vad) { ret = mfcc_vad_init(&cd->vad, config->num_mel_bins, sample_rate, mod); diff --git a/src/include/sof/audio/mfcc/mfcc_comp.h b/src/include/sof/audio/mfcc/mfcc_comp.h index 4e41c8a4df08..b380cd84fdf0 100644 --- a/src/include/sof/audio/mfcc/mfcc_comp.h +++ b/src/include/sof/audio/mfcc/mfcc_comp.h @@ -54,18 +54,6 @@ struct mfcc_data_header { int32_t vad_flag; /**< VAD decision: 1 = speech, 0 = silence */ }; -/** \brief Type definition for processing function select return value. */ -typedef void (*mfcc_func)(struct processing_module *mod, - struct input_stream_buffer *bsource, - struct output_stream_buffer *bsink, - int frames); - -/** \brief MFCC processing functions map item. 
*/ -struct mfcc_func_map { - uint8_t source; /**< source frame format */ - mfcc_func func; /**< processing function */ -}; - struct mfcc_buffer { int16_t *addr; int16_t *end_addr; @@ -82,6 +70,10 @@ struct mfcc_pre_emph { int enable; }; +/** \brief Type definition for source/sink based input copy function. */ +typedef void (*mfcc_source_func)(struct sof_source *source, struct mfcc_buffer *buf, + struct mfcc_pre_emph *emph, int frames, int source_channel); + struct mfcc_fft { struct icomplex32 *fft_buf; /**< fft_padded_size */ struct icomplex32 *fft_out; /**< fft_padded_size */ @@ -130,10 +122,13 @@ struct mfcc_state { bool header_pending; /**< True when data header not yet written for current output */ struct mfcc_data_header header; /**< Data header for current output frame */ size_t sample_buffers_size; /**< bytes */ - int16_t *out_data_ptr; /**< Read pointer into scratch data for multi-period output */ - int32_t *out_data_ptr_32; /**< Read pointer for 32-bit mel-only output */ - int out_remain; /**< Remaining int16_t samples to write to sink from scratch */ + int32_t *out_data_ptr; /**< Read pointer into scratch data for multi-period output */ + int out_remain; /**< Remaining int32_t samples to write to sink from scratch */ uint32_t hop_count; /**< FFT hop counter, increments every processed hop */ + int vad_silence_count; /**< Consecutive VAD=0 hops since last speech */ + int16_t dtx_trailing_silence; /**< Number of trailing silence hops to send, from config */ + int16_t dtx_silence_interval; /**< Send silence frame every Nth hop, 0 = disable */ + int dtx_silence_counter; /**< Counter for periodic silence frame send */ }; /* MFCC component private data */ @@ -144,8 +139,9 @@ struct mfcc_comp_data { struct sof_mfcc_config *config; struct ipc_msg *msg; /**< IPC notification for VAD switch control */ int max_frames; + enum sof_ipc_frame source_format; /**< Source audio format for output sizing */ bool vad_prev; /**< Previous VAD state for edge detection */ - 
mfcc_func mfcc_func; /**< processing function */ + mfcc_source_func source_func; /**< source copy function */ }; static inline int mfcc_buffer_samples_without_wrap(struct mfcc_buffer *buffer, int16_t *ptr) @@ -172,31 +168,37 @@ void mfcc_fill_fft_buffer(struct mfcc_state *state); void mfcc_apply_window(struct mfcc_state *state, int input_shift); -#if CONFIG_FORMAT_S16LE +/** + * \brief Run STFT and Mel/DCT processing. + * \return Number of output coefficients produced, or 0 if not enough data. + */ +int mfcc_stft_process(struct processing_module *mod, struct mfcc_comp_data *cd); -void mfcc_source_copy_s16(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, - struct mfcc_pre_emph *emph, int frames, int source_channel); +/** + * \brief Prepare and commit MFCC output data after STFT processing. + * + * This handles the output data conversion and dispatches to either the + * compress-output or legacy PCM-output path. + * + * \return 0 on success or a negative error code. + */ +int mfcc_process_output(struct processing_module *mod, struct mfcc_comp_data *cd, + struct sof_source **sources, struct sof_sink **sinks, + int num_ceps, int frames); -void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer *bsource, - struct output_stream_buffer *bsink, int frames); +#if CONFIG_FORMAT_S16LE +void mfcc_source_copy_s16(struct sof_source *source, struct mfcc_buffer *buf, + struct mfcc_pre_emph *emph, int frames, int source_channel); #endif #if CONFIG_FORMAT_S24LE - -void mfcc_source_copy_s24(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, +void mfcc_source_copy_s24(struct sof_source *source, struct mfcc_buffer *buf, struct mfcc_pre_emph *emph, int frames, int source_channel); - -void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer *bsource, - struct output_stream_buffer *bsink, int frames); #endif #if CONFIG_FORMAT_S32LE - -void mfcc_source_copy_s32(struct input_stream_buffer *bsource, struct mfcc_buffer 
*buf, +void mfcc_source_copy_s32(struct sof_source *source, struct mfcc_buffer *buf, struct mfcc_pre_emph *emph, int frames, int source_channel); - -void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer *bsource, - struct output_stream_buffer *bsink, int frames); #endif #if CONFIG_IPC_MAJOR_4 diff --git a/src/include/user/mfcc.h b/src/include/user/mfcc.h index a2f3717daa52..286ee4f5e985 100644 --- a/src/include/user/mfcc.h +++ b/src/include/user/mfcc.h @@ -54,7 +54,9 @@ struct sof_mfcc_config { int16_t mel_scale; /**< Q4.12 default 1.0, use 0.25 for Whisper */ int16_t mmax_init; /**< Q8.7 default 0, with dynamic_mmax false, can sim. Whisper mmax */ int16_t mmax_coef; /**< Q1.15 decay coefficient for dynamic mmax, a small value for slow */ - uint32_t reserved[6]; + uint16_t dtx_trailing_silence_hops; /**< DTX: number of silence hops to send after speech, 0 = send first only */ + uint16_t dtx_silence_hops_interval; /**< DTX: send silence frame every Nth hop during VAD=0, 0 = disable */ + uint32_t reserved[5]; int32_t sample_frequency; /**< Hz. e.g. 16000 */ int32_t pmin; /**< Q1.31 linear power, limit minimum Mel energy, e.g. 
1e-9 */ enum sof_mfcc_mel_log_type mel_log; /**< Use MEL_LOG_IS_LOG, LOG10 or DB*/ @@ -87,9 +89,10 @@ struct sof_mfcc_config { bool use_energy; /**< Must be false (0) */ bool dynamic_mmax; /**< Track max Mel value for clamp with top_db value */ bool enable_vad; /**< Run VAD algorithm */ - bool enable_dtx; /**< Reserved (stream once per second non-speech frames) */ + bool enable_dtx; /**< Discontinuous transmission: suppress silence after trailing frames */ bool update_controls; /**< Update controls with VAD decision */ - bool reserved_bool[5]; /* Reserved for future boolean flags, set to false (0) */ + bool compress_output; /**< Use compress PCM output: variable size, no zero padding */ + bool reserved_bool[4]; /* Reserved for future boolean flags, set to false (0) */ } __attribute__((packed)); #endif /* __USER_MFCC_H__ */ From 4048931d6b23a022a56d900c52e0ceab50900254 Mon Sep 17 00:00:00 2001 From: Seppo Ingalsuo Date: Tue, 26 May 2026 15:44:06 +0300 Subject: [PATCH 6/8] base_fw: advertise BESPOKE codec for MFCC compress capture Register SND_AUDIOCODEC_BESPOKE capture in codec info TLV when CONFIG_COMP_MFCC is enabled so the kernel detects compress capture support via IPC4_SOF_CODEC_INFO. 
Signed-off-by: Seppo Ingalsuo --- src/audio/base_fw.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/audio/base_fw.c b/src/audio/base_fw.c index b86db469765a..c5a874e41c54 100644 --- a/src/audio/base_fw.c +++ b/src/audio/base_fw.c @@ -100,6 +100,10 @@ static void get_codec_info(struct sof_tlv **tuple) codec_info.items[codec_info.count++] = SET_CODEC_INFO_ITEM(SND_AUDIOCODEC_VORBIS, SOF_IPC_STREAM_PLAYBACK); #endif +#ifdef CONFIG_COMP_MFCC + codec_info.items[codec_info.count++] = + SET_CODEC_INFO_ITEM(SND_AUDIOCODEC_BESPOKE, SOF_IPC_STREAM_CAPTURE); +#endif if (!codec_info.count) return; From acad938f65407b9a74f219bd6c1bfd886668241b Mon Sep 17 00:00:00 2001 From: Seppo Ingalsuo Date: Tue, 26 May 2026 15:44:23 +0300 Subject: [PATCH 7/8] audio: mfcc: update decode tools and add Python compress scripts Update Octave decode scripts for int32 Q9.23 output and DTX gap filling. Add DTX blob generation to setup_mfcc.m. Add Python compress capture tools: sof_mel_spectrogram_compress.py, sof_ceps_spectrogram_compress.py, sof_mel_to_text_live_compress.py. Refactor sof_mel_to_text_live_dsp_vad.py to use shared compress capture code. Add README with usage examples. 
Signed-off-by: Seppo Ingalsuo --- src/audio/mfcc/tune/README.md | 37 ++ src/audio/mfcc/tune/decode_all.m | 13 +- src/audio/mfcc/tune/decode_ceps.m | 97 ++-- src/audio/mfcc/tune/decode_mel.m | 141 +++-- src/audio/mfcc/tune/setup_mfcc.m | 41 +- .../tune/sof_ceps_spectrogram_compress.py | 234 +++++++++ .../mfcc/tune/sof_mel_spectrogram_compress.py | 227 +++++++++ .../tune/sof_mel_to_text_live_compress.py | 482 ++++++++++++++++++ .../mfcc/tune/sof_mel_to_text_live_dsp_vad.py | 91 +--- .../components/mfcc/ceps13_compress_dtx.conf | 24 + .../include/components/mfcc/default.conf | 2 +- .../include/components/mfcc/mel80.conf | 2 +- .../components/mfcc/mel80_compress.conf | 24 + .../components/mfcc/mel80_compress_dtx.conf | 24 + 14 files changed, 1227 insertions(+), 212 deletions(-) create mode 100644 src/audio/mfcc/tune/sof_ceps_spectrogram_compress.py create mode 100644 src/audio/mfcc/tune/sof_mel_spectrogram_compress.py create mode 100644 src/audio/mfcc/tune/sof_mel_to_text_live_compress.py create mode 100644 tools/topology/topology2/include/components/mfcc/ceps13_compress_dtx.conf create mode 100644 tools/topology/topology2/include/components/mfcc/mel80_compress.conf create mode 100644 tools/topology/topology2/include/components/mfcc/mel80_compress_dtx.conf diff --git a/src/audio/mfcc/tune/README.md b/src/audio/mfcc/tune/README.md index e50b74d84785..f825afb758a6 100644 --- a/src/audio/mfcc/tune/README.md +++ b/src/audio/mfcc/tune/README.md @@ -150,3 +150,40 @@ Whisper model: whisper-medium-int4-ov (encoder: CPU, decoder: CPU) >> "Hello computer" ``` + +## Live Spectrogram Viewers + +### Mel Spectrogram + +The `sof_mel_spectrogram_compress.py` script captures Mel spectrogram +frames from a SOF compress PCM device and displays them as a live +scrolling spectrogram with VAD status. This is a lightweight viewer +that does not run Whisper inference. 
+ +```bash +python3 sof_mel_spectrogram_compress.py --card 0 --device 48 --width 300 +``` + +### Cepstral Spectrogram + +The `sof_ceps_spectrogram_compress.py` script is similar but displays +cepstral coefficients (MFCC) instead of Mel bands. + +```bash +python3 sof_ceps_spectrogram_compress.py --card 0 --device 48 --num-ceps 13 --width 300 +``` + +## Live Whisper Transcription with Compress PCM + +The `sof_mel_to_text_live_compress.py` script captures Mel spectrogram +frames from a SOF compress PCM device and performs live Whisper +transcription using OpenVINO. Unlike `sof_mel_to_text_live_dsp_vad.py`, +which uses `arecord`, this script reads directly from the compress PCM +device with DTX-aware frame handling. + +```bash +python3 sof_mel_to_text_live_compress.py --card 0 --device 48 --model whisper-medium-int4-ov +``` + +The same OpenVINO prerequisites and pip packages apply as described above +for `sof_mel_to_text_live_dsp_vad.py`. diff --git a/src/audio/mfcc/tune/decode_all.m b/src/audio/mfcc/tune/decode_all.m index f5c7e1a06db4..4c377bf5029a 100644 --- a/src/audio/mfcc/tune/decode_all.m +++ b/src/audio/mfcc/tune/decode_all.m @@ -6,12 +6,11 @@ num_ceps = 13; num_mel = 80; -% MFCC cepstral output files +% MFCC cepstral output files (all int32 output, Q9.23) ceps_files = {'mfcc_s16.raw', 'mfcc_s24.raw', 'mfcc_s32.raw'}; -% Mel output files with corresponding format +% Mel output files (all int32 output, Q9.23) mel_files = {'mel_s16.raw', 'mel_s24.raw', 'mel_s32.raw'}; -mel_fmts = {'s16', 's24', 's32'}; % Xtensa prefixed variants xt_ceps_files = {'xt_mfcc_s16.raw', 'xt_mfcc_s24.raw', 'xt_mfcc_s32.raw'}; @@ -19,21 +18,21 @@ all_ceps_files = [ceps_files, xt_ceps_files]; all_mel_files = [mel_files, xt_mel_files]; -all_mel_fmts = [mel_fmts, mel_fmts]; for i = 1:length(all_ceps_files) fn = all_ceps_files{i}; if exist(fn, 'file') fprintf('Decoding MFCC ceps: %s\n', fn); - [ceps, t, n, vad, energy, noise_energy, frame_num] = decode_ceps(fn, num_ceps); + [ceps, t, n, vad,
energy, noise_energy, frame_num] = ... + decode_ceps(fn, num_ceps); end end for i = 1:length(all_mel_files) fn = all_mel_files{i}; - fmt = all_mel_fmts{i}; if exist(fn, 'file') fprintf('Decoding Mel: %s\n', fn); - [mel, t, n, vad, energy, noise_energy, frame_num] = decode_mel(fn, num_mel, fmt); + [mel, t, n, vad, energy, noise_energy, frame_num] = ... + decode_mel(fn, num_mel); end end diff --git a/src/audio/mfcc/tune/decode_ceps.m b/src/audio/mfcc/tune/decode_ceps.m index c094ced7c0e1..480eadea2945 100644 --- a/src/audio/mfcc/tune/decode_ceps.m +++ b/src/audio/mfcc/tune/decode_ceps.m @@ -1,9 +1,10 @@ -% [ceps, t, n, vad, energy, noise_energy, frame_number] = decode_ceps(fn, num_ceps, num_channels) +% [ceps, t, n, vad, energy, noise_energy, frame_number] = decode_ceps(fn, num_ceps, hop, num_channels) % % Input % fn - File with MFCC data in .raw or .wav format % num_ceps - number of cepstral coefficients per frame -% num_channels - needed for .raw format, omit for .wav +% hop - STFT hop in seconds, defaults to 10e-3 for 10 ms +% num_channels - needed for .raw format, omit for .wav, default 1 % % Outputs % ceps - cepstral coefficients @@ -18,39 +19,34 @@ % Copyright(c) 2022-2026 Intel Corporation. All rights reserved. function [ceps, t, n, vad, energy, noise_energy, frame_number] = ... 
- decode_ceps(fn, num_ceps, num_channels) + decode_ceps(fn, num_ceps, hop, num_channels) if nargin < 3 + hop = 10e-3; +end +if nargin < 4 num_channels = 1; end % MFCC stream -fs = 16e3; -qformat = 7; -magic = [25443 28006]; % ASCII 'mfcc' as int16 -num_magic = 2; % magic word is 2 x int16 +qformat = 23; % Q9.23 in int32 +magic = int32(1835426659); % 0x6D666363 as int32 +num_magic = 1; % magic word is 1 x int32 -% Load output data +% Load output data (always int32) [data, num_channels] = get_file(fn, num_channels); -idx1 = find(data == magic(1)); -idx = []; -for i = 1:length(idx1) - if data(idx1(i) + 1) == magic(2) - idx = [idx idx1(i)]; - end -end +idx = find(data == magic); if isempty(idx) error('No magic value markers found from stream'); end -period_ceps = idx(2)-idx(1); num_frames = length(idx); % Header after magic is [frame_number, reserved, energy, noise_energy, vad_flag] -% as int32 (10 int16 slots), followed by num_ceps coefficients. -payload_len = 10 + num_ceps; % 5 int32 = 10 int16, then ceps data +% as int32, followed by num_ceps coefficients (int32). +payload_len = 5 + num_ceps; % Last frame can be incomplete due to span over multiple periods last = idx(end) + num_magic + payload_len - 1; @@ -58,10 +54,6 @@ num_frames = num_frames - 1; end -t_ceps = period_ceps / num_channels / fs; -t = (0:num_frames -1) * t_ceps; -n = 1:num_ceps; - payload = zeros(payload_len, num_frames); for i = 1:num_frames i1 = idx(i) + num_magic; @@ -69,14 +61,51 @@ payload(:,i) = double(data(i1:i2)); end -% Reassemble int32 from pairs of int16 (little-endian). -% Low half must be treated as unsigned with mod() to handle negative int16. 
-frame_number = mod(payload(1,:), 65536) + payload(2,:) * 65536; -% payload(3:4,:) is reserved, skip -energy = (mod(payload(5,:), 65536) + payload(6,:) * 65536) / 2^23; -noise_energy = (mod(payload(7,:), 65536) + payload(8,:) * 65536) / 2^23; -vad = mod(payload(9,:), 65536) + payload(10,:) * 65536; -ceps = payload(11:payload_len, :) / 2^qformat; +frame_number = payload(1, :); +% payload(2,:) is reserved, skip +energy = payload(3, :) / 2^23; +noise_energy = payload(4, :) / 2^23; +vad = payload(5, :); +ceps = payload(6:payload_len, :) / 2^qformat; + +% Fill gaps from DTX-suppressed VAD=0 frames to create continuous timeline. +% Missing frames are filled with the minimum ceps value found in the data. +first_frame = frame_number(1); +last_frame = frame_number(end); +total_frames = last_frame - first_frame + 1; +if total_frames > num_frames + ceps_fill = min(ceps(:)); + ceps_full = ones(num_ceps, total_frames) * ceps_fill; + vad_full = zeros(1, total_frames); + energy_full = zeros(1, total_frames); + noise_energy_full = zeros(1, total_frames); + frame_number_full = first_frame:last_frame; + has_data = false(1, total_frames); + for i = 1:num_frames + fi = frame_number(i) - first_frame + 1; + ceps_full(:, fi) = ceps(:, i); + vad_full(fi) = vad(i); + energy_full(fi) = energy(i); + noise_energy_full(fi) = noise_energy(i); + has_data(fi) = true; + end + % Forward-fill gaps with last received values + for fi = 2:total_frames + if ~has_data(fi) + ceps_full(:, fi) = ceps_full(:, fi - 1); + energy_full(fi) = energy_full(fi - 1); + noise_energy_full(fi) = noise_energy_full(fi - 1); + end + end + ceps = ceps_full; + vad = vad_full; + energy = energy_full; + noise_energy = noise_energy_full; + frame_number = frame_number_full; +end + +t = (frame_number - first_frame) * hop; +n = 1:num_ceps; figure; surf(t, n, ceps, 'EdgeColor', 'none'); @@ -96,18 +125,18 @@ switch lower(ext) case '.raw' fh = fopen(fn, 'r'); - data = fread(fh, 'int16'); + data = fread(fh, 'int32'); fclose(fh); case 
'.wav' tmp = audioread(fn, 'native'); t = whos('tmp'); - if ~strcmp(t.class, 'int16') - error('Only 16-bit wav file format is supported'); + if ~strcmp(t.class, 'int32') + error('Expected 32-bit wav for int32 MFCC output format'); end s = size(tmp); num_channels = s(2); if num_channels > 1 - data = int16(zeros(prod(s), 1)); + data = int32(zeros(prod(s), 1)); for i = 1:num_channels data(i:num_channels:end) = tmp(:, i); end diff --git a/src/audio/mfcc/tune/decode_mel.m b/src/audio/mfcc/tune/decode_mel.m index 0b9b8d09c5a8..0aca1e35ec8d 100644 --- a/src/audio/mfcc/tune/decode_mel.m +++ b/src/audio/mfcc/tune/decode_mel.m @@ -1,10 +1,10 @@ -% [mel, t, n, vad, energy, noise_energy, frame_number] = decode_mel(fn, num_mel, fmt, num_channels) +% [mel, t, n, vad, energy, noise_energy, frame_number] = decode_mel(fn, num_mel, hop, num_channels) % % Input % fn - File with Mel data in .raw or .wav format +% hop - STFT hop in seconds, defaults to 10e-3 for 10 ms % num_mel - number of Mel coefficients per frame -% fmt - format of the Mel data ('s16', 's24', 's32') -% num_channels - needed for .raw format, omit for .wav, default 2 +% num_channels - needed for .raw format, omit for .wav, default 1 % % Outputs % mel - Mel coefficients @@ -19,57 +19,32 @@ % Copyright(c) 2026 Intel Corporation. function [mel, t, n, vad, energy, noise_energy, frame_number] = ... 
- decode_mel(fn, num_mel, fmt, num_channels) + decode_mel(fn, num_mel, hop, num_channels) if nargin < 3 - fmt = 's16'; + hop = 10e-3; end - if nargin < 4 - num_channels = 2; + num_channels = 1; end % MFCC stream fs = 16e3; +qformat = 23; % Q9.23 in int32 -switch fmt - case 's16' - qformat = 7; - magic = [25443 28006]; % ASCII 'mfcc' as two int16 - word_size_multiplier = 2; - case 's24' - qformat = 15; - magic = int32(1835426659); % 0x6D666363 as int32 - word_size_multiplier = 1; - case 's32' - qformat = 23; - magic = int32(1835426659); % 0x6D666363 as int32 - word_size_multiplier = 1; - otherwise - error("Use 's16', 's24', or 's32' as format."); -end - -num_magic = word_size_multiplier; % magic word is 2 x int16 or 1 x int32 -num_other_header = 5 * word_size_multiplier; % frame_number, reserved, energy, noise, vad +magic = int32(1835426659); % 0x6D666363 as int32 +num_magic = 1; % magic word is 1 x int32 +num_other_header = 5; % frame_number, reserved, energy, noise, vad (all int32) -% Load output data -[data, num_channels] = get_file(fn, num_channels, fmt); +% Load output data (always int32) +[data, num_channels] = get_file(fn, num_channels); -if strcmp(fmt, 's16') - idx1 = find(data == magic(1)); - idx = []; - for i = 1:length(idx1) - next_word = idx1(i) + 1; - if next_word <= length(data) - if data(next_word) == magic(2) - idx = [idx idx1(i)]; - end - end - end -else - idx = find(data == magic); +if isempty(data) + error('File %s is empty', fn); end +idx = find(data == magic); + if isempty(idx) error('No magic value markers found from stream'); end @@ -95,26 +70,50 @@ payload(:,i) = double(data(i1:i2)); end -if strcmp(fmt, 's16') - % Reassemble int32 from pairs of int16 (little-endian). - % Low half must be treated as unsigned with mod() to handle negative int16. 
- frame_number = mod(payload(1,:), 65536) + payload(2,:) * 65536; - % payload(3:4,:) is reserved, skip - energy = (mod(payload(5,:), 65536) + payload(6,:) * 65536) / 2^23; - noise_energy = (mod(payload(7,:), 65536) + payload(8,:) * 65536) / 2^23; - vad = mod(payload(9,:), 65536) + payload(10,:) * 65536; - mel = payload(11:payload_len, :) / 2^qformat; -else - frame_number = payload(1, :); - % payload(2,:) is reserved, skip - energy = payload(3, :) / 2^23; - noise_energy = payload(4, :) / 2^23; - vad = payload(5, :); - mel = payload(6:payload_len, :) / 2^qformat; +frame_number = payload(1, :); +% payload(2,:) is reserved, skip +energy = payload(3, :) / 2^23; +noise_energy = payload(4, :) / 2^23; +vad = payload(5, :); +mel = payload(6:payload_len, :) / 2^qformat; + +% Fill gaps from DTX-suppressed VAD=0 frames to create continuous timeline. +% Missing frames are filled with the minimum Mel value found in the data. +first_frame = frame_number(1); +last_frame = frame_number(end); +total_frames = last_frame - first_frame + 1; +if total_frames > num_frames + mel_fill = min(mel(:)); + mel_full = ones(num_mel, total_frames) * mel_fill; + vad_full = zeros(1, total_frames); + energy_full = zeros(1, total_frames); + noise_energy_full = zeros(1, total_frames); + frame_number_full = first_frame:last_frame; + has_data = false(1, total_frames); + for i = 1:num_frames + fi = frame_number(i) - first_frame + 1; + mel_full(:, fi) = mel(:, i); + vad_full(fi) = vad(i); + energy_full(fi) = energy(i); + noise_energy_full(fi) = noise_energy(i); + has_data(fi) = true; + end + % Forward-fill gaps with last received values + for fi = 2:total_frames + if ~has_data(fi) + mel_full(:, fi) = mel_full(:, fi - 1); + energy_full(fi) = energy_full(fi - 1); + noise_energy_full(fi) = noise_energy_full(fi - 1); + end + end + mel = mel_full; + vad = vad_full; + energy = energy_full; + noise_energy = noise_energy_full; + frame_number = frame_number_full; end -t_mel = period_mel / num_channels / fs; -t = 
(0:num_frames -1) * t_mel; +t = (frame_number - first_frame) * hop; n = 1:num_mel; figure @@ -145,36 +144,20 @@ end -function [data, num_channels] = get_file(fn, num_channels, fmt) +function [data, num_channels] = get_file(fn, num_channels) [~, ~, ext] = fileparts(fn); -switch fmt - case 's16' - read_fmt = 'int16'; - case {'s24', 's32'} - read_fmt = 'int32'; - otherwise - error("Use 's16', 's24', or 's32' as format."); -end - switch lower(ext) case '.raw' fh = fopen(fn, 'r'); - data = fread(fh, read_fmt); + data = fread(fh, 'int32'); fclose(fh); case '.wav' tmp = audioread(fn, 'native'); t = whos('tmp'); - switch fmt - case 's16' - if ~strcmp(t.class, 'int16') - error('Expected 16-bit wav for s16 format'); - end - case {'s24', 's32'} - if ~strcmp(t.class, 'int32') - error('Expected 32-bit wav for %s format', fmt); - end + if ~strcmp(t.class, 'int32') + error('Expected 32-bit wav for int32 MFCC output format'); end s = size(tmp); num_channels = s(2); diff --git a/src/audio/mfcc/tune/setup_mfcc.m b/src/audio/mfcc/tune/setup_mfcc.m index 3cda3221a4fc..dbf69587a74f 100644 --- a/src/audio/mfcc/tune/setup_mfcc.m +++ b/src/audio/mfcc/tune/setup_mfcc.m @@ -25,6 +25,32 @@ function setup_mfcc() setup.tplg_fn = 'mel80.conf'; export_mfcc_setup(gen_cfg, setup); + % Blob for mel spectrogram with compress PCM output + setup = get_mel_spectrogram_config(); + setup.compress_output = true; + setup.tplg_fn = 'mel80_compress.conf'; + export_mfcc_setup(gen_cfg, setup); + + % Blob for mel spectrogram with compress PCM output and DTX + setup = get_mel_spectrogram_config(); + setup.compress_output = true; + setup.enable_dtx = true; + setup.dtx_trailing_silence_hops = 20; + setup.dtx_silence_hops_interval = 500; + setup.tplg_fn = 'mel80_compress_dtx.conf'; + export_mfcc_setup(gen_cfg, setup); + + % Default MFCC (cepstral) with compress PCM output + setup = get_mfcc_default_config(); + setup.compress_output = true; + setup.enable_vad = true; + setup.enable_dtx = true; + 
setup.dtx_trailing_silence_hops = 20; + setup.dtx_silence_hops_interval = 500; + setup.update_controls = true; + setup.tplg_fn = 'ceps13_compress_dtx.conf'; + export_mfcc_setup(gen_cfg, setup); + end function cfg = get_mfcc_default_config() @@ -64,7 +90,10 @@ function setup_mfcc() cfg.dynamic_mmax = false; % same cfg.enable_vad = false; cfg.enable_dtx = false; + cfg.dtx_trailing_silence_hops = 0; + cfg.dtx_silence_hops_interval = 0; cfg.update_controls = false; + cfg.compress_output = false; end function cfg = get_mel_spectrogram_config() @@ -104,7 +133,10 @@ function setup_mfcc() cfg.dynamic_mmax = true; cfg.enable_vad = true; cfg.enable_dtx = false; + cfg.dtx_trailing_silence_hops = 0; + cfg.dtx_silence_hops_interval = 0; cfg.update_controls = true; + cfg.compress_output = false; end function export_mfcc_setup(gen_cfg, cfg) @@ -139,8 +171,10 @@ function export_mfcc_setup(gen_cfg, cfg) v = q_convert(cfg.mmax_init, 7); [b8, j] = add_w16b(v, b8, j); v = q_convert(cfg.mmax_coef, 15); [b8, j] = add_w16b(v, b8, j); +v = cfg.dtx_trailing_silence_hops; [b8, j] = add_w16b(v, b8, j); % DTX trailing silence hops +v = cfg.dtx_silence_hops_interval; [b8, j] = add_w16b(v, b8, j); % DTX silence frame interval % Reserved -for i = 1:6 +for i = 1:5 [b8, j] = add_w32b(0, b8, j); end @@ -181,8 +215,9 @@ function export_mfcc_setup(gen_cfg, cfg) v = cfg.enable_vad; [b8, j] = add_w8b(v, b8, j); % bool v = cfg.enable_dtx; [b8, j] = add_w8b(v, b8, j); % bool v = cfg.update_controls; [b8, j] = add_w8b(v, b8, j); % bool -% reserved_bool[5] -for i = 1:5 +v = cfg.compress_output; [b8, j] = add_w8b(v, b8, j); % bool +% reserved_bool[4] +for i = 1:4 [b8, j] = add_w8b(0, b8, j); end diff --git a/src/audio/mfcc/tune/sof_ceps_spectrogram_compress.py b/src/audio/mfcc/tune/sof_ceps_spectrogram_compress.py new file mode 100644 index 000000000000..3a61641c0812 --- /dev/null +++ b/src/audio/mfcc/tune/sof_ceps_spectrogram_compress.py @@ -0,0 +1,234 @@ +"""Live scrolling cepstral coefficient viewer for 
SOF compress PCM capture. + +Displays a real-time scrolling MFCC (cepstral coefficient) plot and VAD +strip from ALSA compress PCM capture (crecord) with embedded DSP VAD flag. + +Frame format: [magic(int32), frame_number(uint32), reserved(int32), + energy(int32), noise_energy(int32), vad_flag(int32), + ceps[0..N-1](int32)] + +Cepstral coefficients are in Q9.23 fixed-point format. + +Usage: + python sof_ceps_spectrogram_compress.py [--card 0] [--device 48] + python sof_ceps_spectrogram_compress.py --num-ceps 13 --width 300 +""" + +import argparse +import os +import queue +import struct +import subprocess +import threading +import numpy as np +import matplotlib +matplotlib.use('TkAgg') +import matplotlib.pyplot as plt + +# SOF compress frame format constants (with DSP data header) +SOF_MAGIC_BYTES = struct.pack(' 3: + del buf[:-3] + return None, None, None + end = idx + frame_bytes + if end > len(buf): + del buf[:idx] + return None, None, None + + frame_number = struct.unpack_from(' 3: + del buf[:-3] + return None, None, None + end = idx + SOF_FRAME_BYTES + if end > len(buf): + del buf[:idx] + return None, None, None + + frame_number = struct.unpack_from(' 3: + del buf[:-3] + return None, None, None + end = idx + SOF_FRAME_BYTES + if end > len(buf): + del buf[:idx] + return None, None, None + + # Parse header fields + frame_number = struct.unpack_from('> \"{text}\"\n", flush=True) + else: + print(" [Whisper] empty result", flush=True) + + def flush_speech(t_now): + """Flush speech buffer to Whisper.""" + nonlocal speech_buffer, silence_time, pending_queue, pending_t + if not speech_buffer: + silence_time = None + return + if not try_transcribe(transcriber, speech_buffer, t_now, + on_transcription): + pending_queue = list(speech_buffer) + pending_t = t_now + speech_buffer.clear() + silence_time = None + + try: + while True: + # Calculate queue timeout based on patience timer + get_timeout = 0.1 # default polling interval + if silence_time is not None: + remaining = 
patience - (time.monotonic() - silence_time) + get_timeout = max(remaining, 0.01) + + try: + item = frame_q.get(timeout=get_timeout) + except queue.Empty: + # Patience expired — flush speech to Whisper + if silence_time is not None: + elapsed = time.monotonic() - silence_time + if elapsed >= patience: + t = last_hop * SOF_HOP_SEC + flush_speech(t) + + # Drain pending queue when Whisper becomes free + if pending_queue is not None and not transcriber.is_busy(): + print(f" [{pending_t:7.2f}s] Whisper free, sending " + f"{len(pending_queue)} queued frames", flush=True) + transcriber.transcribe_async(pending_queue, on_transcription) + pending_queue = None + continue + + if item is None: + # Reader thread ended (crecord exited) + stderr_out = proc.stderr.read().decode(errors='replace') + rc = proc.wait() + print(f"\ncrecord exited with code {rc}") + if stderr_out: + print(f"stderr: {stderr_out}") + break + + frame_number, vad_flag, frame_ints = item + recv_frames += 1 + last_hop = frame_number + mel = decode_mel_frame(frame_ints) + speech = vad_flag != 0 + t = frame_number * SOF_HOP_SEC + + # Print VAD transitions + if speech != prev_speech: + tag = "SPEECH" if speech else "SILENCE" + print(f" [{t:7.2f}s] {tag} (hop {frame_number}, " + f"received {recv_frames})", flush=True) + prev_speech = speech + + # Drain pending queue when Whisper becomes free + if pending_queue is not None and not transcriber.is_busy(): + print(f" [{pending_t:7.2f}s] Whisper free, sending " + f"{len(pending_queue)} queued frames", flush=True) + transcriber.transcribe_async(pending_queue, on_transcription) + pending_queue = None + + # --- Speech buffering logic --- + if speech: + if len(speech_buffer) >= MAX_SPEECH_FRAMES: + n = len(speech_buffer) + duration = n * SOF_HOP_SEC + print(f" [{t:7.2f}s] Buffer full ({duration:.1f}s), " + f"forcing transcription", flush=True) + flush_speech(t) + + speech_buffer.append(mel.copy()) + silence_time = None # speech resumed, cancel patience timer + + else: + # 
VAD=0: start patience timer if we have buffered speech. + # Don't refresh if already running so trailing silence + # frames don't extend the wait. + if speech_buffer and silence_time is None: + silence_time = time.monotonic() + + except (KeyboardInterrupt, BrokenPipeError): + pass + finally: + # Flush remaining speech + if speech_buffer: + t = last_hop * SOF_HOP_SEC + flush_speech(t) + if proc.poll() is None: + proc.terminate() + try: + proc.wait(timeout=3) + except subprocess.TimeoutExpired: + proc.kill() + proc.wait() + print(f"\n\nCapture stopped. Received {recv_frames} frames.") + + +def main(): + parser = argparse.ArgumentParser( + description="Live SOF mel capture from compress PCM with DTX-aware " + "Whisper transcription") + parser.add_argument('--card', '-c', type=int, default=0, + help='ALSA card number (default: 0)') + parser.add_argument('--device', '-d', type=int, default=54, + help='ALSA compress device number (default: 54)') + parser.add_argument('--model', '-m', default='whisper-medium-int4-ov', + help='Path to Whisper OpenVINO model directory') + parser.add_argument('--encoder-device', default='NPU', + help='OpenVINO device for encoder (default: NPU)') + parser.add_argument('--decoder-device', default='CPU', + help='OpenVINO device for decoder (default: CPU)') + parser.add_argument('--patience', type=float, default=SILENCE_PATIENCE_S, + help=f'Seconds of silence patience before triggering ' + f'transcription (default: {SILENCE_PATIENCE_S})') + args = parser.parse_args() + + model_id = "OpenVINO/" + os.path.basename(args.model) + if not os.path.isdir(args.model): + print(f"Downloading model {model_id} ...") + hf_hub.snapshot_download(model_id, local_dir=args.model) + + print("=== Live SOF Mel → Whisper Transcription (Compress PCM, DTX) ===\n") + run_capture(args.card, args.device, args.model, args.encoder_device, + args.decoder_device, patience=args.patience) + + +if __name__ == '__main__': + main() diff --git 
a/src/audio/mfcc/tune/sof_mel_to_text_live_dsp_vad.py b/src/audio/mfcc/tune/sof_mel_to_text_live_dsp_vad.py index 5c267a57ca79..9171df2e3cec 100644 --- a/src/audio/mfcc/tune/sof_mel_to_text_live_dsp_vad.py +++ b/src/audio/mfcc/tune/sof_mel_to_text_live_dsp_vad.py @@ -8,7 +8,6 @@ Usage: python sof_mel_to_text_live_dsp_vad.py [--device hw:0,47] [--model whisper-medium-int4-ov] - python sof_mel_to_text_live_dsp_vad.py --plot # with live spectrogram """ import argparse @@ -22,10 +21,6 @@ import huggingface_hub as hf_hub from pathlib import Path -# Graphics imports deferred until --plot is used -matplotlib = None -plt = None - # SOF mel_s32.raw format constants (with DSP data header) SOF_MAGIC_BYTES = struct.pack('= MAX_SPEECH_FRAMES: @@ -429,11 +353,6 @@ def on_transcription(text): except subprocess.TimeoutExpired: proc.kill() proc.wait() - if plotter is not None: - try: - plt.close(plotter.fig) - except Exception: - pass print("\n\nCapture stopped.") @@ -450,8 +369,6 @@ def main(): help='OpenVINO device for encoder (default: NPU)') parser.add_argument('--decoder-device', default='CPU', help='OpenVINO device for decoder (default: CPU)') - parser.add_argument('--plot', action='store_true', - help='Show live scrolling mel spectrogram and VAD plot') args = parser.parse_args() model_id = "OpenVINO/" + os.path.basename(args.model) if not os.path.isdir(args.model): @@ -460,7 +377,7 @@ def main(): print("=== Live SOF Mel → Whisper Transcription (DSP VAD) ===\n") run_capture(args.device, args.rate, args.model, args.encoder_device, - args.decoder_device, enable_plot=args.plot) + args.decoder_device) if __name__ == '__main__': diff --git a/tools/topology/topology2/include/components/mfcc/ceps13_compress_dtx.conf b/tools/topology/topology2/include/components/mfcc/ceps13_compress_dtx.conf new file mode 100644 index 000000000000..7056b9e7cb4b --- /dev/null +++ b/tools/topology/topology2/include/components/mfcc/ceps13_compress_dtx.conf @@ -0,0 +1,24 @@ +# Exported MFCC configuration 
26-May-2026 +# cd src/audio/mfcc/tune; octave setup_mfcc.m +Object.Base.data."mfcc_config" { + bytes " + 0x53,0x4f,0x46,0x34,0x00,0x00,0x00,0x00, + 0x74,0x00,0x00,0x00,0x01,0xd0,0x01,0x03, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x74,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x14,0x00,0xf4,0x01, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x80,0x3e,0x00,0x00, + 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x02,0x00,0x00,0x00,0x01,0x00,0x00,0x00, + 0xc3,0x35,0x00,0x2c,0x00,0x00,0x00,0x00, + 0x90,0x01,0xa0,0x00,0x00,0x00,0x14,0x00, + 0x0d,0x00,0x17,0x00,0x00,0x00,0x00,0x64, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x01,0x01,0x01, + 0x01,0x00,0x00,0x00,0x01,0x01,0x01,0x01, + 0x00,0x00,0x00,0x00" +} diff --git a/tools/topology/topology2/include/components/mfcc/default.conf b/tools/topology/topology2/include/components/mfcc/default.conf index 3bbd72696806..0ac19fa71d04 100644 --- a/tools/topology/topology2/include/components/mfcc/default.conf +++ b/tools/topology/topology2/include/components/mfcc/default.conf @@ -1,4 +1,4 @@ -# Exported MFCC configuration 19-May-2026 +# Exported MFCC configuration 26-May-2026 # cd src/audio/mfcc/tune; octave setup_mfcc.m Object.Base.data."mfcc_config" { bytes " diff --git a/tools/topology/topology2/include/components/mfcc/mel80.conf b/tools/topology/topology2/include/components/mfcc/mel80.conf index 480725c2d24f..b18baadd459b 100644 --- a/tools/topology/topology2/include/components/mfcc/mel80.conf +++ b/tools/topology/topology2/include/components/mfcc/mel80.conf @@ -1,4 +1,4 @@ -# Exported MFCC configuration 19-May-2026 +# Exported MFCC configuration 26-May-2026 # cd src/audio/mfcc/tune; octave setup_mfcc.m Object.Base.data."mfcc_config" { bytes " diff --git a/tools/topology/topology2/include/components/mfcc/mel80_compress.conf 
b/tools/topology/topology2/include/components/mfcc/mel80_compress.conf new file mode 100644 index 000000000000..f26f2af6980c --- /dev/null +++ b/tools/topology/topology2/include/components/mfcc/mel80_compress.conf @@ -0,0 +1,24 @@ +# Exported MFCC configuration 26-May-2026 +# cd src/audio/mfcc/tune; octave setup_mfcc.m +Object.Base.data."mfcc_config" { + bytes " + 0x53,0x4f,0x46,0x34,0x00,0x00,0x00,0x00, + 0x74,0x00,0x00,0x00,0x01,0xd0,0x01,0x03, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x74,0x00,0x00,0x00,0x00,0x02,0x00,0x04, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x80,0x3e,0x00,0x00, + 0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00, + 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x03,0x00,0x00,0x00,0x01,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x90,0x01,0xa0,0x00,0x40,0x1f,0x00,0x00, + 0x00,0x00,0x50,0x00,0x00,0x00,0x00,0x04, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01, + 0x01,0x00,0x00,0x01,0x01,0x00,0x01,0x01, + 0x00,0x00,0x00,0x00" +} diff --git a/tools/topology/topology2/include/components/mfcc/mel80_compress_dtx.conf b/tools/topology/topology2/include/components/mfcc/mel80_compress_dtx.conf new file mode 100644 index 000000000000..d225811ca4d1 --- /dev/null +++ b/tools/topology/topology2/include/components/mfcc/mel80_compress_dtx.conf @@ -0,0 +1,24 @@ +# Exported MFCC configuration 26-May-2026 +# cd src/audio/mfcc/tune; octave setup_mfcc.m +Object.Base.data."mfcc_config" { + bytes " + 0x53,0x4f,0x46,0x34,0x00,0x00,0x00,0x00, + 0x74,0x00,0x00,0x00,0x01,0xd0,0x01,0x03, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x74,0x00,0x00,0x00,0x00,0x02,0x00,0x04, + 0x00,0x00,0x00,0x00,0x14,0x00,0xf4,0x01, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 
0x00,0x00,0x00,0x00,0x80,0x3e,0x00,0x00, + 0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00, + 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x03,0x00,0x00,0x00,0x01,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x90,0x01,0xa0,0x00,0x40,0x1f,0x00,0x00, + 0x00,0x00,0x50,0x00,0x00,0x00,0x00,0x04, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01, + 0x01,0x00,0x00,0x01,0x01,0x01,0x01,0x01, + 0x00,0x00,0x00,0x00" +} From 71404cecc1477782b2cc1210543014cb177a7896 Mon Sep 17 00:00:00 2001 From: Seppo Ingalsuo Date: Tue, 26 May 2026 15:44:45 +0300 Subject: [PATCH 8/8] tools: topology: add MFCC compress capture for jack and DMIC Add sdw-jack-audio-feature-compress.conf (PCM 53, pipeline 132) and sdw-dmic-audio-feature-compress.conf (PCM 54, pipeline 133) for compress MFCC capture with DTX blobs. Fix buffer sizes: set MFCC obs and host-copier ibs/obs to 344 bytes (24-byte header + 80 x int32). Add mel and ceps compress topology targets for MTL and ARL. Rename normal MFCC topologies to *-mfcc-mel-normal for clarity. 
Signed-off-by: Seppo Ingalsuo --- tools/topology/topology2/cavs-sdw.conf | 8 +++ .../topology2/development/tplg-targets.cmake | 26 ++++++- .../include/common/common_definitions.conf | 2 + .../topology2/include/components/mfcc.conf | 1 - .../cavs/host-gateway-src-mfcc-capture.conf | 12 ++++ .../sdw-dmic-audio-feature-compress.conf | 71 +++++++++++++++++++ .../intel/sdw-dmic-audio-feature.conf | 3 + .../sdw-jack-audio-feature-compress.conf | 71 +++++++++++++++++++ .../intel/sdw-jack-audio-feature.conf | 3 + 9 files changed, 194 insertions(+), 3 deletions(-) create mode 100644 tools/topology/topology2/platform/intel/sdw-dmic-audio-feature-compress.conf create mode 100644 tools/topology/topology2/platform/intel/sdw-jack-audio-feature-compress.conf diff --git a/tools/topology/topology2/cavs-sdw.conf b/tools/topology/topology2/cavs-sdw.conf index 6932543c06e5..0f597ded3793 100644 --- a/tools/topology/topology2/cavs-sdw.conf +++ b/tools/topology/topology2/cavs-sdw.conf @@ -254,6 +254,14 @@ IncludeByKey.SDW_JACK_AUDIO_FEATURE_CAPTURE { "true" "platform/intel/sdw-jack-audio-feature.conf" } +IncludeByKey.SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE { + "true" "platform/intel/sdw-jack-audio-feature-compress.conf" +} + IncludeByKey.SDW_DMIC_AUDIO_FEATURE_CAPTURE { "true" "platform/intel/sdw-dmic-audio-feature.conf" } + +IncludeByKey.SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE { + "true" "platform/intel/sdw-dmic-audio-feature-compress.conf" +} diff --git a/tools/topology/topology2/development/tplg-targets.cmake b/tools/topology/topology2/development/tplg-targets.cmake index a906852d04f0..155176c16347 100644 --- a/tools/topology/topology2/development/tplg-targets.cmake +++ b/tools/topology/topology2/development/tplg-targets.cmake @@ -479,11 +479,33 @@ SDW_AMP_FEEDBACK=false,SDW_SPK_STREAM=Playback-SmartAmp,SDW_DMIC_STREAM=Capture- SDW_JACK_OUT_STREAM=Playback-SimpleJack,SDW_JACK_IN_STREAM=Capture-SimpleJack,COMPRESSED=true" # Soundwire topologies with MFCC audio features capture 
-"cavs-sdw\;sof-mtl-rt713-l0-rt1316-l12-mfcc\;PLATFORM=mtl,NUM_SDW_AMP_LINKS=2,\ +"cavs-sdw\;sof-mtl-rt713-l0-rt1316-l12-mfcc-mel-normal\;PLATFORM=mtl,NUM_SDW_AMP_LINKS=2,\ HDMI1_ID=4,HDMI2_ID=5,HDMI3_ID=6,SDW_JACK_AUDIO_FEATURE_CAPTURE=true" -"cavs-sdw\;sof-arl-cs42l43-l0-cs35l56-l23-mfcc\;PLATFORM=mtl,NUM_SDW_AMP_LINKS=2,SDW_DMIC=1,\ +"cavs-sdw\;sof-arl-cs42l43-l0-cs35l56-l23-mfcc-mel-normal\;PLATFORM=mtl,NUM_SDW_AMP_LINKS=2,SDW_DMIC=1,\ SDW_AMP_FEEDBACK=false,SDW_SPK_STREAM=Playback-SmartAmp,SDW_DMIC_STREAM=Capture-SmartMic,\ SDW_JACK_OUT_STREAM=Playback-SimpleJack,SDW_JACK_IN_STREAM=Capture-SimpleJack,\ SDW_JACK_AUDIO_FEATURE_CAPTURE=true,SDW_DMIC_AUDIO_FEATURE_CAPTURE=true" + +# Soundwire topologies with compress MFCC mel audio features capture +"cavs-sdw\;sof-mtl-rt713-l0-rt1316-l12-mfcc-mel-compr\;PLATFORM=mtl,NUM_SDW_AMP_LINKS=2,\ +HDMI1_ID=4,HDMI2_ID=5,HDMI3_ID=6,SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE=true,\ +MFCC_FRAME_BYTES=344,MFCC_BLOB=mel" + +# Soundwire topologies with compress MFCC cepstral audio features capture +"cavs-sdw\;sof-mtl-rt713-l0-rt1316-l12-mfcc-ceps-compr\;PLATFORM=mtl,NUM_SDW_AMP_LINKS=2,\ +HDMI1_ID=4,HDMI2_ID=5,HDMI3_ID=6,SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE=true,\ +MFCC_FRAME_BYTES=76,MFCC_BLOB=ceps" + +"cavs-sdw\;sof-arl-cs42l43-l0-cs35l56-l23-mfcc-mel-compr\;PLATFORM=mtl,NUM_SDW_AMP_LINKS=2,SDW_DMIC=1,\ +SDW_AMP_FEEDBACK=false,SDW_SPK_STREAM=Playback-SmartAmp,SDW_DMIC_STREAM=Capture-SmartMic,\ +SDW_JACK_OUT_STREAM=Playback-SimpleJack,SDW_JACK_IN_STREAM=Capture-SimpleJack,\ +SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE=true,SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE=true,\ +MFCC_FRAME_BYTES=344,MFCC_BLOB=mel" + +"cavs-sdw\;sof-arl-cs42l43-l0-cs35l56-l23-mfcc-ceps-compr\;PLATFORM=mtl,NUM_SDW_AMP_LINKS=2,SDW_DMIC=1,\ +SDW_AMP_FEEDBACK=false,SDW_SPK_STREAM=Playback-SmartAmp,SDW_DMIC_STREAM=Capture-SmartMic,\ +SDW_JACK_OUT_STREAM=Playback-SimpleJack,SDW_JACK_IN_STREAM=Capture-SimpleJack,\ 
+SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE=true,SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE=true,\ +MFCC_FRAME_BYTES=76,MFCC_BLOB=ceps" ) diff --git a/tools/topology/topology2/include/common/common_definitions.conf b/tools/topology/topology2/include/common/common_definitions.conf index 87c69dd41e41..06f0f425c5e2 100644 --- a/tools/topology/topology2/include/common/common_definitions.conf +++ b/tools/topology/topology2/include/common/common_definitions.conf @@ -72,5 +72,7 @@ Define { SDW_JACK_ECHO_REF false # No echo reference for 3.5mm jack SDW_SPK_ECHO_REF false # No echo reference for speaker SDW_JACK_AUDIO_FEATURE_CAPTURE false # No audio features capture for jack + SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE false # No compress audio features capture for jack SDW_DMIC_AUDIO_FEATURE_CAPTURE false # No audio features capture for microphone + SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE false # No compress audio features capture for microphone } diff --git a/tools/topology/topology2/include/components/mfcc.conf b/tools/topology/topology2/include/components/mfcc.conf index 775f0d79b1f5..bf908e685048 100644 --- a/tools/topology/topology2/include/components/mfcc.conf +++ b/tools/topology/topology2/include/components/mfcc.conf @@ -47,7 +47,6 @@ Class.Widget."mfcc" { !immutable [ "uuid" - "type" ] !deprecated [ "preload_count" diff --git a/tools/topology/topology2/include/pipelines/cavs/host-gateway-src-mfcc-capture.conf b/tools/topology/topology2/include/pipelines/cavs/host-gateway-src-mfcc-capture.conf index 793f71b883ab..fe6249018ef1 100644 --- a/tools/topology/topology2/include/pipelines/cavs/host-gateway-src-mfcc-capture.conf +++ b/tools/topology/topology2/include/pipelines/cavs/host-gateway-src-mfcc-capture.conf @@ -22,6 +22,12 @@ +Define { + # Default MFCC output frame size (header + coefficients). + # Can be overridden by feature/platform includes or CMake variable overrides. 
+ MFCC_FRAME_BYTES 344 +} + Class.Pipeline."host-gateway-src-mfcc-capture" { @@ -85,6 +91,9 @@ Class.Pipeline."host-gateway-src-mfcc-capture" { out_bit_depth 32 out_valid_bit_depth 32 out_rate 16000 + # MFCC output frame: header + coefficients. + # Size set by MFCC_FRAME_BYTES Define. + obs $MFCC_FRAME_BYTES } ] } @@ -101,6 +110,8 @@ Class.Pipeline."host-gateway-src-mfcc-capture" { in_bit_depth 32 in_valid_bit_depth 32 in_rate 16000 + # Match MFCC output frame size + ibs $MFCC_FRAME_BYTES } ] Object.Base.output_audio_format [ @@ -108,6 +119,7 @@ Class.Pipeline."host-gateway-src-mfcc-capture" { out_bit_depth 32 out_valid_bit_depth 32 out_rate 16000 + obs $MFCC_FRAME_BYTES } ] } diff --git a/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature-compress.conf b/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature-compress.conf new file mode 100644 index 000000000000..9e307043830b --- /dev/null +++ b/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature-compress.conf @@ -0,0 +1,71 @@ +Define { + SDW_DMIC_MODULE_COPIER_ID 41 + SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PCM_NAME "Microphone Compress Audio Features" + SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PCM_ID 54 + SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_STREAM_NAME "Microphone Compress Audio Features Stream" + SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PIPELINE_ID 133 + # MFCC compress output frame size in bytes (24-byte header + coefficients): + # Mel-only (80 bins): 24 + 80*4 = 344 + # Cepstral (13 ceps): 24 + 13*4 = 76 + MFCC_FRAME_BYTES 344 + # MFCC config blob: mel or ceps + MFCC_BLOB mel +} + +Object.Pipeline.host-gateway-src-mfcc-capture [ + { + index $SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PIPELINE_ID + + Object.Widget.host-copier.1 { + stream_name "$SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_STREAM_NAME" + pcm_id $SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PCM_ID + } + + Object.Widget.mfcc.1 { + type "encoder" + Object.Control { + bytes."1" { + name "$SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC bytes" + 
IncludeByKey.MFCC_BLOB { + "mel" "include/components/mfcc/mel80_compress_dtx.conf" + "ceps" "include/components/mfcc/ceps13_compress_dtx.conf" + } + } + mixer."1" { + name "$SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC VAD" + } + } + } + } +] +Object.Base.route [ + { + source "module-copier.$SDW_DMIC_MODULE_COPIER_ID.0" + sink "src.$SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PIPELINE_ID.1" + } + { + source "mfcc.$SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PIPELINE_ID.1" + sink "host-copier.$SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PCM_ID.capture" + } +] + +Object.PCM.pcm [ + { + name "$SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PCM_NAME" + id $SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PCM_ID + direction "capture" + compress "true" + + Object.Base.fe_dai.1 { + name "$SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PCM_NAME" + } + + Object.PCM.pcm_caps.1 { + name "$SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_STREAM_NAME" + formats 'S32_LE' + rates '16000' + channels_min 2 + channels_max 2 + } + } +] diff --git a/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature.conf b/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature.conf index 623574db1784..7d39c11772c1 100644 --- a/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature.conf +++ b/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature.conf @@ -4,6 +4,9 @@ Define { SDW_DMIC_AUDIO_FEATURE_CAPTURE_PCM_ID 48 SDW_DMIC_AUDIO_FEATURE_CAPTURE_STREAM_NAME "Microphone Audio Features Stream" SDW_DMIC_AUDIO_FEATURE_CAPTURE_PIPELINE_ID 131 + # MFCC output frame size in bytes (24-byte header + coefficients): + # Mel-only (80 bins): 24 + 80*4 = 344 + MFCC_FRAME_BYTES 344 } Object.Pipeline.host-gateway-src-mfcc-capture [ diff --git a/tools/topology/topology2/platform/intel/sdw-jack-audio-feature-compress.conf b/tools/topology/topology2/platform/intel/sdw-jack-audio-feature-compress.conf new file mode 100644 index 000000000000..286af8be0323 --- /dev/null +++ 
b/tools/topology/topology2/platform/intel/sdw-jack-audio-feature-compress.conf @@ -0,0 +1,71 @@ +Define { + SDW_JACK_MODULE_COPIER_ID 11 + SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PCM_NAME "Jack In Compress Audio Features" + SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PCM_ID 53 + SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_STREAM_NAME "Jack In Compress Audio Features Stream" + SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PIPELINE_ID 132 + # MFCC compress output frame size in bytes (24-byte header + coefficients): + # Mel-only (80 bins): 24 + 80*4 = 344 + # Cepstral (13 ceps): 24 + 13*4 = 76 + MFCC_FRAME_BYTES 344 + # MFCC config blob: mel or ceps + MFCC_BLOB mel +} + +Object.Pipeline.host-gateway-src-mfcc-capture [ + { + index $SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PIPELINE_ID + + Object.Widget.host-copier.1 { + stream_name "$SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_STREAM_NAME" + pcm_id $SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PCM_ID + } + + Object.Widget.mfcc.1 { + type "encoder" + Object.Control { + bytes."1" { + name "$SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC bytes" + IncludeByKey.MFCC_BLOB { + "mel" "include/components/mfcc/mel80_compress_dtx.conf" + "ceps" "include/components/mfcc/ceps13_compress_dtx.conf" + } + } + mixer."1" { + name "$SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC VAD" + } + } + } + } +] +Object.Base.route [ + { + source "module-copier.$SDW_JACK_MODULE_COPIER_ID.0" + sink "src.$SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PIPELINE_ID.1" + } + { + source "mfcc.$SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PIPELINE_ID.1" + sink "host-copier.$SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PCM_ID.capture" + } +] + +Object.PCM.pcm [ + { + name "$SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PCM_NAME" + id $SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PCM_ID + direction "capture" + compress "true" + + Object.Base.fe_dai.1 { + name "$SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PCM_NAME" + } + + Object.PCM.pcm_caps.1 { + name "$SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_STREAM_NAME" + formats 'S32_LE' + rates '16000' + channels_min $SDW_JACK_CAPTURE_CH + 
channels_max $SDW_JACK_CAPTURE_CH + } + } +] diff --git a/tools/topology/topology2/platform/intel/sdw-jack-audio-feature.conf b/tools/topology/topology2/platform/intel/sdw-jack-audio-feature.conf index e9652f49c17f..a0a44eae4d87 100644 --- a/tools/topology/topology2/platform/intel/sdw-jack-audio-feature.conf +++ b/tools/topology/topology2/platform/intel/sdw-jack-audio-feature.conf @@ -4,6 +4,9 @@ Define { SDW_JACK_AUDIO_FEATURE_CAPTURE_PCM_ID 47 SDW_JACK_AUDIO_FEATURE_CAPTURE_STREAM_NAME "Jack In Audio Features Stream" SDW_JACK_AUDIO_FEATURE_CAPTURE_PIPELINE_ID 130 + # MFCC output frame size in bytes (24-byte header + coefficients): + # Mel-only (80 bins): 24 + 80*4 = 344 + MFCC_FRAME_BYTES 344 } Object.Pipeline.host-gateway-src-mfcc-capture [