Source code for mltk.core.preprocess.audio.audio_feature_generator.audio_feature_generator

import importlib
import numpy as np

from .audio_feature_generator_settings import AudioFeatureGeneratorSettings


class AudioFeatureGenerator:
    """Converts raw audio into a spectrogram (gray-scale 2D image)

    **Example Usage**

    .. highlight:: python
    .. code-block:: python

        import numpy as np
        from mltk.core.preprocess.audio.audio_feature_generator import AudioFeatureGeneratorSettings
        from mltk.core.preprocess.utils import audio as audio_utils

        # Define the settings used to convert the audio into a spectrogram
        frontend_settings = AudioFeatureGeneratorSettings()
        frontend_settings.sample_rate_hz = 16000
        frontend_settings.sample_length_ms = 1200
        frontend_settings.window_size_ms = 30
        frontend_settings.window_step_ms = 10
        frontend_settings.filterbank_n_channels = 108
        frontend_settings.filterbank_upper_band_limit = 7500.0
        frontend_settings.filterbank_lower_band_limit = 125.0
        frontend_settings.noise_reduction_enable = True
        frontend_settings.noise_reduction_smoothing_bits = 10
        frontend_settings.noise_reduction_even_smoothing = 0.025
        frontend_settings.noise_reduction_odd_smoothing = 0.06
        frontend_settings.noise_reduction_min_signal_remaining = 0.40
        frontend_settings.quantize_dynamic_scale_enable = True # Enable dynamic quantization
        frontend_settings.quantize_dynamic_scale_range_db = 40.0

        # Read the raw audio file
        sample, original_sample_rate = audio_utils.read_audio_file(
            'my_audio.wav',
            return_numpy=True,
            return_sample_rate=True
        )

        # Clip/pad the audio so that its length matches the values configured in "frontend_settings"
        out_length = int((original_sample_rate * frontend_settings.sample_length_ms) / 1000)
        sample = audio_utils.adjust_length(
            sample,
            out_length=out_length,
            trim_threshold_db=30,
            offset=np.random.uniform(0, 1)
        )

        # Convert the sample rate (if necessary)
        if original_sample_rate != frontend_settings.sample_rate_hz:
            sample = audio_utils.resample(
                sample,
                orig_sr=original_sample_rate,
                target_sr=frontend_settings.sample_rate_hz
            )

        # Generate a spectrogram from the audio sample
        #
        # NOTE: audio_utils.apply_frontend() is a helper function.
        # Internally, it converts from float32 to int16 (audio_utils.read_audio_file() returns float32)
        # then calls the AudioFeatureGenerator, e.g.:
        #   sample = sample * 32768
        #   sample = sample.astype(np.int16)
        #   sample = np.squeeze(sample, axis=-1)
        #   frontend = AudioFeatureGenerator(frontend_settings)
        #   spectrogram = frontend.process_sample(sample, dtype=np.int8)
        spectrogram = audio_utils.apply_frontend(
            sample=sample,
            settings=frontend_settings,
            dtype=np.int8
        )

    .. seealso::

       - `AudioFeatureGenerator documentation <https://siliconlabs.github.io/mltk/docs/audio/audio_feature_generator.html>`_
       - `AudioFeatureGenerator Python Wrapper <https://siliconlabs.github.io/mltk/docs/cpp_development/wrappers/audio_feature_generator_wrapper.html>`_
       - `Microfrontend implementation <https://github.com/siliconlabs/mltk/tree/master/cpp/shared/microfrontend>`_
       - `ParallelAudioDataGenerator API docs <https://siliconlabs.github.io/mltk/docs/python_api/data_preprocessing/audio_data_generator.html>`_
    """

    def __init__(self, settings: AudioFeatureGeneratorSettings):
        """
        Args:
            settings: The settings to use for processing the audio sample
        """
        try:
            wrapper_module = importlib.import_module('mltk.core.preprocess.audio.audio_feature_generator._audio_feature_generator_wrapper')
        except (ImportError, ModuleNotFoundError) as e:
            raise ImportError(
                f'Failed to import the AudioFeatureGenerator wrapper C++ shared library, err: {e}\n'
                'This likely means you need to re-build the AudioFeatureGenerator wrapper package\n\n'
            ) from e

        self._spectrogram_shape = settings.spectrogram_shape
        self._wrapper = wrapper_module.AudioFeatureGeneratorWrapper(settings)

    def process_sample(self, sample: np.ndarray, dtype=np.float32) -> np.ndarray:
        """Convert the provided 1D audio sample to a 2D spectrogram using the AudioFeatureGenerator

        The generated 2D spectrogram dimensions are calculated as follows::

            sample_length = len(sample) = int(sample_length_ms * sample_rate_hz / 1000)
            window_size_length = int(window_size_ms * sample_rate_hz / 1000)
            window_step_length = int(window_step_ms * sample_rate_hz / 1000)
            height = n_features = (sample_length - window_size_length) // window_step_length + 1
            width = n_channels = AudioFeatureGeneratorSettings.filterbank_n_channels

        The dtype argument specifies the data type of the returned spectrogram.
        This must be one of the following:

        * **uint16**: This is the raw value generated by the internal AudioFeatureGenerator library
        * **float32**: This is the uint16 value directly cast to a float32
        * **int8**: This is the int8 value generated by the TFLM "micro features" library.
          Refer to the following for the magic that happens here:
          `micro_features_generator.cc#L84 <https://github.com/tensorflow/tflite-micro/blob/main/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_features_generator.cc#L84>`_

        Args:
            sample: [sample_length] int16 audio sample
            dtype: Output data type, must be int8, uint16, or float32

        Returns:
            [n_features, n_channels] int8, uint16, or float32 spectrogram
        """
        spectrogram = np.zeros(self._spectrogram_shape, dtype=dtype)
        self._wrapper.process_sample(sample, spectrogram)
        return spectrogram
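
    # Worked example of the shape formula above, a sketch assuming the settings
    # used in the class docstring example (16 kHz sample rate, 1200 ms sample,
    # 30 ms window, 10 ms step, 108 filterbank channels):
    #
    #   sample_length      = int(1200 * 16000 / 1000) = 19200
    #   window_size_length = int(  30 * 16000 / 1000) =   480
    #   window_step_length = int(  10 * 16000 / 1000) =   160
    #   height = (19200 - 480) // 160 + 1 = 118
    #   width  = filterbank_n_channels    = 108
    #
    # i.e. process_sample() would return a [118, 108] spectrogram for that configuration.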

    def activity_was_detected(self) -> bool:
        """Return whether activity was detected in the previously processed sample"""
        return self._wrapper.activity_was_detected()
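

# Minimal usage sketch: drives AudioFeatureGenerator with a synthetic, silent
# int16 buffer in place of real audio (see the class docstring for the full
# file-reading/resampling pipeline). The settings values are illustrative, and
# running this requires the C++ wrapper shared library to be built.
if __name__ == '__main__':
    settings = AudioFeatureGeneratorSettings()
    settings.sample_rate_hz = 16000
    settings.sample_length_ms = 1200

    # A silent int16 buffer matching the configured sample length
    sample_length = int(settings.sample_length_ms * settings.sample_rate_hz / 1000)
    sample = np.zeros((sample_length,), dtype=np.int16)

    frontend = AudioFeatureGenerator(settings)
    spectrogram = frontend.process_sample(sample, dtype=np.int8)
    print('Spectrogram shape:', spectrogram.shape)   # (n_features, filterbank_n_channels)
    print('Activity detected:', frontend.activity_was_detected())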