# Source code for mltk.core.preprocess.audio.audio_feature_generator.audio_feature_generator
import importlib
import numpy as np
from .audio_feature_generator_settings import AudioFeatureGeneratorSettings
class AudioFeatureGenerator:
"""Converts raw audio into a spectrogram (gray-scale 2D image)
**Example Usage**
.. highlight:: python
.. code-block:: python
import numpy as np
from mltk.core.preprocess.audio.audio_feature_generator import AudioFeatureGeneratorSettings
from mltk.core.preprocess.utils import audio as audio_utils
# Define the settings used to convert the audio into a spectrogram
frontend_settings = AudioFeatureGeneratorSettings()
frontend_settings.sample_rate_hz = 16000
frontend_settings.sample_length_ms = 1200
frontend_settings.window_size_ms = 30
frontend_settings.window_step_ms = 10
frontend_settings.filterbank_n_channels = 108
frontend_settings.filterbank_upper_band_limit = 7500.0
frontend_settings.filterbank_lower_band_limit = 125.0
frontend_settings.noise_reduction_enable = True
frontend_settings.noise_reduction_smoothing_bits = 10
frontend_settings.noise_reduction_even_smoothing = 0.025
frontend_settings.noise_reduction_odd_smoothing = 0.06
frontend_settings.noise_reduction_min_signal_remaining = 0.40
frontend_settings.quantize_dynamic_scale_enable = True # Enable dynamic quantization
frontend_settings.quantize_dynamic_scale_range_db = 40.0
# Read the raw audio file
sample, original_sample_rate = audio_utils.read_audio_file(
'my_audio.wav',
return_numpy=True,
return_sample_rate=True
)
# Clip/pad the audio so that its length matches the values configured in "frontend_settings"
out_length = int((original_sample_rate * frontend_settings.sample_length_ms) / 1000)
sample = audio_utils.adjust_length(
sample,
out_length=out_length,
trim_threshold_db=30,
offset=np.random.uniform(0, 1)
)
# Convert the sample rate (if necessary)
if original_sample_rate != frontend_settings.sample_rate_hz:
sample = audio_utils.resample(
sample,
orig_sr=original_sample_rate,
target_sr=frontend_settings.sample_rate_hz
)
# Generate a spectrogram from the audio sample
#
# NOTE: audio_utils.apply_frontend() is a helper function.
# Internally, it converts from float32 to int16 (audio_utils.read_audio_file() returns float32)
# then calls the AudioFeatureGenerator, e.g.:
# sample = sample * 32768
# sample = sample.astype(np.int16)
# sample = np.squeeze(sample, axis=-1)
# frontend = AudioFeatureGenerator(frontend_settings)
# spectrogram = frontend.process_sample(sample, dtype=np.int8)
spectrogram = audio_utils.apply_frontend(
sample=sample,
settings=frontend_settings,
dtype=np.int8
)
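# `spectrogram` is now a [n_features, n_channels] numpy array with dtype int8.
# For the settings above this works out to [118, 108]; see
# process_sample() for how the shape is calculated.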
.. seealso::
- `AudioFeatureGenerator documentation <https://siliconlabs.github.io/mltk/docs/audio/audio_feature_generator.html>`_
- `AudioFeatureGenerator Python Wrapper <https://siliconlabs.github.io/mltk/docs/cpp_development/wrappers/audio_feature_generator_wrapper.html>`_
- `Microfrontend implementation <https://github.com/siliconlabs/mltk/tree/master/cpp/shared/microfrontend>`_
- `ParallelAudioDataGenerator API docs <https://siliconlabs.github.io/mltk/docs/python_api/data_preprocessing/audio_data_generator.html>`_
"""
def __init__(self, settings: AudioFeatureGeneratorSettings):
"""
Args:
settings: The settings to use for processing the audio sample
"""
try:
wrapper_module = importlib.import_module('mltk.core.preprocess.audio.audio_feature_generator._audio_feature_generator_wrapper')
except (ImportError, ModuleNotFoundError) as e:
raise ImportError(f'Failed to import the AudioFeatureGenerator wrapper C++ shared library, err: {e}\n' \
'This likely means you need to re-build the AudioFeatureGenerator wrapper package\n\n') from e
self._spectrogram_shape = settings.spectrogram_shape
self._wrapper = wrapper_module.AudioFeatureGeneratorWrapper(settings)
def process_sample(self, sample: np.ndarray, dtype=np.float32) -> np.ndarray:
"""Convert the provided 1D audio sample to a 2D spectrogram using the AudioFeatureGenerator
The generated 2D spectrogram dimensions are calculated as follows::
sample_length = len(sample) = int(sample_length_ms * sample_rate_hz / 1000)
window_size_length = int(window_size_ms * sample_rate_hz / 1000)
window_step_length = int(window_step_ms * sample_rate_hz / 1000)
height = n_features = (sample_length - window_size_length) // window_step_length + 1
width = n_channels = AudioFeatureGeneratorSettings.filterbank_n_channels
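For example, with the settings from the class example above
(sample_rate_hz=16000, sample_length_ms=1200, window_size_ms=30,
window_step_ms=10, filterbank_n_channels=108) this works out to::
sample_length = int(1200 * 16000 / 1000) = 19200
window_size_length = int(30 * 16000 / 1000) = 480
window_step_length = int(10 * 16000 / 1000) = 160
height = (19200 - 480) // 160 + 1 = 118
width = 108
i.e., a [118, 108] spectrogram.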
The dtype argument specifies the data type of the returned spectrogram.
This must be one of the following:
* **uint16**: This is the raw value generated by the internal AudioFeatureGenerator library
* **float32**: This is the uint16 value directly cast to a float32
* **int8**: This is the int8 value generated by the TFLM "micro features" library.
Refer to the following for the magic that happens here: `micro_features_generator.cc#L84 <https://github.com/tensorflow/tflite-micro/blob/main/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_features_generator.cc#L84>`_
Args:
sample: [sample_length] int16 audio sample
dtype: Output data type, must be int8, uint16, or float32
Returns:
[n_features, n_channels] int8, uint16, or float32 spectrogram
"""
# Pre-allocate the output buffer; the wrapper populates it in-place
spectrogram = np.zeros(self._spectrogram_shape, dtype=dtype)
self._wrapper.process_sample(sample, spectrogram)
return spectrogram
def activity_was_detected(self) -> bool:
"""Return whether activity was detected in the previously processed sample"""
return self._wrapper.activity_was_detected()
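if __name__ == '__main__':
    # Minimal, illustrative sketch of driving the AudioFeatureGenerator directly
    # (rather than through the audio_utils.apply_frontend() helper shown in the
    # class docstring). This assumes the wrapper C++ shared library has been
    # built; the synthetic int16 sample below is a placeholder for real audio.
    settings = AudioFeatureGeneratorSettings()
    settings.sample_rate_hz = 16000
    settings.sample_length_ms = 1000
    settings.window_size_ms = 30
    settings.window_step_ms = 10
    settings.filterbank_n_channels = 40

    # 1s of silence at 16kHz as an int16 placeholder sample
    sample_length = int(settings.sample_rate_hz * settings.sample_length_ms / 1000)
    sample = np.zeros((sample_length,), dtype=np.int16)

    frontend = AudioFeatureGenerator(settings)
    spectrogram = frontend.process_sample(sample, dtype=np.uint16)
    print(f'Spectrogram shape: {spectrogram.shape}')  # expected: (98, 40)
    print(f'Activity detected: {frontend.activity_was_detected()}')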