"""Utilities for processing audio data"""
import os
from typing import Union
import numpy as np
import tensorflow as tf
from mltk.utils.python import append_exception_msg
from mltk.core.preprocess.audio.audio_feature_generator import (
    AudioFeatureGeneratorSettings,
    AudioFeatureGenerator
)
try:
    import librosa
except Exception as e:
    if os.name != 'nt' and 'sndfile library not found' in f'{e}':
        append_exception_msg(e, '\n\nTry running: sudo apt-get install libsndfile1\n')
    raise
resample = librosa.resample
def read_audio_file(
    path:Union[str,np.ndarray,tf.Tensor],
    return_sample_rate=False,
    return_numpy=True,
    **kwargs
) -> Union[np.ndarray,tf.Tensor]:
    """Reads and decodes an audio file.

    .. note::
        Only mono data is returned as a 1D array/tensor

    Args:
        path: Path to audio file as a python string, numpy string, or tensorflow string
        return_sample_rate: If true then a tuple is returned: (audio data, audio sample rate)
        return_numpy: If true then return a numpy array, else return a TF tensor

    Returns:
        If return_sample_rate = False, the audio data as a numpy array or TF tensor
        If return_sample_rate = True, a tuple: (audio data, sample rate)
    """
    raw = tf.io.read_file(path)
    sample, original_sample_rate = tf.audio.decode_wav(
        raw,
        desired_channels=1,
    )
    sample = tf.squeeze(sample, axis=-1)

    if return_numpy:
        sample = sample.numpy()

    if return_sample_rate:
        if return_numpy:
            original_sample_rate = int(original_sample_rate.numpy())
        return sample, original_sample_rate

    return sample
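

# Hedged usage sketch (added for illustration, not part of the original module):
# shows one way read_audio_file() might be called. The 'my_keyword.wav' path is
# a hypothetical example value.
def _example_read_audio_file():
    sample, sample_rate = read_audio_file(
        'my_keyword.wav',          # hypothetical path to a mono .wav file
        return_sample_rate=True,
        return_numpy=True,
    )
    # Mono audio is returned as a 1D float32 array in the range [-1, 1]
    print(f'Read {sample.shape[0]} samples at {sample_rate}Hz')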
def write_audio_file(
    path:str,
    sample:Union[np.ndarray,tf.Tensor],
    sample_rate:int
) -> Union[str,tf.Tensor]:
    """Write audio data to a file

    Args:
        path: File path to save audio.
            If this does NOT end with .wav, then the path is assumed to be a directory.
            In this case, the audio path is generated as: <path>/<timestamp>.wav
        sample: Audio data to write, if the data type is:

            - ``int16`` then it is converted to float32 and divided by 32768

        sample_rate: Sample rate of audio

    Returns:
        Path to the written file. If this is executing in a non-eager TF function
        then the path is a TF Tensor, otherwise it is a Python string
    """
    if isinstance(sample, np.ndarray):
        sample = tf.convert_to_tensor(sample)

    if len(sample.shape) == 1:
        sample = tf.expand_dims(sample, axis=-1)

    if sample.dtype == np.int16:
        sample = tf.cast(sample, tf.float32)
        sample = sample / 32768.

    wav = tf.audio.encode_wav(sample, sample_rate)

    path = tf.strings.join((os.path.abspath(path),))
    if not tf.strings.regex_full_match(path, r'.*\.wav'):
        ts = tf.timestamp() * 1000
        fn = tf.strings.format('{}.wav', ts)
        path = tf.strings.join((path, fn), separator=os.path.sep)

    tf.io.write_file(path, wav)

    if tf.executing_eagerly() and isinstance(path, tf.Tensor):
        path = path.numpy().decode('utf-8').replace('\\', '/')

    return path
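

# Hedged usage sketch (added for illustration, not part of the original module):
# writes one second of a 1kHz test tone. The output file name and the 16kHz
# sample rate are hypothetical example values.
def _example_write_audio_file():
    sr = 16000
    t = np.arange(sr) / sr
    tone = (0.5 * np.sin(2 * np.pi * 1000 * t)).astype(np.float32)  # float32 in [-1, 1]
    saved_path = write_audio_file('tone_1khz.wav', tone, sample_rate=sr)
    print(f'Wrote {saved_path}')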
def adjust_length(
    sample:np.ndarray,
    target_sr:int=None,
    original_sr:int=None,
    out_length:int=None,
    offset=0.0,
    trim_threshold_db=30.0,
) -> np.ndarray:
    """Adjust the audio sample length to fit the out_length parameter

    This will re-sample the audio to the target sample rate and
    pad with zeros or crop the input sample as necessary.

    Args:
        sample: Audio sample as a numpy array
        target_sr: The sample rate to re-sample the audio. The original_sr arg must also be provided
        original_sr: The original sample rate of the given audio
        out_length: The length of the output audio sample. If omitted then the input sample length is used
        offset: A fraction from 0.0 to 1.0.
            If in_length > out_length, then this is the fractional offset from the beginning of the input to use for the output.
            If in_length < out_length, then this is the fraction of the padding zeros to insert before the input sample
        trim_threshold_db: The threshold (in decibels) below reference to consider as silence

    Returns:
        The adjusted audio sample
    """
    if original_sr and original_sr != target_sr:
        sample = librosa.core.resample(sample, orig_sr=original_sr, target_sr=target_sr)

    if trim_threshold_db:
        sample_trimmed, _ = librosa.effects.trim(sample, top_db=int(trim_threshold_db))
    else:
        sample_trimmed = sample

    in_length = sample_trimmed.shape[0]
    if out_length is None:
        out_length = sample.shape[0]

    if len(sample.shape) == 1:
        if in_length > out_length:
            diff = in_length - out_length
            before = int(diff * offset)
            sample = sample_trimmed[before : before + out_length]

        elif in_length < out_length:
            diff = out_length - in_length
            before = int(diff * offset)
            pad_before = np.zeros((before,), dtype=sample.dtype)
            pad_after = np.zeros((diff - before,), dtype=sample.dtype)
            sample = np.concatenate((pad_before, sample_trimmed, pad_after), axis=0)

        if sample.shape[0] != out_length:
            sample = sample[:out_length]

    else:
        n_channels = sample_trimmed.shape[1]
        if in_length > out_length:
            diff = in_length - out_length
            before = int(diff * offset)
            sample = sample_trimmed[before : before + out_length, :]

        elif in_length < out_length:
            diff = out_length - in_length
            before = int(diff * offset)
            pad_before = np.zeros((before, n_channels), dtype=sample.dtype)
            pad_after = np.zeros((diff - before, n_channels), dtype=sample.dtype)
            sample = np.concatenate((pad_before, sample_trimmed, pad_after), axis=0)

        if sample.shape[0] != out_length:
            sample = sample[:out_length, :]

    return sample
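

# Hedged usage sketch (added for illustration, not part of the original module):
# re-samples a clip to 16kHz and pads/crops it to exactly 1 second. The sample
# rates, clip length, and offset are hypothetical example values.
def _example_adjust_length():
    original_sr = 44100
    target_sr = 16000
    # ~0.5 seconds of noise as a placeholder input clip
    clip = np.random.uniform(-1.0, 1.0, size=original_sr // 2).astype(np.float32)
    one_second = adjust_length(
        clip,
        target_sr=target_sr,
        original_sr=original_sr,
        out_length=target_sr,   # 1 second at the target rate
        offset=0.5,             # center the (trimmed) audio in the padded output
    )
    assert one_second.shape[0] == target_sr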
def apply_frontend(
    sample:np.ndarray,
    settings:AudioFeatureGeneratorSettings,
    dtype=np.float32
) -> np.ndarray:
    """Send the audio sample through the AudioFeatureGenerator and return the generated spectrogram

    Args:
        sample: The audio sample to process in the AudioFeatureGenerator
        settings: The settings to use in the AudioFeatureGenerator
        dtype: The expected output data type; supported types are:

            * **uint16**: This is the raw value generated by the internal AudioFeatureGenerator library
            * **float32**: This is the uint16 value directly cast to a float32
            * **int8**: This is the int8 value generated by the TFLM "micro features" library.
              Refer to the following for the magic that happens here: `micro_features_generator.cc#L84 <https://github.com/tensorflow/tflite-micro/blob/main/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_features_generator.cc#L84>`_

    Returns:
        Generated spectrogram of audio
    """
    if np.issubdtype(sample.dtype, np.floating):
        # Convert the floating point data to int16
        # which is what the AudioFeatureGenerator expects it to be
        # sample = librosa.util.normalize(sample, norm=np.inf, axis=None)
        sample = sample * 32768
        sample = sample.astype(np.int16)

    if len(sample.shape) == 2:
        sample = np.squeeze(sample, axis=-1)

    frontend = AudioFeatureGenerator(settings)
    return frontend.process_sample(sample, dtype=dtype)
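

# Hedged usage sketch (added for illustration, not part of the original module):
# the specific AudioFeatureGeneratorSettings attributes used below
# (sample_rate_hz, sample_length_ms, window_size_ms, window_step_ms,
# filterbank_n_channels) are assumptions about that API and the values are
# hypothetical; adjust them to match your model's frontend configuration.
def _example_apply_frontend():
    settings = AudioFeatureGeneratorSettings()
    settings.sample_rate_hz = 16000
    settings.sample_length_ms = 1000
    settings.window_size_ms = 30
    settings.window_step_ms = 20
    settings.filterbank_n_channels = 40

    # 1 second of silence as a placeholder input sample
    audio = np.zeros((16000,), dtype=np.int16)
    spectrogram = apply_frontend(audio, settings, dtype=np.float32)
    print(f'Spectrogram shape: {spectrogram.shape}')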