
"""Yes/No
=======================

This is a `synthetically <https://siliconlabs.github.io/mltk/mltk/tutorials/synthetic_audio_dataset_generation.html>`_ generated dataset with the keywords:

- **yes**
- **no**

The samples are 16kHz, 16-bit PCM ``.wav`` files.


.. seealso::

    - `AudioDatasetGenerator <https://siliconlabs.github.io/mltk/docs/python_api/utils/audio_dataset_generator/index.html>`_
    - `Synthetic Audio Dataset Generation Tutorial <https://siliconlabs.github.io/mltk/mltk/tutorials/synthetic_audio_dataset_generation.html>`_
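
The dataset can be downloaded and extracted with this module's ``download()``
function. A minimal usage sketch (counting the samples per keyword assumes the
archive extracts to one subdirectory per class label, which is the usual MLTK
audio dataset layout):

.. code-block:: python

    import os
    import glob

    from mltk.datasets.audio import yes_no

    # Download and extract the archive, returning the dataset directory path
    dataset_dir = yes_no.download()

    # Count the .wav samples for each keyword
    # (assumes one subdirectory per entry in yes_no.CLASSES)
    for label in yes_no.CLASSES:
        samples = glob.glob(os.path.join(dataset_dir, label, '*.wav'))
        print(f'{label}: {len(samples)} samples')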

"""

import logging
import os
import json

from mltk.utils.archive_downloader import download_verify_extract
from mltk.utils.path import create_user_dir, fullpath
from mltk.utils.audio_dataset_generator import (
    AudioDatasetGenerator,
    Keyword,
    Augmentation,
    VoiceRate,
    VoicePitch
)


DOWNLOAD_URL = 'https://www.dropbox.com/s/83z0pkvtunpw7cy/sl_synthetic_yes_no.7z?dl=1'
"""Public download URL"""
VERIFY_SHA1 = 'abf31f3444f17e94b5bded4d4e2a001b5a6cb1b7'
"""SHA1 hash of the downloaded archive file"""

CLASSES = [
    'yes',
    'no',
]
"""The class labels of the dataset samples"""


def download(
    dest_dir:str=None,
    dest_subdir='datasets/yes_no',
    logger:logging.Logger=None,
    clean_dest_dir=False
) -> str:
    """Download and extract the dataset

    Returns:
        The directory path to the extracted dataset
    """
    if dest_dir:
        dest_subdir = None

    sample_dir = download_verify_extract(
        url=DOWNLOAD_URL,
        dest_dir=dest_dir,
        dest_subdir=dest_subdir,
        file_hash=VERIFY_SHA1,
        show_progress=False,
        remove_root_dir=False,
        clean_dest_dir=clean_dest_dir,
        logger=logger
    )
    return sample_dir
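
# Usage sketch for download(); the destination paths shown reflect the assumed
# behavior of download_verify_extract() (illustrative, not verified):
#
#   download()                        -> <MLTK user dir>/datasets/yes_no
#   download(dest_dir='/tmp/yes_no')  -> /tmp/yes_no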


def generate_dataset(out_dir:str=None):
    """Generate the dataset

    This generates the dataset using the `AudioDatasetGenerator <https://siliconlabs.github.io/mltk/docs/python_api/utils/audio_dataset_generator/index.html>`_
    Python package provided by the MLTK.
    """
    import tqdm

    KEYWORDS = [
        Keyword('yes', max_count=10000),
        Keyword('no', max_count=10000),
    ]
    AUGMENTATIONS = [
        Augmentation(rate=VoiceRate.xslow, pitch=VoicePitch.low),
        Augmentation(rate=VoiceRate.xslow, pitch=VoicePitch.medium),
        Augmentation(rate=VoiceRate.xslow, pitch=VoicePitch.high),
        Augmentation(rate=VoiceRate.medium, pitch=VoicePitch.low),
        Augmentation(rate=VoiceRate.medium, pitch=VoicePitch.medium),
        Augmentation(rate=VoiceRate.medium, pitch=VoicePitch.high),
        # Augmentation(rate=VoiceRate.xfast, pitch=VoicePitch.low),
        # Augmentation(rate=VoiceRate.xfast, pitch=VoicePitch.medium),
        # Augmentation(rate=VoiceRate.xfast, pitch=VoicePitch.high),
    ]

    out_dir = out_dir or create_user_dir('datasets/generated/yes_no')

    with AudioDatasetGenerator(
        out_dir=out_dir,
        n_jobs=8
    ) as generator:
        # Load the cloud backends, installing the Python packages if necessary

        # See: https://codelabs.developers.google.com/codelabs/cloud-text-speech-python3
        if 'GOOGLE_APPLICATION_CREDENTIALS' in os.environ:
            try:
                os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = fullpath(os.environ['GOOGLE_APPLICATION_CREDENTIALS'])
            except:
                pass
            with open(os.environ['GOOGLE_APPLICATION_CREDENTIALS'], 'r') as f:
                credentials = json.load(f)
                os.environ['PROJECT_ID'] = credentials['project_id']
            generator.load_backend('gcp', install_python_package=True)
            print('Loaded GCP backend')
        else:
            print('GOOGLE_APPLICATION_CREDENTIALS env not found, *not* loading GCP backend')

        # See: https://docs.aws.amazon.com/polly/latest/dg/get-started-what-next.html
        if 'AWS_ACCESS_KEY_ID' in os.environ or os.path.exists(os.path.expanduser('~/.aws')):
            generator.load_backend('aws', install_python_package=True)
            print('Loaded AWS backend')
        else:
            print('AWS_ACCESS_KEY_ID env not found, *not* loading AWS backend')

        # See: https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/get-started-text-to-speech?pivots=programming-language-python
        if 'SPEECH_KEY' in os.environ:
            generator.load_backend('azure', install_python_package=True)
            print('Loaded Azure backend')
        else:
            print('SPEECH_KEY env not found, *not* loading Azure backend')

        print('Listing voices ...')
        voices = generator.list_voices()

        # Generate a list of all possible configurations, randomly shuffle, then truncate
        # based on the "max_count" specified for each keyword
        print('Listing configurations ...')
        all_configurations = generator.list_configurations(
            keywords=KEYWORDS,
            augmentations=AUGMENTATIONS,
            voices=voices,
            truncate=True,
            seed=42
        )
        n_configs = sum(len(x) for x in all_configurations.values())

        # Print a summary of the configurations
        print(generator.get_summary(all_configurations))

        input(
            '\nWARNING: Running this script is NOT FREE!\n\n'
            'Each cloud backend charges a different rate per character.\n'
            'The character counts are listed above.\n\n'
            'Refer to each backend\'s docs for the latest pricing:\n'
            '- AWS: https://aws.amazon.com/polly/pricing\n'
            '- Azure: https://azure.microsoft.com/en-us/pricing/details/cognitive-services/speech-services\n'
            '- Google: https://cloud.google.com/text-to-speech/pricing\n'
            '\nPress "enter" to continue and generate the dataset\n'
        )

        # Generate the dataset (with pretty progress bars)
        print(f'Generating keywords at: {generator.out_dir}\n')
        with tqdm.tqdm(total=n_configs, desc='Overall'.rjust(10), unit='word', position=1) as pb_outer:
            for keyword, config_list in all_configurations.items():
                with tqdm.tqdm(desc=keyword.value.rjust(10), total=len(config_list), unit='word', position=0) as pb_inner:
                    for config in config_list:
                        generator.generate(
                            config,
                            on_finished=lambda _: (pb_inner.update(1), pb_outer.update(1))
                        )
                    generator.join() # Wait for the current keyword to finish before continuing to the next
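
# Example invocation. The environment variables below correspond to the checks
# in generate_dataset(); AWS_SECRET_ACCESS_KEY and SPEECH_REGION are assumptions
# taken from the linked backend setup guides, not from this module:
#
#   export GOOGLE_APPLICATION_CREDENTIALS=~/gcp-credentials.json    # GCP
#   export AWS_ACCESS_KEY_ID=...; export AWS_SECRET_ACCESS_KEY=...  # AWS Polly
#   export SPEECH_KEY=...; export SPEECH_REGION=...                 # Azure
#   python -m mltk.datasets.audio.yes_no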
if __name__ == '__main__':
    generate_dataset()