"""Utilities for processing audio data"""importosfromtypingimportUnionimportnumpyasnpimporttensorflowastffrommltk.utils.pythonimportappend_exception_msgfrommltk.core.preprocess.audio.audio_feature_generatorimport(AudioFeatureGeneratorSettings,AudioFeatureGenerator)try:importlibrosaexceptExceptionase:ifos.name!='nt'and'sndfile library not found'inf'{e}':append_exception_msg(e,'\n\nTry running: sudo apt-get install libsndfile1\n')raiseresample=librosa.resample
def read_audio_file(
    path: Union[str, np.ndarray, tf.Tensor],
    return_sample_rate=False,
    return_numpy=True,
    **kwargs
) -> Union[np.ndarray, tf.Tensor]:
    """Reads and decodes an audio file.

    .. note:: Only mono data is returned as a 1D array/tensor

    Args:
        path: Path to audio file as a python string, numpy string, or tensorflow string
        return_sample_rate: If true then a tuple is returned: (audio data, audio sample rate)
        return_numpy: If true then return numpy array, else return TF tensor

    Returns:
        If return_sample_rate = False, audio data as numpy array or TF tensor
        If return_sample_rate = True, (audio data, sample rate)
    """
    raw = tf.io.read_file(path)
    sample, original_sample_rate = tf.audio.decode_wav(
        raw,
        desired_channels=1,
    )
    sample = tf.squeeze(sample, axis=-1)
    if return_numpy:
        sample = sample.numpy()

    if return_sample_rate:
        if return_numpy:
            original_sample_rate = int(original_sample_rate.numpy())
        return sample, original_sample_rate

    return sample
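# Usage sketch (illustrative, not part of the original module); assumes a
# mono WAV file named 'sample.wav' exists in the working directory:
def _example_read_audio_file():
    data, sr = read_audio_file('sample.wav', return_sample_rate=True)
    print(f'Read {data.shape[0]} samples at {sr}Hz')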
def write_audio_file(
    path: str,
    sample: Union[np.ndarray, tf.Tensor],
    sample_rate: int
) -> Union[str, tf.Tensor]:
    """Write audio data to a file

    Args:
        path: File path to save the audio.
            If this does NOT end with .wav, then the path is assumed to be a directory.
            In this case, the audio path is generated as: <path>/<timestamp>.wav
        sample: Audio data to write. If the data type is ``int16``,
            then it is converted to float32 and scaled by 1/32768
        sample_rate: Sample rate of audio

    Returns:
        Path to written file. If this is executing in a non-eager TF function
        then the path is a TF Tensor, otherwise it is a Python string
    """
    if isinstance(sample, np.ndarray):
        sample = tf.convert_to_tensor(sample)

    if len(sample.shape) == 1:
        sample = tf.expand_dims(sample, axis=-1)

    if sample.dtype == np.int16:
        sample = tf.cast(sample, tf.float32)
        sample = sample / 32768.

    wav = tf.audio.encode_wav(sample, sample_rate)
    path = tf.strings.join((os.path.abspath(path),))
    if not tf.strings.regex_full_match(path, r'.*\.wav'):
        ts = tf.timestamp() * 1000
        fn = tf.strings.format('{}.wav', ts)
        path = tf.strings.join((path, fn), separator=os.path.sep)

    tf.io.write_file(path, wav)

    if tf.executing_eagerly() and isinstance(path, tf.Tensor):
        path = path.numpy().decode('utf-8').replace('\\', '/')

    return path
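# Usage sketch (illustrative, not part of the original module); writes one
# second of 16kHz silence. The output path 'out.wav' is hypothetical:
def _example_write_audio_file():
    silence = np.zeros((16000,), dtype=np.float32)
    saved_path = write_audio_file('out.wav', silence, 16000)
    print(f'Wrote audio to {saved_path}')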
def adjust_length(
    sample: np.ndarray,
    target_sr: int = None,
    original_sr: int = None,
    out_length: int = None,
    offset=0.0,
    trim_threshold_db=30.0,
) -> np.ndarray:
    """Adjust the audio sample length to fit the out_length parameter

    This will re-sample the audio to the target sample rate and
    pad with zeros or crop the input sample as necessary.

    Args:
        sample: Audio sample as a numpy array
        target_sr: The sample rate to re-sample the audio to.
            The original_sr arg must also be provided
        original_sr: The original sample rate of the given audio
        out_length: The length of the output audio sample.
            If omitted, then the input sample length is used
        offset: If in_length > out_length, then this is the percentage offset
            from the beginning of the input to use for the output.
            If in_length < out_length, then this is the percentage to pad
            with zeros before the input sample
        trim_threshold_db: The threshold (in decibels) below reference
            to consider as silence

    Returns:
        The adjusted audio sample
    """
    if original_sr and original_sr != target_sr:
        sample = librosa.core.resample(sample, orig_sr=original_sr, target_sr=target_sr)

    if trim_threshold_db:
        sample_trimmed, _ = librosa.effects.trim(sample, top_db=int(trim_threshold_db))
    else:
        sample_trimmed = sample

    in_length = sample_trimmed.shape[0]
    if out_length is None:
        out_length = sample.shape[0]

    if len(sample.shape) == 1:
        if in_length > out_length:
            diff = in_length - out_length
            before = int(diff * offset)
            sample = sample_trimmed[before:before + out_length]
        elif in_length < out_length:
            diff = out_length - in_length
            before = int(diff * offset)
            pad_before = np.zeros((before,), dtype=sample.dtype)
            pad_after = np.zeros((diff - before,), dtype=sample.dtype)
            sample = np.concatenate((pad_before, sample_trimmed, pad_after), axis=0)

        if sample.shape[0] != out_length:
            sample = sample[:out_length]

    else:
        n_channels = sample_trimmed.shape[1]
        if in_length > out_length:
            diff = in_length - out_length
            before = int(diff * offset)
            sample = sample_trimmed[before:before + out_length, :]
        elif in_length < out_length:
            diff = out_length - in_length
            before = int(diff * offset)
            pad_before = np.zeros((before, n_channels), dtype=sample.dtype)
            pad_after = np.zeros((diff - before, n_channels), dtype=sample.dtype)
            sample = np.concatenate((pad_before, sample_trimmed, pad_after), axis=0)

        if sample.shape[0] != out_length:
            sample = sample[:out_length, :]

    return sample
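# Usage sketch (illustrative, not part of the original module): re-sample a
# one-second, 44.1kHz tone to 16kHz and pad/crop it so the result is exactly
# 16000 samples, centering the trimmed audio in the output window:
def _example_adjust_length():
    t = np.arange(44100) / 44100.
    tone = np.sin(2 * np.pi * 440 * t).astype(np.float32)
    adjusted = adjust_length(
        tone,
        target_sr=16000,
        original_sr=44100,
        out_length=16000,
        offset=0.5,
    )
    print(adjusted.shape)  # -> (16000,)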
def apply_frontend(
    sample: np.ndarray,
    settings: AudioFeatureGeneratorSettings,
    dtype=np.float32
) -> np.ndarray:
    """Send the audio sample through the AudioFeatureGenerator
    and return the generated spectrogram

    Args:
        sample: The audio sample to process in the AudioFeatureGenerator
        settings: The settings to use in the AudioFeatureGenerator
        dtype: The expected output data type. Supported types are:

            * **uint16**: This is the raw value generated by the internal AudioFeatureGenerator library
            * **float32**: This is the uint16 value directly cast to a float32
            * **int8**: This is the int8 value generated by the TFLM "micro features" library.
              Refer to the following for the magic that happens here:
              `micro_features_generator.cc#L84 <https://github.com/tensorflow/tflite-micro/blob/main/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_features_generator.cc#L84>`_

    Returns:
        Generated spectrogram of audio
    """
    if np.issubdtype(sample.dtype, np.floating):
        # Convert the floating point data to int16,
        # which is what the AudioFeatureGenerator expects it to be
        # sample = librosa.util.normalize(sample, norm=np.inf, axis=None)
        sample = sample * 32768
        sample = sample.astype(np.int16)

    if len(sample.shape) == 2:
        sample = np.squeeze(sample, axis=-1)

    frontend = AudioFeatureGenerator(settings)
    return frontend.process_sample(sample, dtype=dtype)
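# Usage sketch (illustrative, not part of the original module). The settings
# attributes below follow the AudioFeatureGeneratorSettings documentation;
# verify the names against your MLTK version:
def _example_apply_frontend():
    settings = AudioFeatureGeneratorSettings()
    settings.sample_rate_hz = 16000
    settings.sample_length_ms = 1000
    audio = np.random.uniform(-1, 1, size=16000).astype(np.float32)
    spectrogram = apply_frontend(audio, settings, dtype=np.float32)
    print(spectrogram.shape)  # (n_windows, n_filterbank_channels)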