| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051 |
- # Copyright 2019 Hitachi, Ltd. (author: Yusuke Fujita)
- # Licensed under the MIT license.
- #
- # This module is for computing audio features
- import librosa
- import numpy as np
def transform(Y, dtype=np.float32, sr=8000, n_mels=23):
    """Convert an STFT matrix into mean-normalized log-mel features.

    Args:
        Y: 2-D STFT array (complex or magnitude) whose second axis holds
            ``n_fft / 2 + 1`` frequency bins; the FFT size is inferred
            from ``Y.shape[1]``. The first axis is presumably time frames,
            matching the transposed output of ``stft`` in this module.
        dtype: dtype of the returned array.
        sr: sampling rate in Hz used to build the mel filterbank.
        n_mels: number of mel bands.

    Returns:
        Array of shape ``(Y.shape[0], n_mels)`` holding log10 mel power
        with the per-band mean over the first axis subtracted, cast to
        ``dtype``.
    """
    magnitude = np.abs(Y)
    # Y has n_fft / 2 + 1 bins on axis 1, so recover n_fft from it.
    n_fft = 2 * (magnitude.shape[1] - 1)
    # Keyword arguments: librosa >= 0.10 rejects positional sr/n_fft/n_mels,
    # while older releases accept the keyword form as well.
    mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)
    mel_power = np.dot(magnitude ** 2, mel_basis.T)
    # Floor at 1e-10 so that log10 never sees zero.
    log_mel = np.log10(np.maximum(mel_power, 1e-10))
    # Mean normalization of each mel band across the first axis.
    log_mel = log_mel - np.mean(log_mel, axis=0)
    return log_mel.astype(dtype)
def subsample(Y, T, subsampling=1):
    """Keep every ``subsampling``-th item of ``Y`` and ``T``.

    Both inputs are sliced along their first axis with the same step,
    starting from index 0.

    Args:
        Y: sliceable sequence or array.
        T: sliceable sequence or array, sliced with the same step as ``Y``.
        subsampling: slice step.

    Returns:
        Tuple ``(Y_ss, T_ss)`` of the subsampled inputs.
    """
    every_kth = slice(None, None, subsampling)
    return Y[every_kth], T[every_kth]
def splice(Y, context_size=0):
    """Concatenate each row of ``Y`` with its neighbouring rows.

    ``Y`` is zero-padded with ``context_size`` rows at both ends, then each
    output row is formed from ``2 * context_size + 1`` consecutive padded
    rows laid end to end.

    Args:
        Y: 2-D numpy array.
        context_size: number of rows of context taken on each side.

    Returns:
        Read-only strided view of shape
        ``(Y.shape[0], Y.shape[1] * (2 * context_size + 1))`` over the
        padded copy of ``Y``.
    """
    padded = np.ascontiguousarray(
        np.pad(Y, [(context_size, context_size), (0, 0)], 'constant'))
    n_rows = Y.shape[0]
    n_cols = Y.shape[1]
    window = 2 * context_size + 1
    elem_bytes = Y.itemsize
    # Row stride advances by one padded row while each output row spans
    # `window` rows, so successive output rows overlap in memory.
    return np.lib.stride_tricks.as_strided(
        padded,
        shape=(n_rows, n_cols * window),
        strides=(elem_bytes * n_cols, elem_bytes),
        writeable=False)
def stft(data, frame_size=1024, frame_shift=256):
    """Compute the STFT of ``data`` and return it transposed.

    The FFT size is ``1 << (frame_size - 1).bit_length()``, i.e. for a
    positive ``frame_size`` the smallest power of two not less than it.

    Args:
        data: audio samples passed to ``librosa.stft``.
        frame_size: window length in samples.
        frame_shift: hop length in samples.

    Returns:
        Transposed ``librosa.stft`` output. When ``len(data)`` is an exact
        multiple of ``frame_shift`` the final row is dropped.
    """
    fft_size = 1 << (frame_size - 1).bit_length()
    # Decide up front whether the trailing frame is dropped.
    drop_last = len(data) % frame_shift == 0
    spectrogram = librosa.stft(
        data,
        n_fft=fft_size,
        win_length=frame_size,
        hop_length=frame_shift,
    ).T
    if drop_last:
        return spectrogram[:-1]
    return spectrogram
|