# eend_ola_feature.py
# Copyright 2019 Hitachi, Ltd. (author: Yusuke Fujita)
# Licensed under the MIT license.
#
# This module is for computing audio features
import librosa
import numpy as np
  7. def transform(Y, dtype=np.float32):
  8. Y = np.abs(Y)
  9. n_fft = 2 * (Y.shape[1] - 1)
  10. sr = 8000
  11. n_mels = 23
  12. mel_basis = librosa.filters.mel(sr, n_fft, n_mels)
  13. Y = np.dot(Y ** 2, mel_basis.T)
  14. Y = np.log10(np.maximum(Y, 1e-10))
  15. mean = np.mean(Y, axis=0)
  16. Y = Y - mean
  17. return Y.astype(dtype)
  18. def subsample(Y, T, subsampling=1):
  19. Y_ss = Y[::subsampling]
  20. T_ss = T[::subsampling]
  21. return Y_ss, T_ss
  22. def splice(Y, context_size=0):
  23. Y_pad = np.pad(
  24. Y,
  25. [(context_size, context_size), (0, 0)],
  26. 'constant')
  27. Y_spliced = np.lib.stride_tricks.as_strided(
  28. np.ascontiguousarray(Y_pad),
  29. (Y.shape[0], Y.shape[1] * (2 * context_size + 1)),
  30. (Y.itemsize * Y.shape[1], Y.itemsize), writeable=False)
  31. return Y_spliced
  32. def stft(
  33. data,
  34. frame_size=1024,
  35. frame_shift=256):
  36. fft_size = 1 << (frame_size - 1).bit_length()
  37. if len(data) % frame_shift == 0:
  38. return librosa.stft(data, n_fft=fft_size, win_length=frame_size,
  39. hop_length=frame_shift).T[:-1]
  40. else:
  41. return librosa.stft(data, n_fft=fft_size, win_length=frame_size,
  42. hop_length=frame_shift).T