process_opus.py 3.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586
  1. # Copyright 2021 NPU, ASLP Group (Author: Qijie Shao)
  2. # Licensed under the Apache License, Version 2.0 (the "License");
  3. # you may not use this file except in compliance with the License.
  4. # You may obtain a copy of the License at
  5. #
  6. # http://www.apache.org/licenses/LICENSE-2.0
  7. #
  8. # Unless required by applicable law or agreed to in writing, software
  9. # distributed under the License is distributed on an "AS IS" BASIS,
  10. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  11. # See the License for the specific language governing permissions and
  12. # limitations under the License.
  13. # process_opus.py: segmentation and downsampling of opus audio
  14. # usage: python3 process_opus.py wav.scp segments output_wav.scp
  15. from pydub import AudioSegment
  16. import sys
  17. import os
  18. def read_file(wav_scp, segments):
  19. wav_scp_dict = {}
  20. with open(wav_scp, "r", encoding="UTF-8") as fin:
  21. for line_str in fin:
  22. wav_id, path = line_str.strip().split()
  23. wav_scp_dict[wav_id] = path
  24. utt_list = []
  25. seg_path_list = []
  26. start_time_list = []
  27. end_time_list = []
  28. with open(segments, "r", encoding="UTF-8") as fin:
  29. for line_str in fin:
  30. arr = line_str.strip().split()
  31. assert len(arr) == 4
  32. utt_list.append(arr[0])
  33. seg_path_list.append(wav_scp_dict[arr[1]])
  34. start_time_list.append(float(arr[2]))
  35. end_time_list.append(float(arr[3]))
  36. return utt_list, seg_path_list, start_time_list, end_time_list
  37. # TODO(Qijie): Fix the process logic
  38. def output(output_wav_scp, utt_list, seg_path_list, start_time_list, end_time_list):
  39. num_utts = len(utt_list)
  40. with open(output_wav_scp, "w", encoding="UTF-8") as fout:
  41. previous_wav_path = ""
  42. for i in range(num_utts):
  43. utt_id = utt_list[i]
  44. current_wav_path = seg_path_list[i]
  45. output_dir = (os.path.dirname(current_wav_path)).replace(
  46. "audio", "audio_seg"
  47. )
  48. seg_wav_path = os.path.join(output_dir, utt_id + ".wav")
  49. os.makedirs(output_dir, exist_ok=True)
  50. if current_wav_path != previous_wav_path:
  51. source_wav = AudioSegment.from_file(current_wav_path)
  52. previous_wav_path = current_wav_path
  53. start = int(start_time_list[i] * 1000)
  54. end = int(end_time_list[i] * 1000)
  55. target_audio = source_wav[start:end].set_frame_rate(16000).set_sample_width(2)
  56. target_audio.export(seg_wav_path, format="wav")
  57. fout.write("{} {}\n".format(utt_id, seg_wav_path))
  58. if i % 200 == 0:
  59. print("seg wav finished: {}%".format(int(i / num_utts)))
  60. def main():
  61. wav_scp = sys.argv[1]
  62. segments = sys.argv[2]
  63. output_wav_scp = sys.argv[3]
  64. utt_list, seg_path_list, start_time_list, end_time_list = read_file(
  65. wav_scp, segments
  66. )
  67. output(output_wav_scp, utt_list, seg_path_list, start_time_list, end_time_list)
  68. if __name__ == "__main__":
  69. main()