# modelscope_infer.py
  1. #!/usr/bin/env python3
  2. import argparse
  3. import logging
  4. import os
  5. from modelscope.pipelines import pipeline
  6. from modelscope.utils.constant import Tasks
  7. if __name__ == '__main__':
  8. parser = argparse.ArgumentParser(
  9. description="decoding configs",
  10. formatter_class=argparse.ArgumentDefaultsHelpFormatter,
  11. )
  12. parser.add_argument("--model_name",
  13. type=str,
  14. default="speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
  15. help="model name in modelscope")
  16. parser.add_argument("--model_revision",
  17. type=str,
  18. default="v1.0.4",
  19. help="model revision in modelscope")
  20. parser.add_argument("--local_model_path",
  21. type=str,
  22. default=None,
  23. help="local model path, usually for fine-tuning")
  24. parser.add_argument("--wav_list",
  25. type=str,
  26. help="input wav list")
  27. parser.add_argument("--output_file",
  28. type=str,
  29. help="saving decoding results")
  30. parser.add_argument(
  31. "--njob",
  32. type=int,
  33. default=1,
  34. help="The number of jobs for each gpu",
  35. )
  36. parser.add_argument(
  37. "--gpuid_list",
  38. type=str,
  39. default="",
  40. help="The visible gpus",
  41. )
  42. parser.add_argument(
  43. "--ngpu",
  44. type=int,
  45. default=0,
  46. help="The number of gpus. 0 indicates CPU mode",
  47. )
  48. args = parser.parse_args()
  49. # set logging messages
  50. logging.basicConfig(
  51. level=logging.INFO,
  52. format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
  53. )
  54. logging.info("Decoding args: {}".format(args))
  55. # gpu setting
  56. if args.ngpu > 0:
  57. jobid = int(args.output_file.split(".")[-1])
  58. gpuid = args.gpuid_list.split(",")[(jobid - 1) // args.njob]
  59. os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
  60. os.environ["CUDA_VISIBLE_DEVICES"] = gpuid
  61. if args.local_model_path is None:
  62. inference_pipeline = pipeline(
  63. task=Tasks.auto_speech_recognition,
  64. model="damo/{}".format(args.model_name),
  65. model_revision=args.model_revision)
  66. else:
  67. inference_pipeline = pipeline(
  68. task=Tasks.auto_speech_recognition,
  69. model=args.local_model_path)
  70. with open(args.wav_list, 'r') as f_wav:
  71. wav_lines = f_wav.readlines()
  72. with open(args.output_file, "w") as f_out:
  73. for line in wav_lines:
  74. wav_id, wav_path = line.strip().split()
  75. logging.info("decoding, utt_id: ['{}']".format(wav_id))
  76. rec_result = inference_pipeline(audio_in=wav_path)
  77. if 'text' in rec_result:
  78. text = rec_result["text"]
  79. else:
  80. text = ''
  81. f_out.write(wav_id + " " + text + "\n")
  82. logging.info("best hypo: {} \n".format(text))