run.sh

#!/usr/bin/env bash

. ./path.sh || exit 1;

# machines configuration
CUDA_VISIBLE_DEVICES="0,1"
gpu_num=2
count=1
train_cmd=utils/run.pl
infer_cmd=utils/run.pl

# general configuration
lang=zh
nlsyms_txt=none       # Non-linguistic symbol list, if one exists.
cleaner=none          # Text cleaner.
g2p=none              # g2p method (needed if token_type=phn).
lm_fold_length=150    # fold_length for LM training.
word_vocab_size=10000 # Size of the word vocabulary.
token_type=char
lm_token_list=
nj=10

## Paths to the AISHELL-2 transcript files
lm_train_text=
lm_dev_text=
lm_test_text=
train_data_path_and_name_and_type=${lm_train_text},text,text
train_shape_file=
valid_data_path_and_name_and_type=${lm_dev_text},text,text
valid_shape_file=

lm_config=conf/train_lm_transformer.yaml
exp_dir=./data
tag=exp1
model_dir="baseline_$(basename "${lm_config}" .yaml)_${lang}_${token_type}_${tag}"
lm_exp=${exp_dir}/exp/${model_dir}
inference_lm=valid.loss.ave.pb # Language model path used for decoding.

stage=0
stop_stage=3
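
# Example usage (the transcript paths above are placeholders; edit
# lm_train_text/lm_dev_text/lm_test_text to point at your AISHELL-2 text files),
# then run, e.g.:
#   bash run.sh --stage 0 --stop_stage 3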

. utils/parse_options.sh || exit 1;

# Set bash to 'strict' mode; it will exit on:
# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline'
set -e
set -u
set -o pipefail

# Return the smallest of the integer arguments (used to cap the number of parallel jobs).
min() {
    local a b
    a=$1
    for b in "$@"; do
        if [ "${b}" -le "${a}" ]; then
            a="${b}"
        fi
    done
    echo "${a}"
}
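# Example: "min 10 4 7" prints 4.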

# You can set the GPUs used for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES # e.g., gpuid_list=2,3; defaults to the same GPUs as the training stage
ngpu=$(echo $gpuid_list | awk -F "," '{print NF}')

mkdir -p ${exp_dir}/exp/${model_dir}
token_list=${exp_dir}/exp/${model_dir}/vocab.txt

blank="<blank>" # CTC blank symbol
sos="<s>"       # sos symbol
eos="</s>"      # eos symbol
oov="<unk>"     # Out-of-vocabulary symbol
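
# Stages:
#   0: generate the token list (vocabulary) from ${lm_train_text}
#   1: split the data and collect shape statistics for batching
#   2: train the language model with DDP across ${gpu_num} GPUs
#   3: compute perplexity on ${lm_test_text}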
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    if [ "${token_type}" = char ] || [ "${token_type}" = word ]; then
        echo "Stage 0: Generate ${token_type}-level token_list from ${lm_train_text}"
        # The first symbol in token_list must be "<blank>":
        # index 0 is reserved for the CTC blank in ASR and is also used as the ignore-index in other tasks
        python -m funasr.bin.tokenize_text \
            --token_type "${token_type}" \
            --input "${lm_train_text}" \
            --output "${token_list}" \
            --non_linguistic_symbols "${nlsyms_txt}" \
            --field 2- \
            --cleaner "${cleaner}" \
            --g2p "${g2p}" \
            --write_vocabulary true \
            --add_symbol "${blank}:0" \
            --add_symbol "${sos}:1" \
            --add_symbol "${eos}:2" \
            --add_symbol "${oov}:-1"
    else
        echo "Error: --token_type '${token_type}' is not supported"
        exit 2
    fi
    lm_token_list="${token_list}"
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    echo "Stage 1: Data preparation"

    # 1. Split the key file
    _logdir="${exp_dir}/exp/${model_dir}/log"
    mkdir -p "${_logdir}"
    # Use the minimum of ${nj} and the number of lines in the input files
    _nj=$(min "${nj}" "$(<${lm_train_text} wc -l)" "$(<${lm_dev_text} wc -l)")

    key_file="${lm_train_text}"
    split_scps=""
    for n in $(seq ${_nj}); do
        split_scps+=" ${_logdir}/train.${n}.scp"
    done
    # shellcheck disable=SC2086
    utils/split_scp.pl "${key_file}" ${split_scps}

    key_file="${lm_dev_text}"
    split_scps=""
    for n in $(seq ${_nj}); do
        split_scps+=" ${_logdir}/dev.${n}.scp"
    done
    # shellcheck disable=SC2086
    utils/split_scp.pl "${key_file}" ${split_scps}

    # 2. Submit jobs (--collect_stats true only gathers token-count statistics; no training happens here)
    ## python ../../funasr/bin/lm_train.py \
    ${train_cmd} JOB=1:"${_nj}" "${_logdir}"/stats.JOB.log \
        python -m funasr.bin.lm_train \
            --collect_stats true \
            --use_preprocessor true \
            --token_type "${token_type}" \
            --token_list "${lm_token_list}" \
            --non_linguistic_symbols "${nlsyms_txt}" \
            --cleaner "${cleaner}" \
            --g2p "${g2p}" \
            --train_data_path_and_name_and_type "${lm_train_text},text,text" \
            --valid_data_path_and_name_and_type "${lm_dev_text},text,text" \
            --train_shape_file "${_logdir}/train.JOB.scp" \
            --valid_shape_file "${_logdir}/dev.JOB.scp" \
            --output_dir "${_logdir}/stats.JOB" \
            --config ${lm_config} || { cat "${_logdir}"/stats.*.log; exit 1; }

    # 3. Aggregate shape files
    _opts=
    for i in $(seq "${_nj}"); do
        _opts+="--input_dir ${_logdir}/stats.${i} "
    done
    lm_stats_dir=${exp_dir}/exp/${model_dir}
    # shellcheck disable=SC2086
    python -m funasr.bin.aggregate_stats_dirs ${_opts} --output_dir "${lm_stats_dir}"

    # Append the vocabulary size as the last dimension; this is used for the batch-bins count
    <"${lm_stats_dir}/train/text_shape" \
        awk -v N="$(<${lm_token_list} wc -l)" '{ print $0 "," N }' \
        >"${lm_stats_dir}/train/text_shape.${token_type}"
    <"${lm_stats_dir}/valid/text_shape" \
        awk -v N="$(<${lm_token_list} wc -l)" '{ print $0 "," N }' \
        >"${lm_stats_dir}/valid/text_shape.${token_type}"

    train_shape_file=${lm_stats_dir}/train/text_shape.${token_type}
    valid_shape_file=${lm_stats_dir}/valid/text_shape.${token_type}
fi
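
# The shape files produced in stage 1 are reused below as --train_shape_file / --valid_shape_file.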

# Training Stage
world_size=$gpu_num # run on one machine
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    echo "Stage 2: Training"
    mkdir -p ${lm_exp}
    mkdir -p ${lm_exp}/log
    # File-based rendezvous for distributed (DDP) initialization; remove any stale file first
    INIT_FILE=${lm_exp}/ddp_init
    if [ -f $INIT_FILE ]; then
        rm -f $INIT_FILE
    fi
    init_method=file://$(readlink -f $INIT_FILE)
    echo "$0: init method is $init_method"
    # Launch one training process per GPU; each process gets its own rank and device
    for ((i = 0; i < $gpu_num; ++i)); do
        {
            rank=$i
            local_rank=$i
            gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$((i + 1)))
            python ../../../funasr/bin/lm_train.py \
                --gpu_id ${gpu_id} \
                --use_preprocessor true \
                --token_type "${token_type}" \
                --token_list "${lm_token_list}" \
                --non_linguistic_symbols "${nlsyms_txt}" \
                --cleaner "${cleaner}" \
                --train_data_path_and_name_and_type "${train_data_path_and_name_and_type}" \
                --train_shape_file "${train_shape_file}" \
                --valid_data_path_and_name_and_type "${valid_data_path_and_name_and_type}" \
                --valid_shape_file "${valid_shape_file}" \
                --fold_length "${lm_fold_length}" \
                --resume true \
                --output_dir "${lm_exp}" \
                --config ${lm_config} \
                --ngpu ${gpu_num} \
                --num_worker_count ${count} \
                --multiprocessing_distributed true \
                --dist_init_method ${init_method} \
                --dist_world_size ${world_size} \
                --dist_rank ${rank} \
                --local_rank ${local_rank} 1> ${exp_dir}/exp/${model_dir}/log/train.log.$i 2>&1
        } &
    done
    wait
fi
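
# Per-GPU training logs go to ${exp_dir}/exp/${model_dir}/log/train.log.<i>, one file per GPU.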

# Testing Stage
gpu_num=1
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    echo "Stage 3: Calc perplexity: ${lm_test_text}"
    python ../../../funasr/bin/lm_inference_launch.py \
        --output_dir "${lm_exp}/perplexity_test/output.1" \
        --ngpu "${gpu_num}" \
        --batch_size 1 \
        --train_config "${lm_exp}"/config.yaml \
        --model_file "${lm_exp}/${inference_lm}" \
        --data_path_and_name_and_type "${lm_test_text},text,text" \
        --num_workers 1 \
        --gpuid_list 0 \
        --mode "transformer" \
        --split_with_space false
fi