#!/usr/bin/env bash
# extract_embeds.sh
#
# For each of the train/dev/test sets under ${raw_dataset_path}/data:
#   * split the set's `text` file into ${nj} pieces with utils/split_scp.pl
#     (this step runs regardless of --stage);
#   * stage 1: run utils/extract_embeds.py on every piece, in background
#     batches of ${jobs_per_batch} processes, logging to split${nj}/log.JOB;
#   * stage 3: concatenate the per-job .scp / .shape files into
#     embeds.scp / embeds.shape (there is no stage 2 in this script);
#   * copy the merged embeds.scp next to the set's `text` file.
#
# Every variable assigned before utils/parse_options.sh is sourced is
# presumably overridable from the command line as `--name value`
# (Kaldi-style parse_options; that helper is not part of this file --
# TODO confirm). Must be run from a directory that contains `utils/`.

stage=1
stop_stage=3
bert_model_name="bert-base-chinese"
raw_dataset_path="../DATA"
nj=64              # number of pieces the text file is split into
jobs_per_batch=4   # extraction processes started together before waiting
model_path=        # empty -> falls back to ${bert_model_name} after parsing

. utils/parse_options.sh || exit 1

# Resolve the default only after option parsing, so that overriding
# --bert_model_name on its own actually takes effect.
model_path=${model_path:-$bert_model_name}

# Both values drive arithmetic loops below; reject anything that would make
# those loops misbehave (e.g. a zero step would never terminate).
if ! [[ "${nj}" =~ ^[1-9][0-9]*$ && "${jobs_per_batch}" =~ ^[1-9][0-9]*$ ]]; then
  echo "$0: nj and jobs_per_batch must be positive integers" \
    "(got nj='${nj}', jobs_per_batch='${jobs_per_batch}')" >&2
  exit 1
fi

# Added to the 1-based slot index to form the device id handed to
# extract_embeds.py, so ids run 1..jobs_per_batch.
# NOTE(review): id 0 is never used -- confirm this matches what
# utils/extract_embeds.py expects.
gpu_offset=0

for data_set in train dev test; do
  scp=${raw_dataset_path}/data/${data_set}/text
  local_scp_dir_raw=${raw_dataset_path}/data/embeds/${data_set}
  local_scp_dir=${local_scp_dir_raw}/split${nj}
  local_records_dir=${local_scp_dir_raw}/ark
  mkdir -p "${local_records_dir}" "${local_scp_dir}" || exit 1

  # One output file per job; kept in an array so paths survive word splitting.
  split_scps=()
  for ((JOB = 1; JOB <= nj; JOB++)); do
    split_scps+=("${local_scp_dir}/data.${JOB}.text")
  done
  utils/split_scp.pl "${scp}" "${split_scps[@]}" || exit 1

  if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    # Walk over all nj pieces, jobs_per_batch at a time. The previous fixed
    # 8 x 4 grid only ever processed jobs 1..32, whatever nj was.
    for ((batch_start = 0; batch_start < nj; batch_start += jobs_per_batch)); do
      pids=()
      job_ids=()
      for ((idx = 1; idx <= jobs_per_batch; idx++)); do
        JOB=$((batch_start + idx))
        # The last batch is shorter when nj is not a multiple of the batch size.
        if ((JOB > nj)); then
          break
        fi
        echo "proces jobid=$JOB"
        {
          gpu=$((gpu_offset + idx))
          echo "${local_scp_dir}/log.${JOB}"
          python utils/extract_embeds.py \
            "${local_scp_dir}/data.${JOB}.text" \
            "${local_records_dir}/embeds.${JOB}.ark" \
            "${local_records_dir}/embeds.${JOB}.scp" \
            "${local_records_dir}/embeds.${JOB}.shape" \
            "${gpu}" "${model_path}" &> "${local_scp_dir}/log.${JOB}"
        } &
        pids+=("$!")
        job_ids+=("${JOB}")
      done

      # Wait on each PID individually: a bare `wait` always returns 0 and
      # would hide a crashed extraction job.
      failed=0
      for i in "${!pids[@]}"; do
        if ! wait "${pids[$i]}"; then
          echo "$0: job ${job_ids[$i]} failed, see" \
            "${local_scp_dir}/log.${job_ids[$i]}" >&2
          failed=1
        fi
      done
      if ((failed)); then
        exit 1
      fi
    done
  fi

  if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
    # Merge the per-job index files in job order; abort on any missing piece.
    for ((JOB = 1; JOB <= nj; JOB++)); do
      cat "${local_records_dir}/embeds.${JOB}.scp" || exit 1
    done > "${local_scp_dir_raw}/embeds.scp"
    for ((JOB = 1; JOB <= nj; JOB++)); do
      cat "${local_records_dir}/embeds.${JOB}.shape" || exit 1
    done > "${local_scp_dir_raw}/embeds.shape"
  fi

  # The copy is attempted whatever the stage range is. When the merged file
  # does not exist yet (e.g. --stop_stage 1 on a first run) warn and carry on,
  # so the remaining data sets are still processed.
  if [ -f "${local_scp_dir_raw}/embeds.scp" ]; then
    cp "${local_scp_dir_raw}/embeds.scp" \
      "${raw_dataset_path}/data/${data_set}/embeds.scp" || exit 1
  else
    echo "$0: ${local_scp_dir_raw}/embeds.scp not found, nothing copied" >&2
  fi
done

# Reports the directory of the last data set handled by the loop above.
echo "embeds is in: ${local_scp_dir_raw}"
echo "success"