  1. #!/usr/bin/env bash
  2. # Begin configuration section.
  3. nj=32
  4. cmd=utils/run.pl
  5. echo "$0 $@"
  6. . utils/parse_options.sh || exit 1;
  7. # tokenize configuration
  8. text_dir=$1
  9. seg_file=$2
  10. logdir=$3
  11. output_dir=$4
  12. txt_dir=${output_dir}/txt; mkdir -p ${output_dir}/txt
  13. mkdir -p ${logdir}
  14. $cmd JOB=1:$nj $logdir/text_tokenize.JOB.log \
  15. python utils/text_tokenize.py -t ${text_dir}/txt/text.JOB.txt \
  16. -s ${seg_file} -i JOB -o ${txt_dir} \
  17. || exit 1;
  18. # concatenate the text files together.
  19. for n in $(seq $nj); do
  20. cat ${txt_dir}/text.$n.txt || exit 1
  21. done > ${output_dir}/text || exit 1
  22. for n in $(seq $nj); do
  23. cat ${txt_dir}/len.$n || exit 1
  24. done > ${output_dir}/text_shape || exit 1
  25. echo "$0: Succeeded text tokenize"