#!/usr/bin/env bash

# Begin configuration section.
nj=32
cmd=utils/run.pl
# End configuration section.

echo "$0 $@"  # Print the command line for logging.

. utils/parse_options.sh || exit 1;
# Positional arguments (checked after option parsing).
if [ $# != 4 ]; then
  echo "Usage: $0 [--nj <num-jobs>] <text-dir> <seg-file> <log-dir> <output-dir>"
  exit 1;
fi

text_dir=$1
seg_file=$2
logdir=$3
output_dir=$4
txt_dir=${output_dir}/txt
mkdir -p ${txt_dir}
mkdir -p ${logdir}
# Tokenize the $nj text splits in parallel; $cmd substitutes JOB with
# the job index 1..nj in both the log path and the arguments.
$cmd JOB=1:$nj $logdir/text_tokenize.JOB.log \
  python utils/text_tokenize.py -t ${text_dir}/txt/text.JOB.txt \
    -s ${seg_file} -i JOB -o ${txt_dir} \
  || exit 1;
# Concatenate the per-job tokenized text into a single file.
for n in $(seq $nj); do
  cat ${txt_dir}/text.$n.txt || exit 1;
done > ${output_dir}/text || exit 1

# Concatenate the per-job token counts into the shape file.
for n in $(seq $nj); do
  cat ${txt_dir}/len.$n || exit 1;
done > ${output_dir}/text_shape || exit 1
- echo "$0: Succeeded text tokenize"
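The script only shows the calling convention of utils/text_tokenize.py, not its contents: it must read one text split (-t), a segmentation resource (-s), a job index (-i), and an output directory (-o), and it must write ${txt_dir}/text.JOB.txt plus ${txt_dir}/len.JOB, since those are the files the concatenation loops collect. Below is a minimal Python sketch of that contract; the long option names, the whitespace tokenization, and the per-line token counts in len.JOB are illustrative assumptions, not the actual implementation.

#!/usr/bin/env python3
"""Hypothetical sketch of the interface the driver assumes for
utils/text_tokenize.py. Whitespace splitting stands in for whatever
segmentation the real tool derives from the -s file."""
import argparse
import os


def main():
    parser = argparse.ArgumentParser(description="Tokenize one text split.")
    parser.add_argument("-t", "--text", required=True,
                        help="input text file for this job")
    parser.add_argument("-s", "--seg-file", required=True,
                        help="segmentation resource (unused in this sketch)")
    parser.add_argument("-i", "--job", type=int, required=True,
                        help="job index, matching JOB in the driver")
    parser.add_argument("-o", "--output-dir", required=True,
                        help="directory for text.<job>.txt and len.<job>")
    args = parser.parse_args()

    out_text = os.path.join(args.output_dir, f"text.{args.job}.txt")
    out_len = os.path.join(args.output_dir, f"len.{args.job}")
    with open(args.text, encoding="utf-8") as fin, \
         open(out_text, "w", encoding="utf-8") as ftext, \
         open(out_len, "w", encoding="utf-8") as flen:
        for line in fin:
            tokens = line.split()  # placeholder tokenization
            ftext.write(" ".join(tokens) + "\n")
            flen.write(f"{len(tokens)}\n")


if __name__ == "__main__":
    main()

Under these assumptions, ${output_dir}/text_shape ends up holding one token count per line of ${output_dir}/text, which matches the text/text_shape pairing the driver produces.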