#!/usr/bin/env bash
#
# Extract text embeddings for the train/dev/test sets.
#
# For each data set this script:
#   1. splits <raw_dataset_path>/data/<set>/text into ${nj} pieces
#      (utils/split_scp.pl);
#   2. stage 1: runs utils/extract_embeds.py on every piece, at most
#      ${jobs_per_batch} pieces at a time, each as a background job;
#   3. stage 3: concatenates the per-piece .scp / .shape files;
#   4. copies the merged embeds.scp next to the data set's text file.
#
# The variables below are defaults.
# NOTE(review): presumably utils/parse_options.sh lets callers override them
# from the command line (e.g. "--nj 32") -- confirm against that helper.

stage=1
stop_stage=3
bert_model_name="bert-base-chinese"
raw_dataset_path="../DATA"
nj=64
# Number of extraction jobs launched concurrently. Job k of a batch is given
# device index k (1..jobs_per_batch) as its 5th argument.
jobs_per_batch=4
# NOTE(review): model_path is derived here, before option parsing, so changing
# only bert_model_name afterwards would not change model_path -- confirm
# whether callers are expected to override model_path directly.
model_path=${bert_model_name}

. utils/parse_options.sh || exit 1;

# Both values drive loop bounds below; reject anything but positive integers.
for opt_name in nj jobs_per_batch; do
  if ! [[ "${!opt_name}" =~ ^[1-9][0-9]*$ ]]; then
    echo "$0: ${opt_name} must be a positive integer, got '${!opt_name}'" >&2
    exit 1
  fi
done

for data_set in train dev test; do
  scp="${raw_dataset_path}/data/${data_set}/text"
  local_scp_dir_raw="${raw_dataset_path}/data/embeds/${data_set}"
  local_scp_dir="${local_scp_dir_raw}/split${nj}"
  local_records_dir="${local_scp_dir_raw}/ark"
  mkdir -p "${local_records_dir}" || exit 1
  mkdir -p "${local_scp_dir}" || exit 1

  # One output file per job; an array keeps paths with spaces intact.
  split_scps=()
  for ((JOB = 1; JOB <= nj; JOB++)); do
    split_scps+=("${local_scp_dir}/data.${JOB}.text")
  done
  utils/split_scp.pl "${scp}" "${split_scps[@]}" || exit 1

  if [[ "${stage}" -le 1 && "${stop_stage}" -ge 1 ]]; then
    # Walk over ALL nj pieces in batches of jobs_per_batch. (The batch count
    # used to be hard-coded to 8 x 4 = 32 jobs, which left pieces 33..nj
    # unprocessed with the default nj=64.)
    for ((batch_start = 0; batch_start < nj; batch_start += jobs_per_batch)); do
      pids=()
      job_ids=()
      for ((idx = 1; idx <= jobs_per_batch; idx++)); do
        JOB=$((batch_start + idx))
        # The last batch may be partial when nj is not a multiple of the
        # batch size.
        ((JOB > nj)) && break
        echo "proces jobid=$JOB"
        {
          beg=0
          # NOTE(review): device indices are 1-based here (1..jobs_per_batch);
          # if extract_embeds.py expects 0-based CUDA ids, device 0 is never
          # used -- confirm and adjust beg if needed.
          gpu=$((beg + idx))
          echo "${local_scp_dir}/log.${JOB}"
          python utils/extract_embeds.py \
            "${local_scp_dir}/data.${JOB}.text" \
            "${local_records_dir}/embeds.${JOB}.ark" \
            "${local_records_dir}/embeds.${JOB}.scp" \
            "${local_records_dir}/embeds.${JOB}.shape" \
            "${gpu}" "${model_path}" &> "${local_scp_dir}/log.${JOB}"
        } &
        pids+=("$!")
        job_ids+=("${JOB}")
      done

      # Barrier: wait for every job of this batch and surface failures, since
      # a bare `wait` would discard the exit codes.
      failed=0
      for i in "${!pids[@]}"; do
        if ! wait "${pids[$i]}"; then
          echo "$0: extraction job ${job_ids[$i]} failed," \
            "see ${local_scp_dir}/log.${job_ids[$i]}" >&2
          failed=1
        fi
      done
      ((failed == 0)) || exit 1
    done
  fi

  if [[ "${stage}" -le 3 && "${stop_stage}" -ge 3 ]]; then
    # Merge the per-job index and shape files in job order.
    for ((JOB = 1; JOB <= nj; JOB++)); do
      cat "${local_records_dir}/embeds.${JOB}.scp" || exit 1;
    done > "${local_scp_dir_raw}/embeds.scp"
    for ((JOB = 1; JOB <= nj; JOB++)); do
      cat "${local_records_dir}/embeds.${JOB}.shape" || exit 1;
    done > "${local_scp_dir_raw}/embeds.shape"
  fi

  # Publish the merged index next to the data set's text file. When stage 3
  # was skipped and no merged file exists yet, there is nothing to copy.
  if [[ -f "${local_scp_dir_raw}/embeds.scp" ]]; then
    cp "${local_scp_dir_raw}/embeds.scp" \
      "${raw_dataset_path}/data/${data_set}/embeds.scp" || exit 1
  fi
done

echo "embeds is in: ${local_scp_dir_raw}"
echo "success"