2 lat temu · 5dd5332fd5
--- a/egs/aishell/transformer/local/download_and_untar.sh
+++ b/egs/aishell/transformer/local/download_and_untar.sh
@@ -0,0 +1,105 @@
 
															+#!/usr/bin/env bash
														
 
															+
														
 
															+# Copyright   2014  Johns Hopkins University (author: Daniel Povey)
														
 
															+#             2017  Xingyu Na
														
 
															+# Apache 2.0
														
 
															+
														
 
															+# Optional leading flag: delete the downloaded .tgz once it has been unpacked.
															+remove_archive=false
															+
															+if [ "$1" == --remove-archive ]; then
															+  remove_archive=true
															+  shift
															+fi
															+
															+# Exactly three positional arguments must remain. Print the usage text and
															+# abort; continuing with missing arguments would only fail later with a
															+# misleading message.
															+if [ $# -ne 3 ]; then
															+  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
															+  echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell"
															+  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
															+  echo "<corpus-part> can be one of: data_aishell, resource_aishell."
															+  exit 1;
															+fi
														
 
															+
														
 
															+# Positional arguments: download directory, base URL, and archive name.
															+data=$1
															+url=$2
															+part=$3
															+
															+if [ ! -d "$data" ]; then
															+  echo "$0: no such directory $data"
															+  exit 1;
															+fi
															+
															+# The script later changes into $data and still refers to "$data/..." paths.
															+# Resolving it to an absolute path keeps those references valid when the
															+# caller passes a relative directory.
															+data=$(cd "$data" && pwd) || exit 1
														
 
															+
														
 
															+# Accept only the corpus parts listed below.
															+part_ok=false
															+list="data_aishell resource_aishell"
															+for x in $list; do
															+  if [ "$part" == "$x" ]; then part_ok=true; fi
															+done
															+if ! $part_ok; then
															+  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
															+  exit 1;
															+fi
															+
															+if [ -z "$url" ]; then
															+  echo "$0: empty URL base."
															+  exit 1;
															+fi
															+
															+# A marker file left by an earlier successful run makes this call a no-op.
															+# The path is quoted so that a data directory containing whitespace does not
															+# turn the test into a syntax error.
															+if [ -f "$data/$part/.complete" ]; then
															+  echo "$0: data part $part was already successfully extracted, nothing to do."
															+  exit 0;
															+fi
														
 
															+
														
 
															+# sizes of the archive files in bytes.
															+sizes="15582913665 1246920"
															+
															+# If an archive is already present, keep it only when its byte count matches
															+# one of the known sizes; otherwise delete it so it is downloaded again.
															+if [ -f "$data/$part.tgz" ]; then
															+  # Count bytes with wc instead of picking a column out of `ls -l`; tr removes
															+  # the padding some wc implementations print around the number.
															+  size=$(wc -c <"$data/$part.tgz" | tr -d '[:space:]')
															+  size_ok=false
															+  for s in $sizes; do if [ "$s" == "$size" ]; then size_ok=true; fi; done
															+  if ! $size_ok; then
															+    echo "$0: removing existing file $data/$part.tgz because its size in bytes $size"
															+    echo "does not equal the size of one of the archives."
															+    rm "$data/$part.tgz"
															+  else
															+    echo "$data/$part.tgz exists and appears to be complete."
															+  fi
															+fi
														
 
															+
														
 
															+# Fetch the archive with wget when it is not already in $data.
															+if [ ! -f "$data/$part.tgz" ]; then
															+  if ! command -v wget >/dev/null; then
															+    echo "$0: wget is not installed."
															+    exit 1;
															+  fi
															+  full_url=$url/$part.tgz
															+  echo "$0: downloading data from $full_url.  This may take some time, please be patient."
															+
															+  # Download inside a subshell so the script's own working directory is not
															+  # changed here; the unconditional cd below then also works when $data is a
															+  # relative path.
															+  # NOTE(review): --no-check-certificate disables TLS certificate validation.
															+  if ! (cd "$data" && wget --no-check-certificate "$full_url"); then
															+    echo "$0: error executing wget $full_url"
															+    exit 1;
															+  fi
															+fi
															+
															+# Everything from here on runs with $data as the working directory.
															+cd "$data" || exit 1
															+
															+if ! tar -xvzf "$part.tgz"; then
															+  echo "$0: error un-tarring archive $data/$part.tgz"
															+  exit 1;
															+fi
														
 
															+
														
 
															+touch $data/$part/.complete
														
 
															+
														
 
															+if [ $part == "data_aishell" ]; then
														
 
															+  cd $data/$part/wav || exit 1
														
 
															+  for wav in ./*.tar.gz; do
														
 
															+    echo "Extracting wav from $wav"
														
 
															+    tar -zxf $wav && rm $wav
														
 
															+  done
														
 
															+fi
														
 
															+
														
 
															+echo "$0: Successfully downloaded and un-tarred $data/$part.tgz"
														
 
															+
														
 
															+if $remove_archive; then
														
 
															+  echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied."
														
 
															+  rm $data/$part.tgz
														
 
															+fi
														
 
															+
														
 
															+exit 0;
														
--- a/egs/aishell/transformer/local/prepare_data.sh
+++ b/egs/aishell/transformer/local/prepare_data.sh
@@ -1,53 +0,0 @@
 
															-#!/usr/bin/env bash
														
 
															-# Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG)
														
 
															-#           2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU)
														
 
															-# Apache 2.0
														
 
															-
														
 
															-# transform raw AISHELL-2 data to kaldi format
														
 
															-
														
 
															-. ./path.sh || exit 1;
														
 
															-
														
 
															-tmp=
														
 
															-dir=
														
 
															-
														
 
															-if [ $# != 3 ]; then
														
 
															-  echo "Usage: $0 <corpus-data-dir> <tmp-dir> <output-dir>"
														
 
															-  echo " $0 /export/AISHELL-2/iOS/train data/local/train data/train"
														
 
															-  exit 1;
														
 
															-fi
														
 
															-
														
 
															-corpus=$1
														
 
															-tmp=$2
														
 
															-dir=$3
														
 
															-
														
 
															-echo "prepare_data.sh: Preparing data in $corpus"
														
 
															-
														
 
															-mkdir -p $tmp
														
 
															-mkdir -p $dir
														
 
															-
														
 
															-# corpus check
														
 
															-if [ ! -d $corpus ] || [ ! -f $corpus/wav.scp ] || [ ! -f $corpus/trans.txt ]; then
														
 
															-  echo "Error: $0 requires wav.scp and trans.txt under $corpus directory."
														
 
															-  exit 1;
														
 
															-fi
														
 
															-
														
 
															-# validate utt-key list, IC0803W0380 is a bad utterance
														
 
															-awk '{print $1}' $corpus/wav.scp | grep -v 'IC0803W0380' > $tmp/wav_utt.list
														
 
															-awk '{print $1}' $corpus/trans.txt > $tmp/trans_utt.list
														
 
															-utils/filter_scp.pl -f 1 $tmp/wav_utt.list $tmp/trans_utt.list > $tmp/utt.list
														
 
															-
														
 
															-# wav.scp
														
 
															-awk -F'\t' -v path_prefix=$corpus '{printf("%s\t%s/%s\n",$1,path_prefix,$2)}' $corpus/wav.scp > $tmp/tmp_wav.scp
														
 
															-utils/filter_scp.pl -f 1 $tmp/utt.list $tmp/tmp_wav.scp | sort -k 1 | uniq > $tmp/wav.scp
														
 
															-
														
 
															-# text
														
 
															-utils/filter_scp.pl -f 1 $tmp/utt.list $corpus/trans.txt | sort -k 1 | uniq > $tmp/text
														
 
															-
														
 
															-# copy prepared resources from tmp_dir to target dir
														
 
															-mkdir -p $dir
														
 
															-for f in wav.scp text; do
														
 
															-  cp $tmp/$f $dir/$f || exit 1;
														
 
															-done
														
 
															-
														
 
															-echo "local/prepare_data.sh succeeded"
														
 
															-exit 0;
														
--- a/egs/aishell/transformer/run.sh
+++ b/egs/aishell/transformer/run.sh
@@ -3,12 +3,12 @@
 
															 . ./path.sh || exit 1;
														
 
															 # machines configuration
														
 
															-CUDA_VISIBLE_DEVICES="2,3"
														
 
															+CUDA_VISIBLE_DEVICES="0,1"
														
 
															 gpu_num=2
														
 
															 count=1
														
 
															 gpu_inference=true  # Whether to perform gpu decoding, set false for cpu decoding
														
 
															 # for gpu decoding, inference_nj=ngpu*njob; for cpu decoding, inference_nj=njob
														
 
															-njob=1
														
 
															+njob=5
														
 
															 train_cmd=utils/run.pl
														
 
															 infer_cmd=utils/run.pl
														
@@ -16,13 +16,11 @@ infer_cmd=utils/run.pl
 
															 feats_dir="../DATA" # feature output directory
														
 
															 exp_dir="."
														
 
															 lang=zh
														
 
															-dumpdir=dump/fbank
														
 
															-feats_type=fbank
														
 
															 token_type=char
														
 
															-scp=wav.scp
														
 
															 type=sound
														
 
															+scp=wav.scp
														
 
															 stage=3
														
 
															-stop_stage=3
														
 
															+stop_stage=4
														
 
															 # feature configuration
														
 
															 feats_dim=80
														
@@ -48,7 +46,7 @@ valid_set=dev
 
															 test_sets="dev test"
														
 
															 asr_config=conf/train_asr_transformer.yaml
														
 
															-model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
														
 
															+model_dir="baseline_$(basename "${asr_config}" .yaml)_${lang}_${token_type}_${tag}"
														
 
															 inference_config=conf/decode_asr_transformer.yaml
														
 
															 inference_asr_model=valid.acc.ave_10best.pb
														
@@ -143,4 +141,61 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
 
															         } &
														
 
															         done
														
 
															         wait
														
 
															+fi
														
 
															+
														
 
															+# Testing Stage
															+# For every set in ${test_sets}: split the key file into per-job lists, run
															+# funasr.bin.asr_inference_launch through ${infer_cmd}, merge the per-job
															+# outputs, and score the merged text with utils/compute_wer.py.
															+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
															+    echo "stage 4: Inference"
															+    for dset in ${test_sets}; do
															+        asr_exp=${exp_dir}/exp/${model_dir}
															+        inference_tag="$(basename "${inference_config}" .yaml)"
															+        _dir="${asr_exp}/${inference_tag}/${inference_asr_model}/${dset}"
															+        _logdir="${_dir}/logdir"
															+        # An existing output directory is never overwritten; the whole script stops.
															+        if [ -d "${_dir}" ]; then
															+            echo "${_dir} is already exists. if you want to decode again, please delete this dir first."
															+            exit 0
															+        fi
															+        mkdir -p "${_logdir}"
															+        _data="${feats_dir}/data/${dset}"
															+        key_file="${_data}/${scp}"
															+        num_scp_file="$(wc -l <"${key_file}")"
															+        # Never use more jobs than there are lines in the key file.
															+        if [ "${inference_nj}" -le "${num_scp_file}" ]; then
															+            _nj=${inference_nj}
															+        else
															+            _nj=${num_scp_file}
															+        fi
															+        # One key list per job, kept in an array so each path stays one argument.
															+        split_scps=()
															+        for n in $(seq "${_nj}"); do
															+            split_scps+=("${_logdir}/keys.${n}.scp")
															+        done
															+        utils/split_scp.pl "${key_file}" "${split_scps[@]}"
															+        _opts=()
															+        if [ -n "${inference_config}" ]; then
															+            _opts+=(--config "${inference_config}")
															+        fi
															+        # ${njob} and ${gpuid_list} are left unquoted exactly as before.
															+        # shellcheck disable=SC2086
															+        ${infer_cmd} --gpu "${_ngpu}" --max-jobs-run "${_nj}" JOB=1:"${_nj}" "${_logdir}"/asr_inference.JOB.log \
															+            python -m funasr.bin.asr_inference_launch \
															+                --batch_size 1 \
															+                --ngpu "${_ngpu}" \
															+                --njob ${njob} \
															+                --gpuid_list ${gpuid_list} \
															+                --data_path_and_name_and_type "${_data}/${scp},speech,${type}" \
															+                --cmvn_file "${feats_dir}/data/${train_set}/cmvn/cmvn.mvn" \
															+                --key_file "${_logdir}"/keys.JOB.scp \
															+                --asr_train_config "${asr_exp}"/config.yaml \
															+                --asr_model_file "${asr_exp}"/"${inference_asr_model}" \
															+                --output_dir "${_logdir}"/output.JOB \
															+                --mode asr \
															+                "${_opts[@]}"
															+
															+        # Concatenate each per-job result file, in job order, into one sorted file
															+        # under ${_dir}; a file type is skipped when job 1 did not produce it.
															+        for f in token token_int score text; do
															+            if [ -f "${_logdir}/output.1/1best_recog/${f}" ]; then
															+                for i in $(seq "${_nj}"); do
															+                    cat "${_logdir}/output.${i}/1best_recog/${f}"
															+                done | sort -k1 >"${_dir}/${f}"
															+            fi
															+        done
															+        # Post-process hypothesis and reference text, score them, and print the
															+        # last three lines of the report.
															+        python utils/proce_text.py "${_dir}/text" "${_dir}/text.proc"
															+        python utils/proce_text.py "${_data}/text" "${_data}/text.proc"
															+        python utils/compute_wer.py "${_data}/text.proc" "${_dir}/text.proc" "${_dir}/text.cer"
															+        tail -n 3 "${_dir}/text.cer" > "${_dir}/text.cer.txt"
															+        cat "${_dir}/text.cer.txt"
															+    done
															 fi