#!/bin/bash

# Copyright 2015 Yajie Miao (Carnegie Mellon University)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This script compiles the lexicon and CTC tokens into FSTs. FST compiling
# slightly differs between the phoneme and character-based lexicons.
  17. . fst/parse_options.sh
  18. if [ $# -ne 3 ]; then
  19. echo "usage: utils/compile_dict_token.sh <dict-src-dir> <tmp-dir> <lang-dir>"
  20. echo "e.g.: utils/compile_dict_token.sh data/local/dict_phn data/local/lang_phn_tmp data/lang_phn"
  21. echo "<dict-src-dir> should contain the following files:"
  22. echo "lexicon.out units.txt"
  23. echo "options: "
  24. exit 1;
  25. fi
  26. srcdir=$1
  27. tmpdir=$2
  28. dir=$3
  29. mkdir -p $tmpdir $dir
  30. [ -f path.sh ] && . ./path.sh
  31. cp $srcdir/units.txt $dir
  32. # Add probabilities to lexicon entries. There is in fact no point of doing this here since all the entries have 1.0.
  33. # But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is.
  34. perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.out > $tmpdir/lexiconp.txt || exit 1;
  35. # Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst.
  36. # Without these symbols, determinization will fail.
  37. ndisambig=`fst/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt`
  38. ndisambig=$[$ndisambig+1];
  39. ( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list
  40. # Get the full list of CTC tokens used in FST. These tokens include <eps>, the blank <blk>, the actual labels (e.g.,
  41. # phonemes), and the disambiguation symbols.
  42. cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list
  43. (echo '<eps>') | cat - $tmpdir/units.list $tmpdir/disambig.list | awk '{print $1 " " (NR-1)}' > $dir/tokens.txt
  44. # Compile the tokens into FST
  45. fst/ctc_token_fst.py $dir/tokens.txt | fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt \
  46. --keep_isymbols=false --keep_osymbols=false | fstarcsort --sort_type=olabel > $dir/T.fst || exit 1;
  47. # Encode the words with indices. Will be used in lexicon and language model FST compiling.
  48. cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk '
  49. BEGIN {
  50. print "<eps> 0";
  51. }
  52. {
  53. printf("%s %d\n", $1, NR);
  54. }
  55. END {
  56. printf("#0 %d\n", NR+1);
  57. printf("<s> %d\n", NR+2);
  58. printf("</s> %d\n", NR+3);
  59. }' > $dir/words.txt || exit 1;
  60. # Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time.
  61. token_disambig_symbol=`grep \#0 $dir/tokens.txt | awk '{print $2}'`
  62. word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'`
  63. fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0 "sil" '#'$ndisambig | \
  64. fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \
  65. --keep_isymbols=false --keep_osymbols=false | \
  66. fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \
  67. fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
  68. echo "Dict and token FSTs compiling succeeded"