#!/usr/bin/perl
# Copyright 2010-2011 Microsoft Corporation
#                2013 Johns Hopkins University (author: Daniel Povey)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Adds disambiguation symbols to a lexicon.
# Outputs still in the normal lexicon format.
# Disambig syms are numbered #1, #2, #3, etc. (#0
# reserved for symbol in grammar).
# Outputs the number of disambig syms to the standard output.
# With the --pron-probs option, expects the second field
# of each lexicon line to be a pron-prob.

  use strict;
  use warnings;

  # Run the command-line program only when this file is executed directly, so
  # that the subroutines below can also be loaded (e.g. via require) and
  # exercised on their own.
  main() unless caller;

  # Command-line entry point.
  #   Usage: add_lex_disambig.pl [--pron-probs] lexicon.txt lexicon_disambig.txt
  # Reads lexicon.txt, writes the lexicon with disambiguation symbols added to
  # lexicon_disambig.txt, and prints the highest disambiguation-symbol number
  # that was used to the standard output.
  sub main {
      my $pron_probs = 0;
      # Test @ARGV first so that running with no arguments never compares undef.
      if (@ARGV && $ARGV[0] eq "--pron-probs") {
          $pron_probs = 1;
          shift @ARGV;
      }
      if (@ARGV != 2) {
          die "Usage: add_lex_disambig.pl [--pron-probs] lexicon.txt lexicon_disambig.txt ";
      }
      my $lexfn    = shift @ARGV;
      my $lexoutfn = shift @ARGV;

      my @entries = read_lexicon($lexfn);

      # All validation happens in here, i.e. before the output file is opened
      # (and truncated), so a bad lexicon never clobbers an existing output.
      my ($out_lines, $max_disambig) = add_disambig_symbols(\@entries, $pron_probs);

      # Three-argument open: the file name is never interpreted as a mode.
      open(my $out_fh, '>', $lexoutfn)
          || die "Error opening lexicon file $lexoutfn for writing: $!\n";
      print {$out_fh} @$out_lines
          or die "Error writing lexicon file $lexoutfn: $!\n";
      # Buffered write errors (e.g. disk full) only surface at close time.
      close($out_fh)
          || die "Error closing lexicon file $lexoutfn: $!\n";

      print $max_disambig . "\n";
  }

  # (1) Read in the lexicon.
  # Takes the lexicon file name; returns the list of entries, one string per
  # non-blank line, with fields separated by single spaces.  Blank lines are
  # skipped because they contain no word to attach a pronunciation to.
  # Dies if the file cannot be opened.
  sub read_lexicon {
      my ($lexfn) = @_;
      open(my $lex_fh, '<', $lexfn) || die "Error opening lexicon $lexfn: $!";
      my @entries;
      while (my $line = <$lex_fh>) {
          my @fields = split(" ", $line);
          next unless @fields;
          push @entries, join(" ", @fields);
      }
      close($lex_fh);
      return @entries;
  }

  # Returns 1 if the argument is an acceptable pron-prob, i.e. it is defined
  # and lies in the interval (0, 1]; returns 0 otherwise.
  sub _valid_pron_prob {
      my ($prob) = @_;
      return 0 unless defined $prob;
      # A non-numeric field simply compares as 0 and is rejected below; there
      # is no need to warn about it separately.
      no warnings 'numeric';
      return ($prob > 0.0 && $prob <= 1.0) ? 1 : 0;
  }

  # Adds disambiguation symbols to lexicon entries.
  #   $entries    - reference to an array of non-blank lexicon lines, each
  #                 "word [pron-prob] phone1 phone2 ..."
  #   $pron_probs - true if the second field of every line is a pron-prob
  # Returns a two-element list: a reference to the array of output lines
  # (tab-separated, each ending in a newline) and the highest
  # disambiguation-symbol number used (0 if none was needed).
  # Dies if $pron_probs is true and a line lacks a pron-prob in (0, 1].
  sub add_disambig_symbols {
      my ($entries, $pron_probs) = @_;

      # Split every entry into word, optional pron-prob and phone sequence.
      my @parsed;
      foreach my $entry (@$entries) {
          my @fields = split(" ", $entry);
          my $word = shift @fields;    # Remove word.
          my $pron_prob;
          if ($pron_probs) {
              $pron_prob = shift @fields;
              if (!_valid_pron_prob($pron_prob)) {
                  die "Bad lexicon line $entry (expecting pron-prob as second field)";
              }
          }
          push @parsed, [ $word, $pron_prob, join(" ", @fields) ];
      }

      # (2) Work out the count of each phone-sequence in the lexicon.
      my %count;
      foreach my $item (@parsed) {
          $count{ $item->[2] }++;
      }

      # (3) For each left sub-sequence of each phone-sequence, note down that
      # it exists (for identifying prefixes of longer strings).
      my %is_prefix;
      foreach my $item (@parsed) {
          my @phones = split(" ", $item->[2]);
          while (@phones > 0) {
              pop @phones;    # Remove last phone.
              my $prefix = join(" ", @phones);
              # A prefix is only ever marked together with all of its own
              # prefixes, so once we meet a marked one the rest are done.
              last if $is_prefix{$prefix};
              $is_prefix{$prefix} = 1;
          }
      }

      # (4) For each entry in the lexicon:
      #  if the phone sequence is unique and is not a prefix of another
      #  word, no disambig symbol.
      #  Else output #1, or #2, #3, ... if the same phone-seq has already
      #  been assigned a disambig symbol.
      my %disambig_of;    # phone sequence => last symbol number given to it
      my %reserved;       # symbol numbers used up by empty pronunciations
      my $max_disambig = 0;
      my @out_lines;
      foreach my $item (@parsed) {
          my ($word, $pron_prob, $phnseq) = @$item;
          if (exists $is_prefix{$phnseq} || $count{$phnseq} > 1) {
              if ($phnseq eq "") {
                  # Need disambig symbols for the empty string that are not
                  # used anywhere else.
                  $max_disambig++;
                  $reserved{$max_disambig} = 1;
                  $phnseq = "#$max_disambig";
              } else {
                  my $curnumber = $disambig_of{$phnseq};
                  if (!defined $curnumber) { $curnumber = 0; }
                  $curnumber++;    # now 1 or 2, ...
                  # Skip over reserved symbols.
                  while (defined $reserved{$curnumber}) { $curnumber++; }
                  if ($curnumber > $max_disambig) {
                      $max_disambig = $curnumber;
                  }
                  $disambig_of{$phnseq} = $curnumber;
                  $phnseq = $phnseq . " #" . $curnumber;
              }
          }
          if ($pron_probs) {
              push @out_lines, "$word\t$pron_prob\t$phnseq\n";
          } else {
              push @out_lines, "$word\t$phnseq\n";
          }
      }
      return (\@out_lines, $max_disambig);
  }