split_scp.pl 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246
  1. #!/usr/bin/env perl
  2. # Copyright 2010-2011 Microsoft Corporation
  3. # See ../../COPYING for clarification regarding multiple authors
  4. #
  5. # Licensed under the Apache License, Version 2.0 (the "License");
  6. # you may not use this file except in compliance with the License.
  7. # You may obtain a copy of the License at
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  12. # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
  13. # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
  14. # MERCHANTABLITY OR NON-INFRINGEMENT.
  15. # See the Apache 2 License for the specific language governing permissions and
  16. # limitations under the License.
  17. # This program splits up any kind of .scp or archive-type file.
  18. # If there is no utt2spk option it will work on any text file and
  19. # will split it up with an approximately equal number of lines in
  20. # each output file.
  21. # With the --utt2spk option it will work on anything that has the
  22. # utterance-id as the first entry on each line; the utt2spk file is
  23. # of the form "utterance speaker" (on each line).
  24. # It splits it into equal size chunks as far as it can. If you use the utt2spk
  25. # option it will make sure these chunks coincide with speaker boundaries. In
  26. # this case, if there are more chunks than speakers it refuses to split and
  27. # exits with an error; if a resulting chunk would nevertheless be empty it
  28. # prints an error message and exits with nonzero status.
  29. # You will normally call this like:
  30. # split_scp.pl scp scp.1 scp.2 scp.3 ...
  31. # or
  32. # split_scp.pl --utt2spk=utt2spk scp scp.1 scp.2 scp.3 ...
  33. # Note that you can use this script to split the utt2spk file itself,
  34. # e.g. split_scp.pl --utt2spk=utt2spk utt2spk utt2spk.1 utt2spk.2 ...
  35. # You can also call the script like:
  36. # split_scp.pl -j 3 0 scp scp.0
  37. # [note: with this option, it assumes zero-based indexing of the split parts,
  38. # i.e. the second number must be 0 <= n < num-jobs.]
  39. use warnings;
  # ---- Command-line option parsing -------------------------------------------
  # Strips the leading options from an argument list.
  #
  # Recognised options (accepted in any order, before the file names):
  #   -j <num-jobs> <job-id>   only produce split number <job-id> of <num-jobs>
  #   --utt2spk=<file>         utterance-to-speaker map
  #   --one-based              <job-id> counts from 1 instead of 0
  #
  # Takes the raw argument list and returns the flat list
  #   (num_jobs, job_id, utt2spk_file, one_based, remaining arguments...)
  # where num_jobs is 0 and utt2spk_file is "" when the option was not given.
  # Dies if -j is not followed by two integers.
  sub _parse_split_scp_options {
      my @args = @_;
      my ($jobs, $id, $utt2spk, $from_one) = (0, 0, "", 0);

      # Up to three passes, so that each of the three options can be consumed
      # regardless of the order in which they were given.
      for (my $pass = 1; $pass <= 3 && @args > 0; $pass++) {
          if ($args[0] eq "-j") {
              shift @args;
              @args >= 2
                  or die "$0: option -j requires two arguments: num-jobs job-id\n";
              ($jobs, $id) = splice(@args, 0, 2);
              # Reject non-integers here: a non-numeric num-jobs would
              # otherwise compare equal to 0 and silently disable -j.
              for my $value ($jobs, $id) {
                  $value =~ /\A-?\d+\z/
                      or die "$0: option -j expects integer arguments, got '$value'\n";
              }
          }
          # The @args guards avoid reading past the end of the list once an
          # earlier option in this pass has consumed the last argument.
          # The pattern is anchored so that an ordinary file name that merely
          # contains "--utt2spk=" is not mistaken for the option.
          if (@args && $args[0] =~ /\A--utt2spk=(.+)/) {
              $utt2spk = $1;
              shift @args;
          }
          if (@args && $args[0] eq '--one-based') {
              $from_one = 1;
              shift @args;
          }
      }
      return ($jobs, $id, $utt2spk, $from_one, @args);
  }

  # These settings are package globals read by the rest of the script; @ARGV is
  # left holding only the file-name arguments.
  ($num_jobs, $job_id, $utt2spk_file, $one_based, @ARGV) =
      _parse_split_scp_options(@ARGV);
  # ---- Argument validation ---------------------------------------------------
  # With -j, the job index must address one of the $num_jobs splits:
  # 0 <= job-id < num-jobs, or 1 <= job-id <= num-jobs with --one-based.
  if ($num_jobs != 0) {
      my $zero_based_id = $job_id - $one_based;
      if ($num_jobs < 0 || $zero_based_id < 0 || $zero_based_id >= $num_jobs) {
          die "$0: Invalid job number/index values for '-j $num_jobs $job_id"
              . ($one_based ? " --one-based" : "") . "'\n";
      }
  }

  # From here on the job index is always zero-based.
  $job_id-- if $one_based;

  # Without -j: the input file plus at least one output file is required.
  # With -j: the input file plus at most one output file is allowed.
  if (($num_jobs == 0 && @ARGV < 2)
      || ($num_jobs > 0 && (@ARGV < 1 || @ARGV > 2))) {
      # Built from explicit "\n"-terminated pieces so the text does not depend
      # on how this source file is laid out.
      die "Usage: split_scp.pl [--utt2spk=<utt2spk_file>] in.scp out1.scp out2.scp ...\n"
        . "   or: split_scp.pl -j num-jobs job-id [--one-based] [--utt2spk=<utt2spk_file>] in.scp [out.scp]\n"
        . " ... where 0 <= job-id < num-jobs, or 1 <= job-id <= num-jobs if --one-based.\n";
  }
  # ---- Input file and list of output destinations ----------------------------
  # Exit status of the script; set to 1 further down when a problem is
  # reported without dying.
  $error = 0;

  # The first remaining argument is the file to split.
  $inscp = shift @ARGV;

  if ($num_jobs != 0) {
      # -j mode: build one destination per job. Only the slot of the requested
      # job receives a real destination (the given file, or "-" when no output
      # file was named); every other slot is sent to /dev/null.
      my $slot = 0;
      while ($slot < $num_jobs) {
          my $destination = "/dev/null";
          if ($slot == $job_id) {
              $destination = (@ARGV > 0) ? $ARGV[0] : "-";
          }
          push @OUTPUTS, $destination;
          $slot++;
      }
  } else {
      # Without -j every remaining argument is an output file.
      @OUTPUTS = @ARGV;
  }
  # ---- Splitting and writing -------------------------------------------------

  # Opens one output destination for writing and returns the file handle.
  # The special name "-" yields a duplicate of standard output rather than
  # STDOUT itself. Dies if the open fails.
  sub _open_scp_output {
      my ($scpfile) = @_;
      my $fh;
      ($scpfile ne '-' ? open($fh, '>', $scpfile)
                       : open($fh, '>&', \*STDOUT))
          or die "$0: Could not open scp file $scpfile for writing: $!\n";
      return $fh;
  }

  if ($utt2spk_file ne "") {
      # --utt2spk mode: all lines of one speaker go to the same output, and the
      # speakers are distributed so the outputs are as equal in size as possible.

      # Utterance-id -> speaker-id, one "utterance speaker" pair per line.
      my %speaker_of;
      open(my $u_fh, '<', $utt2spk_file)
          or die "$0: Error opening utt2spk file $utt2spk_file: $!\n";
      while (my $line = <$u_fh>) {
          my @fields = split ' ', $line;
          @fields == 2 or die "$0: Bad line $line in utt2spk file $utt2spk_file\n";
          $speaker_of{$fields[0]} = $fields[1];
      }
      close $u_fh;

      my @spkrs;        # speakers, in order of first appearance in the input
      my %spk_count;    # speaker -> number of input lines
      my %spk_data;     # speaker -> ref to array holding that speaker's lines
      open(my $i_fh, '<', $inscp)
          or die "$0: Error opening input scp file $inscp: $!\n";
      while (my $line = <$i_fh>) {
          my @fields = split ' ', $line;
          @fields > 0 or die "$0: Empty or space-only line in scp file $inscp\n";
          my $utt = $fields[0];    # the utterance-id is the first field
          my $spk = $speaker_of{$utt};
          defined $spk or die "$0: No utterance $utt in utt2spk file $utt2spk_file\n";
          if (!defined $spk_count{$spk}) {
              push @spkrs, $spk;
              $spk_count{$spk} = 0;
              $spk_data{$spk} = [];
          }
          # Lines are regrouped by speaker, so a final input line lacking its
          # newline could end up in the middle of an output and run into the
          # next record; terminate it here.
          $line .= "\n" unless $line =~ /\n\z/;
          $spk_count{$spk}++;
          push @{ $spk_data{$spk} }, $line;
      }
      close $i_fh;

      # Now split as equally as possible. First allocate speakers to outputs
      # by giving each output an approximately equal number of speakers.
      my $numspks = @spkrs;      # number of speakers
      my $numscps = @OUTPUTS;    # number of output files
      if ($numspks < $numscps) {
          die "$0: Refusing to split data because number of speakers $numspks " .
              "is less than the number of output .scp files $numscps\n";
      }
      my @scparray = map { [] } 1 .. $numscps;    # per output: list of speakers
      my @scpcount = (0) x $numscps;              # per output: number of lines
      for my $spkidx (0 .. $numspks - 1) {
          my $scpidx = int(($spkidx * $numscps) / $numspks);
          my $spk = $spkrs[$spkidx];
          push @{ $scparray[$scpidx] }, $spk;
          $scpcount[$scpidx] += $spk_count{$spk};
      }

      # Now try to reassign the first and last speaker of each output to the
      # neighbouring output and see if it gets more balanced.
      # Suppose the objective being minimized is
      #   sum_i (num utts in scp[i] - average)^2.
      # When changing just two outputs, this is minimized by minimizing the
      # squared difference in their sizes, which is equivalent to minimizing
      # the absolute difference. A move is only made when it strictly reduces
      # that difference, so the loop is bound to converge.
      my $changed = 1;
      while ($changed) {
          $changed = 0;
          for my $scpidx (0 .. $numscps - 1) {
              # First try to hand the last speaker of this output to the next one.
              if ($scpidx < $numscps - 1 && @{ $scparray[$scpidx] } > 0) {
                  my $spk   = $scparray[$scpidx]->[-1];
                  my $count = $spk_count{$spk};
                  my $nutt1 = $scpcount[$scpidx];
                  my $nutt2 = $scpcount[$scpidx + 1];
                  if (abs(($nutt2 + $count) - ($nutt1 - $count)) < abs($nutt2 - $nutt1)) {
                      $scpcount[$scpidx + 1] += $count;
                      $scpcount[$scpidx]     -= $count;
                      pop @{ $scparray[$scpidx] };
                      unshift @{ $scparray[$scpidx + 1] }, $spk;
                      $changed = 1;
                  }
              }
              # Then try to hand the first speaker of this output to the
              # previous one (checked against the list as it is now).
              if ($scpidx > 0 && @{ $scparray[$scpidx] } > 0) {
                  my $spk   = $scparray[$scpidx]->[0];
                  my $count = $spk_count{$spk};
                  my $nutt1 = $scpcount[$scpidx - 1];
                  my $nutt2 = $scpcount[$scpidx];
                  if (abs(($nutt2 - $count) - ($nutt1 + $count)) < abs($nutt2 - $nutt1)) {
                      $scpcount[$scpidx - 1] += $count;
                      $scpcount[$scpidx]     -= $count;
                      shift @{ $scparray[$scpidx] };
                      push @{ $scparray[$scpidx - 1] }, $spk;
                      $changed = 1;
                  }
              }
          }
      }

      # Now print out the files...
      for my $scpidx (0 .. $numscps - 1) {
          my $scpfile = $OUTPUTS[$scpidx];
          my $out_fh  = _open_scp_output($scpfile);
          my @group   = @{ $scparray[$scpidx] };
          if (@group == 0) {
              print STDERR "$0: error: split_scp.pl producing empty .scp file " .
                  "$scpfile (too many splits and too few speakers?)\n";
              $error = 1;
          } else {
              my $count = 0;
              foreach my $spk (@group) {
                  print {$out_fh} @{ $spk_data{$spk} };
                  $count += $spk_count{$spk};
              }
              $count == $scpcount[$scpidx] or die "Count mismatch [code error]";
          }
          # Buffered write errors only surface at close, so check it.
          close($out_fh) or die "$0: Error closing scp file $scpfile: $!\n";
      }
  } else {
      # This block is the "normal" case where there is no --utt2spk
      # option and we just break into equal size chunks.
      open(my $i_fh, '<', $inscp)
          or die "$0: Error opening input scp file $inscp: $!\n";
      my @lines = <$i_fh>;
      close $i_fh;

      my $numscps  = @OUTPUTS;
      my $numlines = @lines;
      if ($numlines == 0) {
          print STDERR "$0: error: empty input scp file $inscp\n";
          $error = 1;
      }
      # int() rounds down: every output gets this many lines, and the first
      # $remainder outputs get one extra line each.
      my $linesperscp = int($numlines / $numscps);
      $linesperscp >= 1 or die "$0: You are splitting into too many pieces! [reduce \$nj ($numscps) to be smaller than the number of lines ($numlines) in $inscp]\n";
      my $remainder = $numlines - ($linesperscp * $numscps);
      ($remainder >= 0 && $remainder < $numlines) or die "bad remainder $remainder";

      my $n = 0;    # index of the next input line to be written
      for my $scpidx (0 .. $#OUTPUTS) {
          my $scpfile = $OUTPUTS[$scpidx];
          my $out_fh  = _open_scp_output($scpfile);
          my $chunk   = $linesperscp + ($scpidx < $remainder ? 1 : 0);
          print {$out_fh} @lines[$n .. $n + $chunk - 1];
          $n += $chunk;
          close($out_fh) or die "$0: Error closing scp file $scpfile: $!\n";
      }
      $n == $numlines or die "$n != $numlines [code error]";
  }

  # Nonzero when a problem was reported above without dying.
  exit($error);