#!/usr/bin/env bash
# data_prep.sh
#
# Copyright 2014 Vassil Panayotov
#           2014 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0
#
# Usage: data_prep.sh <src-dir> <dst-dir>
#
# Walks <src-dir>, which must be laid out as
#     <src-dir>/<reader>/<chapter>/*.flac
#     <src-dir>/<reader>/<chapter>/<reader>-<chapter>.trans.txt
# with integer <reader> and <chapter> directory names, and with a
# SPEAKERS.TXT file one level above <src-dir>.
#
# Writes (recreating them from scratch on every run):
#     <dst-dir>/wav.scp  one line per .flac file: "<name> <chapter-dir>/<name>.flac "
#     <dst-dir>/text     the concatenation of all per-chapter transcript files
#
# Exit status: 0 on success, 1 on any usage, layout, or I/O error.
# Directory names are processed line by line, so paths containing spaces are
# handled; paths containing newlines are not supported.

# Without pipefail only the last stage of a pipeline is checked, so a failing
# `find` in the wav.scp pipeline would go unnoticed.
set -o pipefail

# Print the arguments on stderr and abort the script with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Succeed iff $1 is an integer.
# `[` is used on purpose: `[[ x -eq x ]]` evaluates its operands as arithmetic
# expressions and would accept arbitrary variable names. stderr is silenced
# because `[` complains loudly about non-integer operands.
is_integer() {
  [ "$1" -eq "$1" ] 2>/dev/null
}

# Print the immediate subdirectories of $1 (symlinks followed), sorted,
# one per line.
list_subdirs() {
  find -L "$1" -mindepth 1 -maxdepth 1 -type d | sort
}

if [ "$#" -ne 2 ]; then
  echo "Usage: $0 <src-dir> <dst-dir>" >&2
  echo "e.g.: $0 /export/a15/vpanayotov/data/LibriSpeech/dev-clean data/dev-clean" >&2
  exit 1
fi

src=$1
dst=$2

# all utterances are FLAC compressed
command -v flac >/dev/null 2>&1 \
  || die "Please install 'flac' on ALL worker nodes!"

spk_file=$src/../SPEAKERS.TXT

# Validate the inputs before touching the destination, so that a bad <src-dir>
# does not leave an empty <dst-dir> behind.
[[ -d "$src" ]] || die "$0: no such directory $src"
[[ -f "$spk_file" ]] || die "$0: expected file $spk_file to exist"

mkdir -p -- "$dst" || exit 1

wav_scp=$dst/wav.scp
trans=$dst/text

# Both outputs are built by appending, so leftovers from a previous run must
# be removed first or their content would be duplicated.
for stale in "$wav_scp" "$trans"; do
  if [[ -f "$stale" ]]; then
    rm -- "$stale" || die "$0: could not remove $stale"
  fi
done

while IFS= read -r reader_dir; do
  reader=${reader_dir##*/}
  is_integer "$reader" || die "$0: unexpected subdirectory name $reader"

  while IFS= read -r chapter_dir; do
    chapter=${chapter_dir##*/}
    is_integer "$chapter" \
      || die "$0: unexpected chapter-subdirectory name $chapter"

    # For every .flac under the chapter directory emit
    # "<name> <chapter-dir>/<name>.flac " (the trailing space is part of the
    # established output format). The name is the file's basename with a
    # literal, case-sensitive ".flac" suffix removed.
    find -L "$chapter_dir/" -iname "*.flac" | sort \
      | awk -v "dir=$chapter_dir" '{
          utt = $0
          sub(/^.*\//, "", utt)
          sub(/\.flac$/, "", utt)
          printf "%s %s/%s.flac \n", utt, dir, utt
        }' >>"$wav_scp" || exit 1

    chapter_trans=$chapter_dir/${reader}-${chapter}.trans.txt
    [[ -f "$chapter_trans" ]] \
      || die "$0: expected file $chapter_trans to exist"
    cat -- "$chapter_trans" >>"$trans" || exit 1
  done < <(list_subdirs "$reader_dir/")
done < <(list_subdirs "$src")

echo "$0: successfully prepared data in $dst"
exit 0