s3prl.py 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143
  1. import copy
  2. import logging
  3. import os
  4. from argparse import Namespace
  5. from typing import Optional
  6. from typing import Tuple
  7. from typing import Union
  8. import humanfriendly
  9. import torch
  10. from typeguard import check_argument_types
  11. from funasr.models.frontend.abs_frontend import AbsFrontend
  12. from funasr.modules.frontends.frontend import Frontend
  13. from funasr.modules.nets_utils import pad_list
  14. from funasr.utils.get_default_kwargs import get_default_kwargs
  15. def base_s3prl_setup(args):
  16. args.upstream_feature_selection = getattr(args, "upstream_feature_selection", None)
  17. args.upstream_model_config = getattr(args, "upstream_model_config", None)
  18. args.upstream_refresh = getattr(args, "upstream_refresh", False)
  19. args.upstream_ckpt = getattr(args, "upstream_ckpt", None)
  20. args.init_ckpt = getattr(args, "init_ckpt", None)
  21. args.verbose = getattr(args, "verbose", False)
  22. args.tile_factor = getattr(args, "tile_factor", 1)
  23. return args
class S3prlFrontend(AbsFrontend):
    """Speech Pretrained Representation frontend structure for ASR.

    Wraps an S3PRL upstream model (loaded via ``torch.hub`` from a local
    s3prl checkout found on ``PYTHONPATH``) together with an S3PRL
    ``Featurizer`` that turns the upstream's hidden states into a single
    feature sequence usable by downstream ASR models.
    """

    def __init__(
        self,
        fs: Union[int, str] = 16000,
        # NOTE(review): this default is evaluated once at import time and is
        # a shared dict — callers should pass their own dict and must not
        # mutate the default.
        frontend_conf: Optional[dict] = get_default_kwargs(Frontend),
        download_dir: str = None,
        multilayer_feature: bool = False,
    ):
        assert check_argument_types()
        super().__init__()
        if isinstance(fs, str):
            # Accept human-friendly sizes such as "16k".
            fs = humanfriendly.parse_size(fs)
        if download_dir is not None:
            # Redirect torch.hub's cache/checkpoint directory.
            torch.hub.set_dir(download_dir)
        self.multilayer_feature = multilayer_feature
        self.upstream, self.featurizer = self._get_upstream(frontend_conf)
        # Snapshot of the pretrained upstream weights so they can be
        # restored later via reload_pretrained_parameters().
        self.pretrained_params = copy.deepcopy(self.upstream.state_dict())
        self.output_dim = self.featurizer.output_dim
        self.frontend_type = "s3prl"
        # Upstream downsampling rate (input samples per output frame);
        # "key" is passed through to s3prl's get_downsample_rates().
        self.hop_length = self.upstream.get_downsample_rates("key")

    def _get_upstream(self, frontend_conf):
        """Get S3PRL upstream model.

        Builds the upstream from ``frontend_conf`` (must contain at least
        an ``upstream`` name) and pairs it with an S3PRL ``Featurizer``.
        Returns ``(upstream, featurizer)``, both on CPU.
        """
        s3prl_args = base_s3prl_setup(
            Namespace(**frontend_conf, device="cpu"),
        )
        self.args = s3prl_args
        # Locate a local s3prl checkout on PYTHONPATH; torch.hub then loads
        # the upstream from that directory with source="local".
        s3prl_path = None
        python_path_list = os.environ.get("PYTHONPATH", "(None)").split(":")
        for p in python_path_list:
            if p.endswith("s3prl"):
                s3prl_path = p
                break
        # NOTE(review): assert is stripped under `python -O`; an explicit
        # RuntimeError would be a more robust failure mode here.
        assert s3prl_path is not None
        s3prl_upstream = torch.hub.load(
            s3prl_path,
            s3prl_args.upstream,
            ckpt=s3prl_args.upstream_ckpt,
            model_config=s3prl_args.upstream_model_config,
            refresh=s3prl_args.upstream_refresh,
            source="local",
        ).to("cpu")
        # Disable LayerDrop on fairseq-style wav2vec2/HuBERT upstreams so
        # feature extraction is deterministic.
        if getattr(
            s3prl_upstream, "model", None
        ) is not None and s3prl_upstream.model.__class__.__name__ in [
            "Wav2Vec2Model",
            "HubertModel",
        ]:
            s3prl_upstream.model.encoder.layerdrop = 0.0
        # Imported lazily so s3prl is only required when this frontend is
        # actually instantiated.
        from s3prl.upstream.interfaces import Featurizer
        # NOTE(review): multilayer_feature is a bool defaulting to False, so
        # `is None` is never true and feature_selection is always
        # "hidden_states". This looks like it should read
        # `if not self.multilayer_feature`; confirm before changing, since
        # existing checkpoints may depend on the current behavior.
        if self.multilayer_feature is None:
            feature_selection = "last_hidden_state"
        else:
            feature_selection = "hidden_states"
        s3prl_featurizer = Featurizer(
            upstream=s3prl_upstream,
            feature_selection=feature_selection,
            upstream_device="cpu",
        )
        return s3prl_upstream, s3prl_featurizer

    def _tile_representations(self, feature):
        """Tile up the representations by `tile_factor`.

        Input - sequence of representations
            shape: (batch_size, seq_len, feature_dim)
        Output - sequence of tiled representations
            shape: (batch_size, seq_len * factor, feature_dim)
        """
        assert (
            len(feature.shape) == 3
        ), "Input argument `feature` has invalid shape: {}".format(feature.shape)
        # Repeat along the feature axis, then fold the repeats into the
        # time axis so each frame appears tile_factor times consecutively.
        tiled_feature = feature.repeat(1, 1, self.args.tile_factor)
        tiled_feature = tiled_feature.reshape(
            feature.size(0), feature.size(1) * self.args.tile_factor, feature.size(2)
        )
        return tiled_feature

    def output_size(self) -> int:
        """Return the feature dimension produced by the featurizer."""
        return self.output_dim

    def forward(
        self, input: torch.Tensor, input_lengths: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Extract padded S3PRL features from a batch of padded waveforms.

        Returns ``(input_feats, feats_lens)`` where ``input_feats`` is a
        zero-padded (batch, time, dim) tensor and ``feats_lens`` holds the
        per-utterance frame counts.
        """
        # Strip padding so the upstream sees only the real samples.
        wavs = [wav[: input_lengths[i]] for i, wav in enumerate(input)]
        # The upstream is run frozen (eval + no_grad); only the featurizer
        # below may participate in autograd.
        self.upstream.eval()
        with torch.no_grad():
            feats = self.upstream(wavs)
        feats = self.featurizer(wavs, feats)
        if self.args.tile_factor != 1:
            feats = self._tile_representations(feats)
        input_feats = pad_list(feats, 0.0)
        feats_lens = torch.tensor([f.shape[0] for f in feats], dtype=torch.long)
        # Saving CUDA Memory
        del feats
        return input_feats, feats_lens

    def reload_pretrained_parameters(self):
        """Restore the upstream weights captured at construction time."""
        self.upstream.load_state_dict(self.pretrained_params)
        logging.info("Pretrained S3PRL frontend model parameters reloaded!")