train.py 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178
  1. import argparse
  2. import logging
  3. import os
  4. import sys
  5. from io import BytesIO
  6. from collections.abc import Sequence
  7. import torch
  8. import hydra
  9. from omegaconf import DictConfig, OmegaConf
  10. from funasr.train_utils.set_all_random_seed import set_all_random_seed
  11. from funasr.models.lora.utils import mark_only_lora_as_trainable
  12. from funasr.optimizers import optim_classes
  13. from funasr.schedulers import scheduler_classes
  14. from funasr.train_utils.load_pretrained_model import load_pretrained_model
  15. from funasr.train_utils.initialize import initialize
  16. # from funasr.tokenizer.build_tokenizer import build_tokenizer
  17. # from funasr.tokenizer.token_id_converter import TokenIDConverter
  18. # from funasr.tokenizer.funtoken import build_tokenizer
  19. from funasr.train_utils.trainer import Trainer
  20. import torch.distributed as dist
  21. from torch.nn.parallel import DistributedDataParallel as DDP
  22. from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
  23. from funasr.download.download_from_hub import download_model
  24. from funasr.register import tables
  25. @hydra.main(config_name=None, version_base=None)
  26. def main_hydra(kwargs: DictConfig):
  27. if kwargs.get("debug", False):
  28. import pdb; pdb.set_trace()
  29. assert "model" in kwargs
  30. if "model_conf" not in kwargs:
  31. logging.info("download models from model hub: {}".format(kwargs.get("model_hub", "ms")))
  32. kwargs = download_model(is_training=kwargs.get("is_training", True), **kwargs)
  33. main(**kwargs)
  34. def main(**kwargs):
  35. # preprocess_config(kwargs)
  36. # import pdb; pdb.set_trace()
  37. # set random seed
  38. tables.print()
  39. set_all_random_seed(kwargs.get("seed", 0))
  40. torch.backends.cudnn.enabled = kwargs.get("cudnn_enabled", torch.backends.cudnn.enabled)
  41. torch.backends.cudnn.benchmark = kwargs.get("cudnn_benchmark", torch.backends.cudnn.benchmark)
  42. torch.backends.cudnn.deterministic = kwargs.get("cudnn_deterministic", True)
  43. local_rank = int(os.environ.get('LOCAL_RANK', 0))
  44. # Check if we are using DDP or FSDP
  45. use_ddp = 'WORLD_SIZE' in os.environ and int(os.environ["WORLD_SIZE"]) > 1
  46. use_fsdp = kwargs.get("use_fsdp", None)
  47. if use_ddp or use_fsdp:
  48. dist.init_process_group(backend=kwargs.get("backend", "nccl"), init_method='env://')
  49. torch.cuda.set_device(local_rank)
  50. # save config.yaml
  51. if (use_ddp or use_fsdp) and dist.get_rank() == 0 or not (use_ddp or use_fsdp) and local_rank == 0:
  52. os.makedirs(kwargs.get("output_dir", "./"), exist_ok=True)
  53. yaml_file = os.path.join(kwargs.get("output_dir", "./"), "config.yaml")
  54. OmegaConf.save(config=kwargs, f=yaml_file)
  55. logging.info("config.yaml is saved to: %s", yaml_file)
  56. tokenizer = kwargs.get("tokenizer", None)
  57. if tokenizer is not None:
  58. tokenizer_class = tables.tokenizer_classes.get(tokenizer)
  59. tokenizer = tokenizer_class(**kwargs["tokenizer_conf"])
  60. kwargs["tokenizer"] = tokenizer
  61. # build frontend if frontend is none None
  62. frontend = kwargs.get("frontend", None)
  63. if frontend is not None:
  64. frontend_class = tables.frontend_classes.get(frontend)
  65. frontend = frontend_class(**kwargs["frontend_conf"])
  66. kwargs["frontend"] = frontend
  67. kwargs["input_size"] = frontend.output_size()
  68. # import pdb;
  69. # pdb.set_trace()
  70. # build model
  71. model_class = tables.model_classes.get(kwargs["model"])
  72. model = model_class(**kwargs, **kwargs["model_conf"], vocab_size=len(tokenizer.token_list))
  73. # init_param
  74. init_param = kwargs.get("init_param", None)
  75. if init_param is not None:
  76. if not isinstance(init_param, (list, tuple)):
  77. init_param = (init_param,)
  78. logging.info("init_param is not None: %s", init_param)
  79. for p in init_param:
  80. logging.info(f"Loading pretrained params from {p}")
  81. load_pretrained_model(
  82. model=model,
  83. init_param=p,
  84. ignore_init_mismatch=kwargs.get("ignore_init_mismatch", True),
  85. oss_bucket=kwargs.get("oss_bucket", None),
  86. )
  87. else:
  88. initialize(model, kwargs.get("init", "kaiming_normal"))
  89. # freeze_param
  90. freeze_param = kwargs.get("freeze_param", None)
  91. if freeze_param is not None:
  92. freeze_param = eval(freeze_param)
  93. if isinstance(freeze_param, Sequence):
  94. freeze_param = (freeze_param,)
  95. logging.info("freeze_param is not None: %s", freeze_param)
  96. for t in freeze_param:
  97. for k, p in model.named_parameters():
  98. if k.startswith(t + ".") or k == t:
  99. logging.info(f"Setting {k}.requires_grad = False")
  100. p.requires_grad = False
  101. if use_ddp:
  102. model = model.cuda(local_rank)
  103. model = DDP(model, device_ids=[local_rank],
  104. find_unused_parameters=kwargs.get("train_conf", {}).get("find_unused_parameters", False))
  105. elif use_fsdp:
  106. model = FSDP(model).cuda(local_rank)
  107. else:
  108. model = model.to(device=kwargs.get("device", "cuda"))
  109. # optim
  110. optim = kwargs.get("optim", "adam")
  111. assert optim in optim_classes
  112. optim_class = optim_classes.get(optim)
  113. optim = optim_class(model.parameters(), **kwargs.get("optim_conf"))
  114. # scheduler
  115. scheduler = kwargs.get("scheduler", "warmuplr")
  116. assert scheduler in scheduler_classes
  117. scheduler_class = scheduler_classes.get(scheduler)
  118. scheduler = scheduler_class(optim, **kwargs.get("scheduler_conf"))
  119. # import pdb;
  120. # pdb.set_trace()
  121. # dataset
  122. dataset_class = tables.dataset_classes.get(kwargs.get("dataset", "AudioDataset"))
  123. dataset_tr = dataset_class(kwargs.get("train_data_set_list"), frontend=frontend, tokenizer=tokenizer, **kwargs.get("dataset_conf"))
  124. # dataloader
  125. batch_sampler = kwargs["dataset_conf"].get("batch_sampler", "DynamicBatchLocalShuffleSampler")
  126. batch_sampler_class = tables.batch_sampler_classes.get(batch_sampler)
  127. if batch_sampler is not None:
  128. batch_sampler = batch_sampler_class(dataset_tr, **kwargs.get("dataset_conf"))
  129. dataloader_tr = torch.utils.data.DataLoader(dataset_tr,
  130. collate_fn=dataset_tr.collator,
  131. batch_sampler=batch_sampler,
  132. num_workers=kwargs.get("dataset_conf").get("num_workers", 4),
  133. pin_memory=True)
  134. trainer = Trainer(
  135. model=model,
  136. optim=optim,
  137. scheduler=scheduler,
  138. dataloader_train=dataloader_tr,
  139. dataloader_val=None,
  140. local_rank=local_rank,
  141. use_ddp=use_ddp,
  142. use_fsdp=use_fsdp,
  143. **kwargs.get("train_conf"),
  144. )
  145. trainer.run()
  146. if use_ddp or use_fsdp:
  147. torch.distributed.destroy_process_group()
  148. if __name__ == "__main__":
  149. main_hydra()