train.py

import os
import sys
import torch
import hydra
import logging
import argparse
from io import BytesIO
import torch.distributed as dist
from collections.abc import Sequence
from omegaconf import DictConfig, OmegaConf
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

from funasr.register import tables
from funasr.optimizers import optim_classes
from funasr.train_utils.trainer import Trainer
from funasr.schedulers import scheduler_classes
from funasr.train_utils.initialize import initialize
from funasr.download.download_from_hub import download_model
from funasr.models.lora.utils import mark_only_lora_as_trainable
from funasr.train_utils.set_all_random_seed import set_all_random_seed
from funasr.train_utils.load_pretrained_model import load_pretrained_model

# from funasr.tokenizer.build_tokenizer import build_tokenizer
# from funasr.tokenizer.token_id_converter import TokenIDConverter
# from funasr.tokenizer.funtoken import build_tokenizer
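

# Entry point: hydra turns the command line / YAML config into a DictConfig.
# If the config carries no "model_conf", the full configuration is first
# fetched from the model hub (model_hub defaults to "ms", i.e. ModelScope)
# via download_model().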
@hydra.main(config_name=None, version_base=None)
def main_hydra(kwargs: DictConfig):
    if kwargs.get("debug", False):
        import pdb; pdb.set_trace()

    assert "model" in kwargs
    if "model_conf" not in kwargs:
        logging.info("download models from model hub: {}".format(kwargs.get("model_hub", "ms")))
        kwargs = download_model(is_training=kwargs.get("is_training", True), **kwargs)

    main(**kwargs)
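

# Training driver: builds tokenizer, frontend, model, optimizer, scheduler,
# dataset and dataloader from the registry tables, then hands everything to
# the Trainer.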
def main(**kwargs):
    tables.print()

    # set random seed
    set_all_random_seed(kwargs.get("seed", 0))
    torch.backends.cudnn.enabled = kwargs.get("cudnn_enabled", torch.backends.cudnn.enabled)
    torch.backends.cudnn.benchmark = kwargs.get("cudnn_benchmark", torch.backends.cudnn.benchmark)
    torch.backends.cudnn.deterministic = kwargs.get("cudnn_deterministic", True)

    local_rank = int(os.environ.get("LOCAL_RANK", 0))
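
    # WORLD_SIZE / LOCAL_RANK are set by the launcher (e.g. torchrun); a
    # single-process run falls back to local_rank 0 and skips process-group init.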
    # Check if we are using DDP or FSDP
    use_ddp = "WORLD_SIZE" in os.environ and int(os.environ["WORLD_SIZE"]) > 1
    use_fsdp = kwargs.get("use_fsdp", None)
    if use_ddp or use_fsdp:
        dist.init_process_group(backend=kwargs.get("backend", "nccl"), init_method="env://")
        torch.cuda.set_device(local_rank)

    # save config.yaml
    if ((use_ddp or use_fsdp) and dist.get_rank() == 0) or (not (use_ddp or use_fsdp) and local_rank == 0):
        os.makedirs(kwargs.get("output_dir", "./"), exist_ok=True)
        yaml_file = os.path.join(kwargs.get("output_dir", "./"), "config.yaml")
        OmegaConf.save(config=kwargs, f=yaml_file)
        logging.info("config.yaml is saved to: %s", yaml_file)
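
    # build tokenizer: the class is looked up by name in the registry and
    # configured from tokenizer_conf; optional, but needed below when
    # vocab_size is derived from its token list.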
    tokenizer = kwargs.get("tokenizer", None)
    if tokenizer is not None:
        tokenizer_class = tables.tokenizer_classes.get(tokenizer)
        tokenizer = tokenizer_class(**kwargs["tokenizer_conf"])
        kwargs["tokenizer"] = tokenizer

    # build frontend if frontend is not None
    frontend = kwargs.get("frontend", None)
    if frontend is not None:
        frontend_class = tables.frontend_classes.get(frontend)
        frontend = frontend_class(**kwargs["frontend_conf"])
        kwargs["frontend"] = frontend
        kwargs["input_size"] = frontend.output_size()
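
    # NOTE: model construction below assumes a tokenizer was built, since
    # vocab_size is read from tokenizer.token_list.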
    # build model
    model_class = tables.model_classes.get(kwargs["model"])
    model = model_class(**kwargs, **kwargs["model_conf"], vocab_size=len(tokenizer.token_list))
    # init_param
    init_param = kwargs.get("init_param", None)
    if init_param is not None:
        if not isinstance(init_param, (list, tuple)):
            init_param = (init_param,)
        logging.info("init_param is not None: %s", init_param)
        for p in init_param:
            logging.info(f"Loading pretrained params from {p}")
            load_pretrained_model(
                model=model,
                init_param=p,
                ignore_init_mismatch=kwargs.get("ignore_init_mismatch", True),
                oss_bucket=kwargs.get("oss_bucket", None),
            )
    else:
        initialize(model, kwargs.get("init", "kaiming_normal"))
    # freeze_param: parameter names (or name prefixes) to exclude from training
    freeze_param = kwargs.get("freeze_param", None)
    if freeze_param is not None:
        if isinstance(freeze_param, str):
            freeze_param = eval(freeze_param)
        if isinstance(freeze_param, str) or not isinstance(freeze_param, Sequence):
            freeze_param = (freeze_param,)
        logging.info("freeze_param is not None: %s", freeze_param)
        for t in freeze_param:
            for k, p in model.named_parameters():
                if k.startswith(t + ".") or k == t:
                    logging.info(f"Setting {k}.requires_grad = False")
                    p.requires_grad = False
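
    # move the model to the GPU and wrap it for distributed training if requested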
    if use_ddp:
        model = model.cuda(local_rank)
        model = DDP(
            model,
            device_ids=[local_rank],
            find_unused_parameters=kwargs.get("train_conf", {}).get("find_unused_parameters", False),
        )
    elif use_fsdp:
        model = FSDP(model).cuda(local_rank)
    else:
        model = model.to(device=kwargs.get("device", "cuda"))
    # optim
    optim = kwargs.get("optim", "adam")
    assert optim in optim_classes
    optim_class = optim_classes.get(optim)
    optim = optim_class(model.parameters(), **kwargs.get("optim_conf"))

    # scheduler
    scheduler = kwargs.get("scheduler", "warmuplr")
    assert scheduler in scheduler_classes
    scheduler_class = scheduler_classes.get(scheduler)
    scheduler = scheduler_class(optim, **kwargs.get("scheduler_conf"))
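
    # Both the training dataset and the batch sampler are resolved by name from
    # the registry; batching itself is delegated to the dataset's own collator.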
    # dataset
    dataset_class = tables.dataset_classes.get(kwargs.get("dataset", "AudioDataset"))
    dataset_tr = dataset_class(
        kwargs.get("train_data_set_list"), frontend=frontend, tokenizer=tokenizer, **kwargs.get("dataset_conf")
    )

    # dataloader
    batch_sampler = kwargs["dataset_conf"].get("batch_sampler", "DynamicBatchLocalShuffleSampler")
    if batch_sampler is not None:
        batch_sampler_class = tables.batch_sampler_classes.get(batch_sampler)
        batch_sampler = batch_sampler_class(dataset_tr, **kwargs.get("dataset_conf"))
    dataloader_tr = torch.utils.data.DataLoader(
        dataset_tr,
        collate_fn=dataset_tr.collator,
        batch_sampler=batch_sampler,
        num_workers=kwargs.get("dataset_conf").get("num_workers", 4),
        pin_memory=True,
    )
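
    # everything in train_conf is passed straight through as Trainer keyword arguments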
    trainer = Trainer(
        model=model,
        optim=optim,
        scheduler=scheduler,
        dataloader_train=dataloader_tr,
        dataloader_val=None,
        local_rank=local_rank,
        use_ddp=use_ddp,
        use_fsdp=use_fsdp,
        **kwargs.get("train_conf"),
    )
    trainer.run()

    if use_ddp or use_fsdp:
        dist.destroy_process_group()


if __name__ == "__main__":
    main_hydra()
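

# Example launch (a sketch, not the only supported invocation): the "++key=value"
# hydra overrides map onto the kwargs read above; the model name and data list
# below are placeholders that depend on your FunASR setup.
#
#   torchrun --nproc_per_node=2 train.py \
#       ++model=<model_name_or_hub_id> \
#       ++train_data_set_list=<train_data.jsonl> \
#       ++output_dir=./exp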