ema_module.py 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132
  1. # Copyright (c) Facebook, Inc. and its affiliates.
  2. #
  3. # This source code is licensed under the MIT license found in the
  4. # LICENSE file in the root directory of this source tree.
  5. """
  6. Used for EMA tracking a given pytorch module. The user is responsible for calling step()
  7. and setting the appropriate decay
  8. """
  9. import copy
  10. import logging
  11. import torch
  12. class EMAModule:
  13. """Exponential Moving Average of Fairseq Models"""
  14. def __init__(self, model, ema_decay=0.9999, ema_fp32=False, device=None, skip_keys=None):
  15. """
  16. @param model model to initialize the EMA with
  17. @param config EMAConfig object with configuration like
  18. ema_decay, ema_update_freq, ema_fp32
  19. @param device If provided, copy EMA to this device (e.g. gpu).
  20. Otherwise EMA is in the same device as the model.
  21. """
  22. self.decay = ema_decay
  23. self.ema_fp32 = ema_fp32
  24. self.model = copy.deepcopy(model)
  25. self.model.requires_grad_(False)
  26. self.skip_keys = skip_keys or set()
  27. self.fp32_params = {}
  28. if device is not None:
  29. logging.info(f"Copying EMA model to device {device}")
  30. self.model = self.model.to(device=device)
  31. if self.ema_fp32:
  32. self.build_fp32_params()
  33. self.update_freq_counter = 0
  34. def build_fp32_params(self, state_dict=None):
  35. """
  36. Store a copy of the EMA params in fp32.
  37. If state dict is passed, the EMA params is copied from
  38. the provided state dict. Otherwise, it is copied from the
  39. current EMA model parameters.
  40. """
  41. if not self.ema_fp32:
  42. raise RuntimeError(
  43. "build_fp32_params should not be called if ema_fp32=False. "
  44. "Use ema_fp32=True if this is really intended."
  45. )
  46. if state_dict is None:
  47. state_dict = self.model.state_dict()
  48. def _to_float(t):
  49. return t.float() if torch.is_floating_point(t) else t
  50. for param_key in state_dict:
  51. if param_key in self.fp32_params:
  52. self.fp32_params[param_key].copy_(state_dict[param_key])
  53. else:
  54. self.fp32_params[param_key] = _to_float(state_dict[param_key])
  55. def restore(self, state_dict, build_fp32_params=False):
  56. """Load data from a model spec into EMA model"""
  57. self.model.load_state_dict(state_dict, strict=False)
  58. if build_fp32_params:
  59. self.build_fp32_params(state_dict)
  60. def set_decay(self, decay):
  61. self.decay = decay
  62. def get_decay(self):
  63. return self.decay
  64. def _step_internal(self, new_model):
  65. """One update of the EMA model based on new model weights"""
  66. decay = self.decay
  67. ema_state_dict = {}
  68. ema_params = (
  69. self.fp32_params if self.ema_fp32 else self.model.state_dict()
  70. )
  71. for key, param in new_model.state_dict().items():
  72. if isinstance(param, dict):
  73. continue
  74. try:
  75. ema_param = ema_params[key]
  76. except KeyError:
  77. ema_param = (
  78. param.float().clone() if param.ndim == 1 else copy.deepcopy(param)
  79. )
  80. if param.shape != ema_param.shape:
  81. raise ValueError(
  82. "incompatible tensor shapes between model param and ema param"
  83. + "{} vs. {}".format(param.shape, ema_param.shape)
  84. )
  85. if "version" in key:
  86. # Do not decay a model.version pytorch param
  87. continue
  88. if key in self.skip_keys or ("num_batches_tracked" in key and ema_param.dtype == torch.int64):
  89. ema_param = param.to(dtype=ema_param.dtype).clone()
  90. ema_params[key].copy_(ema_param)
  91. else:
  92. ema_param.mul_(decay)
  93. ema_param.add_(param.to(dtype=ema_param.dtype), alpha=1 - decay)
  94. ema_state_dict[key] = ema_param
  95. self.restore(ema_state_dict, build_fp32_params=False)
  96. def step(self, new_model):
  97. self._step_internal(new_model)
  98. def reverse(self, model):
  99. """
  100. Load the model parameters from EMA model.
  101. Useful for inference or fine-tuning from the EMA model.
  102. """
  103. d = self.model.state_dict()
  104. if "_ema" in d:
  105. del d["_ema"]
  106. model.load_state_dict(d, strict=False)
  107. return model