diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index c2a44b6ff08..ca8263cb3f0 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -151,7 +151,6 @@ from .utils import (
     check_torch_load_is_safe,
     find_labels,
     is_accelerate_available,
-    is_apex_available,
     is_apollo_torch_available,
     is_bitsandbytes_available,
     is_datasets_available,
@@ -191,9 +190,6 @@ if is_in_notebook():
 
     DEFAULT_PROGRESS_CALLBACK = NotebookProgressCallback
 
-if is_apex_available():
-    from apex import amp
-
 if is_datasets_available():
     import datasets
 
@@ -761,11 +757,6 @@ class Trainer:
                 self.use_cpu_amp = True
                 self.amp_dtype = torch.bfloat16
             elif args.half_precision_backend == "apex":
-                if not is_apex_available():
-                    raise ImportError(
-                        "Using FP16 with APEX but APEX is not installed, please refer to"
-                        " https://www.github.com/nvidia/apex."
-                    )
                 self.use_apex = True
 
         # Label smoothing
@@ -1992,6 +1983,8 @@ class Trainer:
 
         # Mixed precision training with apex
         if self.use_apex and training:
+            from apex import amp
+
             model, self.optimizer = amp.initialize(model, self.optimizer, opt_level=self.args.fp16_opt_level)
 
         # Multi-gpu training (should be after apex fp16 initialization) / 8bit models does not support DDP
@@ -2579,6 +2572,8 @@ class Trainer:
                         if is_sagemaker_mp_enabled() and args.fp16:
                             _grad_norm = self.optimizer.clip_master_grads(args.max_grad_norm)
                         elif self.use_apex:
+                            from apex import amp
+
                             # Revert to normal clipping otherwise, handling Apex or full precision
                             _grad_norm = nn.utils.clip_grad_norm_(
                                 amp.master_params(self.optimizer),
@@ -3776,6 +3771,8 @@ class Trainer:
             loss = loss.mean()  # mean() to average on multi-gpu parallel training
 
         if self.use_apex:
+            from apex import amp
+
             with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                 scaled_loss.backward()
         else:
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index 65ce588b7d2..2d58e26af31 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -39,6 +39,7 @@ from .utils import (
     ExplicitEnum,
     cached_property,
     is_accelerate_available,
+    is_apex_available,
     is_ipex_available,
     is_safetensors_available,
     is_sagemaker_dp_enabled,
@@ -1702,6 +1703,19 @@ class TrainingArguments:
             if self.half_precision_backend == "apex":
                 raise ValueError(" `--half_precision_backend apex`: GPU bf16 is not supported by apex.")
 
+        if self.half_precision_backend == "apex":
+            if not is_apex_available():
+                raise ImportError(
+                    "Using FP16 with APEX but APEX is not installed, please refer to"
+                    " https://www.github.com/nvidia/apex."
+                )
+            try:
+                from apex import amp  # noqa: F401
+            except ImportError as e:
+                raise ImportError(
+                    f"apex.amp is deprecated in the latest version of apex, causing this error {e}. Either revert to an older version or use pytorch amp by setting half_precision_backend='auto' instead. See https://github.com/NVIDIA/apex/pull/1896 "
+                )
+
         if self.lr_scheduler_type == SchedulerType.REDUCE_ON_PLATEAU:
             if self.eval_strategy == IntervalStrategy.NO:
                 raise ValueError("lr_scheduler_type reduce_lr_on_plateau requires an eval strategy")
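
For context, the diff applies a common pattern for optional dependencies: validate availability once when the arguments are built, and defer the actual `from apex import amp` to the call sites so that importing the trainer module never requires apex. Below is a minimal, self-contained sketch of that pattern, not the transformers implementation; `check_apex`, `HypotheticalTrainerArgs`, and `wrap_model` are illustrative names, and the error texts mirror the ones in the diff.

# Sketch of the lazy-import pattern used in the diff: the optional `apex`
# dependency is checked up front and imported only where it is used.
import importlib.util
from dataclasses import dataclass


def check_apex() -> None:
    """Fail fast if the apex backend is requested but unusable (hypothetical helper)."""
    if importlib.util.find_spec("apex") is None:
        raise ImportError(
            "Using FP16 with APEX but APEX is not installed, please refer to"
            " https://www.github.com/nvidia/apex."
        )
    try:
        # apex.amp was removed from recent apex builds, so its presence must be
        # verified separately from the package itself.
        from apex import amp  # noqa: F401
    except ImportError as e:
        raise ImportError(
            "apex.amp is unavailable in this apex build; revert to an older apex "
            "or use half_precision_backend='auto' instead."
        ) from e


@dataclass
class HypotheticalTrainerArgs:
    half_precision_backend: str = "auto"

    def __post_init__(self):
        # Validation happens once, at argument-construction time ...
        if self.half_precision_backend == "apex":
            check_apex()


def wrap_model(model, optimizer, args: HypotheticalTrainerArgs):
    # ... while the import itself is deferred to the call site, so importing
    # this module works even when apex is not installed.
    if args.half_precision_backend == "apex":
        from apex import amp

        return amp.initialize(model, optimizer, opt_level="O1")
    return model, optimizer

With this split, a user who never sets half_precision_backend="apex" is unaffected by apex being absent or having dropped apex.amp, while a user who does request it gets the actionable error at configuration time rather than an import failure deep inside training.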