diff --git a/examples/seq2seq/builtin_trainer/finetune.sh b/examples/seq2seq/builtin_trainer/finetune.sh
index 8c2d13d5adf..65f207c21a3 100644
--- a/examples/seq2seq/builtin_trainer/finetune.sh
+++ b/examples/seq2seq/builtin_trainer/finetune.sh
@@ -3,8 +3,7 @@
 python finetune_trainer.py \
     --learning_rate=3e-5 \
     --fp16 \
-    --do_train --do_eval --do_predict \
-    --evaluation_strategy steps \
+    --do_train --do_eval --do_predict --evaluate_during_training \
     --predict_with_generate \
     --n_val 1000 \
     "$@"
diff --git a/examples/seq2seq/builtin_trainer/finetune_tpu.sh b/examples/seq2seq/builtin_trainer/finetune_tpu.sh
index 577f99fc7a2..8bd367c852d 100644
--- a/examples/seq2seq/builtin_trainer/finetune_tpu.sh
+++ b/examples/seq2seq/builtin_trainer/finetune_tpu.sh
@@ -5,8 +5,7 @@ export TPU_NUM_CORES=8
 python xla_spawn.py --num_cores $TPU_NUM_CORES \
     finetune_trainer.py \
     --learning_rate=3e-5 \
-    --do_train --do_eval \
-    --evaluation_strategy steps \
+    --do_train --do_eval --evaluate_during_training \
     --prediction_loss_only \
     --n_val 1000 \
     "$@"
diff --git a/examples/seq2seq/builtin_trainer/train_distil_marian_enro.sh b/examples/seq2seq/builtin_trainer/train_distil_marian_enro.sh
index 10c809b0e3a..1503e821a84 100644
--- a/examples/seq2seq/builtin_trainer/train_distil_marian_enro.sh
+++ b/examples/seq2seq/builtin_trainer/train_distil_marian_enro.sh
@@ -16,8 +16,7 @@ python finetune_trainer.py \
     --num_train_epochs=6 \
     --save_steps 3000 --eval_steps 3000 \
     --max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \
-    --do_train --do_eval --do_predict \
-    --evaluation_strategy steps \
+    --do_train --do_eval --do_predict --evaluate_during_training \
     --predict_with_generate --logging_first_step \
     --task translation --label_smoothing 0.1 \
     "$@"
diff --git a/examples/seq2seq/builtin_trainer/train_distil_marian_enro_tpu.sh b/examples/seq2seq/builtin_trainer/train_distil_marian_enro_tpu.sh
index 098425d65f1..ca9a57fa432 100644
--- a/examples/seq2seq/builtin_trainer/train_distil_marian_enro_tpu.sh
+++ b/examples/seq2seq/builtin_trainer/train_distil_marian_enro_tpu.sh
@@ -17,8 +17,7 @@ python xla_spawn.py --num_cores $TPU_NUM_CORES \
     --save_steps 500 --eval_steps 500 \
     --logging_first_step --logging_steps 200 \
     --max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \
-    --do_train --do_eval \
-    --evaluation_strategy steps \
+    --do_train --do_eval --evaluate_during_training \
     --prediction_loss_only \
     --task translation --label_smoothing 0.1 \
     "$@"
diff --git a/examples/seq2seq/builtin_trainer/train_distilbart_cnn.sh b/examples/seq2seq/builtin_trainer/train_distilbart_cnn.sh
index d29f6b8037c..dbb85cbe1b8 100644
--- a/examples/seq2seq/builtin_trainer/train_distilbart_cnn.sh
+++ b/examples/seq2seq/builtin_trainer/train_distilbart_cnn.sh
@@ -19,7 +19,6 @@ python finetune_trainer.py \
     --save_steps 3000 --eval_steps 3000 \
     --logging_first_step \
     --max_target_length 56 --val_max_target_length $MAX_TGT_LEN --test_max_target_length $MAX_TGT_LEN \
-    --do_train --do_eval --do_predict \
-    --evaluation_strategy steps \
+    --do_train --do_eval --do_predict --evaluate_during_training \
     --predict_with_generate --sortish_sampler \
     "$@"
diff --git a/examples/seq2seq/builtin_trainer/train_mbart_cc25_enro.sh b/examples/seq2seq/builtin_trainer/train_mbart_cc25_enro.sh
index 3dc711f2035..7a2a5c72209 100644
--- a/examples/seq2seq/builtin_trainer/train_mbart_cc25_enro.sh
+++ b/examples/seq2seq/builtin_trainer/train_mbart_cc25_enro.sh
@@ -15,8 +15,7 @@ python finetune_trainer.py \
     --sortish_sampler \
     --num_train_epochs 6 \
     --save_steps 25000 --eval_steps 25000 --logging_steps 1000 \
-    --do_train --do_eval --do_predict \
-    --evaluation_strategy steps \
+    --do_train --do_eval --do_predict --evaluate_during_training \
     --predict_with_generate --logging_first_step \
     --task translation \
     "$@"
diff --git a/src/transformers/integrations.py b/src/transformers/integrations.py
index 4c813e1ce29..d14e6e7ce13 100644
--- a/src/transformers/integrations.py
+++ b/src/transformers/integrations.py
@@ -2,7 +2,6 @@
 import math
 import os
 
-from .trainer_utils import EvaluationStrategy
 from .utils import logging
 
@@ -213,13 +212,13 @@ def run_hp_search_ray(trainer, n_trials: int, direction: str, **kwargs) -> BestR
     # Check for `do_eval` and `eval_during_training` for schedulers that require intermediate reporting.
     if isinstance(
         kwargs["scheduler"], (ASHAScheduler, MedianStoppingRule, HyperBandForBOHB, PopulationBasedTraining)
-    ) and (not trainer.args.do_eval or trainer.args.evaluation_strategy == EvaluationStrategy.NO):
+    ) and (not trainer.args.do_eval or not trainer.args.evaluate_during_training):
         raise RuntimeError(
             "You are using {cls} as a scheduler but you haven't enabled evaluation during training. "
             "This means your trials will not report intermediate results to Ray Tune, and "
             "can thus not be stopped early or used to exploit other trials parameters. "
             "If this is what you want, do not use {cls}. If you would like to use {cls}, "
-            "make sure you pass `do_eval=True` and `evaluation_strategy='steps'` in the "
+            "make sure you pass `do_eval=True` and `evaluate_during_training=True` in the "
             "Trainer `args`.".format(cls=type(kwargs["scheduler"]).__name__)
         )
diff --git a/src/transformers/trainer_tf.py b/src/transformers/trainer_tf.py
index 162815dbc63..6275ceafe5a 100644
--- a/src/transformers/trainer_tf.py
+++ b/src/transformers/trainer_tf.py
@@ -19,7 +19,7 @@ from tensorflow.python.distribute.values import PerReplica
 from .modeling_tf_utils import TFPreTrainedModel
 from .optimization_tf import GradientAccumulator, create_optimizer
-from .trainer_utils import PREFIX_CHECKPOINT_DIR, EvalPrediction, EvaluationStrategy, PredictionOutput, set_seed
+from .trainer_utils import PREFIX_CHECKPOINT_DIR, EvalPrediction, PredictionOutput, set_seed
 from .training_args_tf import TFTrainingArguments
 from .utils import logging
@@ -561,7 +561,7 @@ class TFTrainer:
                     if (
                         self.args.eval_steps > 0
-                        and self.args.evaluate_strategy == EvaluationStrategy.STEPS
+                        and self.args.evaluate_during_training
                         and self.global_step % self.args.eval_steps == 0
                     ):
                         self.evaluate()
diff --git a/src/transformers/training_args_tf.py b/src/transformers/training_args_tf.py
index 2efe7a6becc..91890605da4 100644
--- a/src/transformers/training_args_tf.py
+++ b/src/transformers/training_args_tf.py
@@ -34,12 +34,8 @@ class TFTrainingArguments(TrainingArguments):
             Whether to run evaluation on the dev set or not.
         do_predict (:obj:`bool`, `optional`, defaults to :obj:`False`):
             Whether to run predictions on the test set or not.
-        evaluation_strategy (:obj:`str` or :class:`~transformers.trainer_utils.EvaluationStrategy`, `optional`, defaults to :obj:`"no"`):
-            The evaluation strategy to adopt during training. Possible values are:
-
-                * :obj:`"no"`: No evaluation is done during training.
-                * :obj:`"steps"`: Evaluation is done (and logged) every :obj:`eval_steps`.
-
+        evaluate_during_training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to run evaluation during training at each logging step or not.
         per_device_train_batch_size (:obj:`int`, `optional`, defaults to 8):
             The batch size per GPU/TPU core/CPU for training.
         per_device_eval_batch_size (:obj:`int`, `optional`, defaults to 8):
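Note: this patch restores the boolean `evaluate_during_training` flag in place of `evaluation_strategy`. As a quick reference, a minimal sketch of the restored API in use follows; the `output_dir` value and `eval_steps` setting are illustrative placeholders, not part of this patch:

    # Minimal sketch: enable periodic evaluation with the restored boolean flag.
    from transformers import TFTrainingArguments

    args = TFTrainingArguments(
        output_dir="./output",           # illustrative placeholder path
        do_train=True,
        do_eval=True,
        evaluate_during_training=True,   # run evaluation during training
        eval_steps=500,                  # illustrative evaluation interval
    )

With these arguments, `TFTrainer` calls `evaluate()` whenever `global_step` is a multiple of `eval_steps`, matching the condition restored in `trainer_tf.py` above; the same pair `do_eval=True` / `evaluate_during_training=True` is what the Ray Tune check in `integrations.py` now requires.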