[TF 2.2 compat] use tf.VariableAggregation.ONLY_FIRST_REPLICA (#4283)

* Fix the gradient accumulator so it runs properly with TF 2.2

* Apply style

* Fix training_args_tf for TF 2.2

* Fix the TF training args when only one GPU is available

* Remove the fixed version of TF in setup.py
Julien Plu 2020-05-11 17:28:37 +02:00 committed by GitHub
parent cffbb3d8ed
commit 94b57bf796
3 changed files with 14 additions and 6 deletions

setup.py

@@ -67,8 +67,8 @@ extras = {}
 extras["mecab"] = ["mecab-python3"]
 extras["sklearn"] = ["scikit-learn"]
-extras["tf"] = ["tensorflow<=2.1.0"]
-extras["tf-cpu"] = ["tensorflow-cpu<=2.1.0"]
+extras["tf"] = ["tensorflow"]
+extras["tf-cpu"] = ["tensorflow-cpu"]
 extras["torch"] = ["torch"]
 extras["serving"] = ["pydantic", "uvicorn", "fastapi", "starlette"]
@@ -81,7 +81,7 @@ extras["quality"] = [
     "isort @ git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort",
     "flake8",
 ]
-extras["dev"] = extras["testing"] + extras["quality"] + ["mecab-python3", "scikit-learn", "tensorflow<=2.1.0", "torch"]
+extras["dev"] = extras["testing"] + extras["quality"] + ["mecab-python3", "scikit-learn", "tensorflow", "torch"]
 setup(
     name="transformers",

src/transformers/optimization_tf.py

@@ -204,7 +204,10 @@ class GradientAccumulator(object):
         """Number of accumulated steps."""
         if self._accum_steps is None:
             self._accum_steps = tf.Variable(
-                tf.constant(0, dtype=tf.int64), trainable=False, synchronization=tf.VariableSynchronization.ON_READ,
+                tf.constant(0, dtype=tf.int64),
+                trainable=False,
+                synchronization=tf.VariableSynchronization.ON_READ,
+                aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
             )
         return self._accum_steps.value()
@@ -223,7 +226,10 @@ class GradientAccumulator(object):
         self._gradients.extend(
             [
                 tf.Variable(
-                    tf.zeros_like(gradient), trainable=False, synchronization=tf.VariableSynchronization.ON_READ,
+                    tf.zeros_like(gradient),
+                    trainable=False,
+                    synchronization=tf.VariableSynchronization.ON_READ,
+                    aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
                 )
                 for gradient in gradients
             ]
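
A minimal, self-contained sketch of the pattern these hunks adopt, assuming a MirroredStrategy scope (my illustration, not a test from the repository): with synchronization=ON_READ each replica keeps its own copy of the variable, and ONLY_FIRST_REPLICA makes cross-replica reads well-defined by returning the first replica's copy, which the commit ties to TF 2.2 compatibility:

import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()  # uses all visible GPUs, or CPU if none

with strategy.scope():
    # Same pattern as the accumulator's step counter above.
    accum_steps = tf.Variable(
        tf.constant(0, dtype=tf.int64),
        trainable=False,
        synchronization=tf.VariableSynchronization.ON_READ,
        aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
    )

def step_fn():
    # In replica context, ON_READ variables are updated locally per replica.
    accum_steps.assign_add(1)

strategy.run(step_fn)
# Cross-replica read: ONLY_FIRST_REPLICA returns the first replica's copy.
print(int(accum_steps.value()))

Since every replica advances the counter identically, taking the first replica's value is equivalent to any reduction and cheaper than performing one.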

src/transformers/training_args_tf.py

@@ -56,9 +56,11 @@ class TFTrainingArguments(TrainingArguments):
             strategy = tf.distribute.experimental.TPUStrategy(tpu)
         elif len(gpus) == 0:
             strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
+        elif len(gpus) == 1:
+            strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
         elif len(gpus) > 1:
             # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
-            strategy = tf.distribute.MirroredStrategy(gpus)
+            strategy = tf.distribute.MirroredStrategy()
         else:
             raise ValueError("Cannot find the proper strategy please check your environment properties.")
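
The selection logic after this patch, condensed into a standalone sketch (the pick_strategy helper name is mine, not the library's):

import tensorflow as tf

def pick_strategy() -> tf.distribute.Strategy:
    gpus = tf.config.list_physical_devices("GPU")
    if len(gpus) == 0:
        return tf.distribute.OneDeviceStrategy(device="/cpu:0")
    if len(gpus) == 1:
        # New single-GPU branch: skips MirroredStrategy's cross-replica
        # machinery when there is nothing to mirror.
        return tf.distribute.OneDeviceStrategy(device="/gpu:0")
    # No device list: MirroredStrategy() picks up all visible GPUs; restrict
    # the set with CUDA_VISIBLE_DEVICES rather than passing device objects.
    return tf.distribute.MirroredStrategy()

strategy = pick_strategy()
print("Replicas in sync:", strategy.num_replicas_in_sync)

Calling MirroredStrategy() with no arguments also avoids handing it the PhysicalDevice objects returned by tf.config.list_physical_devices, which are not the "/gpu:N" name strings its devices parameter expects.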