TF: purge TFTrainer (#28483)

2025-08-01 10:41:07 +06:00 · 2024-01-12 16:56:34 +00:00 · 2024-01-12 16:56:34 +00:00 · 4fb3d3a0f6
commit 4fb3d3a0f6
parent afc45b13ca
15 changed files with 233 additions and 1682 deletions
--- a/docs/source/en/main_classes/deepspeed.md
+++ b/docs/source/en/main_classes/deepspeed.md
@@ -2049,7 +2049,6 @@ In this case you usually need to raise the value of `initial_scale_power`. Setti

 ### Notes

- DeepSpeed works with the PyTorch [`Trainer`] but not TF [`TFTrainer`].
 - While DeepSpeed has a pip installable PyPI package, it is highly recommended that it gets installed from [source](https://github.com/microsoft/deepspeed#installation) to best match your hardware and also if you need to enable
  certain features, like 1-bit Adam, which aren't available in the pypi distribution.
 - You don't have to use the [`Trainer`] to use DeepSpeed with 🤗 Transformers - you can use any model
--- a/docs/source/it/migration.md
+++ b/docs/source/it/migration.md
@@ -166,13 +166,6 @@ Per quanto riguarda la classe `Trainer`:
 - Il metodo `is_local_master` di `Trainer` è deprecato a favore di `is_local_process_zero`.
 - Il metodo `is_world_master` di `Trainer` è deprecato a favore di `is_world_process_zero`.

-Per quanto riguarda la classe `TFTrainer`: 
- L'argomento `prediction_loss_only` di `TFTrainer` è stato rimosso a favore dell'argomento di classe `args.prediction_loss_only`. 
- Il metodo `_log` di `Trainer` è deprecato a favore di `log`. 
- Il metodo `_prediction_loop` di `TFTrainer` è deprecato a favore di `prediction_loop`. 
- Il metodo `_setup_wandb` di `TFTrainer` è deprecato a favore di `setup_wandb`. 
- Il metodo `_run_model` di `TFTrainer` è deprecato a favore di `run_model`. 
-
 Per quanto riguarda la classe `TrainingArguments`:
 - L'argomento `evaluate_during_training` di `TrainingArguments` è deprecato a favore di `evaluation_strategy`.

--- a/docs/source/ja/main_classes/deepspeed.md
+++ b/docs/source/ja/main_classes/deepspeed.md
@@ -1994,7 +1994,6 @@ SW: Model with 2783M total params, 65M largest layer params.

 ### Notes

- DeepSpeed は PyTorch [`Trainer`] では動作しますが、TF [`TFTrainer`] では動作しません。
 - DeepSpeed には pip でインストール可能な PyPI パッケージがありますが、ハードウェアに最も適合するように、また有効にする必要がある場合は、[ソース](https://github.com/microsoft/deepspeed#installation) からインストールすることを強くお勧めします。
  1 ビット Adam などの特定の機能は、pypi ディストリビューションでは利用できません。
 - 🤗 Transformers で DeepSpeed を使用するために [`Trainer`] を使用する必要はありません - 任意のモデルを使用できます
--- a/docs/source/zh/main_classes/deepspeed.md
+++ b/docs/source/zh/main_classes/deepspeed.md
@@ -1845,7 +1845,6 @@ SW: Model with 2783M total params, 65M largest layer params.

 ### 注意事项

- DeepSpeed 与 PyTorch [`Trainer`] 一起工作，但不与 TF [`TFTrainer`] 一起工作。
 - 尽管 DeepSpeed 有一个可安装的 PyPI 包，但强烈建议从源代码安装它，以最好地匹配您的硬件，如果您需要启用某些功能，如 1-bit Adam，这些功能在 pypi 发行版中不可用。
 - 您不必使用🤗  Transformers的 [`Trainer`] 来使用 DeepSpeed   - 您可以使用任何模型与自己的训练器，您还需要根据 [DeepSpeed 集成说明](https://www.deepspeed.ai/getting-started/#writing-deepspeed-models) 调整后者。

--- a/examples/legacy/text-classification/run_tf_text_classification.py
+++ b/examples/legacy/text-classification/run_tf_text_classification.py
@@ -1,313 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Fine-tuning the library models for sequence classification."""
-
-
-import logging
-import os
-from dataclasses import dataclass, field
-from typing import Dict, Optional
-
-import datasets
-import numpy as np
-import tensorflow as tf
-
-from transformers import (
-    AutoConfig,
-    AutoTokenizer,
-    EvalPrediction,
-    HfArgumentParser,
-    PreTrainedTokenizer,
-    TFAutoModelForSequenceClassification,
-    TFTrainer,
-    TFTrainingArguments,
-)
-from transformers.utils import logging as hf_logging
-
-
-hf_logging.set_verbosity_info()
-hf_logging.enable_default_handler()
-hf_logging.enable_explicit_format()
-
-
-def get_tfds(
-    train_file: str,
-    eval_file: str,
-    test_file: str,
-    tokenizer: PreTrainedTokenizer,
-    label_column_id: int,
-    max_seq_length: Optional[int] = None,
-):
-    files = {}
-
-    if train_file is not None:
-        files[datasets.Split.TRAIN] = [train_file]
-    if eval_file is not None:
-        files[datasets.Split.VALIDATION] = [eval_file]
-    if test_file is not None:
-        files[datasets.Split.TEST] = [test_file]
-
-    ds = datasets.load_dataset("csv", data_files=files)
-    features_name = list(ds[list(files.keys())[0]].features.keys())
-    label_name = features_name.pop(label_column_id)
-    label_list = list(set(ds[list(files.keys())[0]][label_name]))
-    label2id = {label: i for i, label in enumerate(label_list)}
-    input_names = tokenizer.model_input_names
-    transformed_ds = {}
-
-    if len(features_name) == 1:
-        for k in files.keys():
-            transformed_ds[k] = ds[k].map(
-                lambda example: tokenizer.batch_encode_plus(
-                    example[features_name[0]], truncation=True, max_length=max_seq_length, padding="max_length"
-                ),
-                batched=True,
-            )
-    elif len(features_name) == 2:
-        for k in files.keys():
-            transformed_ds[k] = ds[k].map(
-                lambda example: tokenizer.batch_encode_plus(
-                    (example[features_name[0]], example[features_name[1]]),
-                    truncation=True,
-                    max_length=max_seq_length,
-                    padding="max_length",
-                ),
-                batched=True,
-            )
-
-    def gen_train():
-        for ex in transformed_ds[datasets.Split.TRAIN]:
-            d = {k: v for k, v in ex.items() if k in input_names}
-            label = label2id[ex[label_name]]
-            yield (d, label)
-
-    def gen_val():
-        for ex in transformed_ds[datasets.Split.VALIDATION]:
-            d = {k: v for k, v in ex.items() if k in input_names}
-            label = label2id[ex[label_name]]
-            yield (d, label)
-
-    def gen_test():
-        for ex in transformed_ds[datasets.Split.TEST]:
-            d = {k: v for k, v in ex.items() if k in input_names}
-            label = label2id[ex[label_name]]
-            yield (d, label)
-
-    train_ds = (
-        tf.data.Dataset.from_generator(
-            gen_train,
-            ({k: tf.int32 for k in input_names}, tf.int64),
-            ({k: tf.TensorShape([None]) for k in input_names}, tf.TensorShape([])),
-        )
-        if datasets.Split.TRAIN in transformed_ds
-        else None
-    )
-
-    if train_ds is not None:
-        train_ds = train_ds.apply(tf.data.experimental.assert_cardinality(len(ds[datasets.Split.TRAIN])))
-
-    val_ds = (
-        tf.data.Dataset.from_generator(
-            gen_val,
-            ({k: tf.int32 for k in input_names}, tf.int64),
-            ({k: tf.TensorShape([None]) for k in input_names}, tf.TensorShape([])),
-        )
-        if datasets.Split.VALIDATION in transformed_ds
-        else None
-    )
-
-    if val_ds is not None:
-        val_ds = val_ds.apply(tf.data.experimental.assert_cardinality(len(ds[datasets.Split.VALIDATION])))
-
-    test_ds = (
-        tf.data.Dataset.from_generator(
-            gen_test,
-            ({k: tf.int32 for k in input_names}, tf.int64),
-            ({k: tf.TensorShape([None]) for k in input_names}, tf.TensorShape([])),
-        )
-        if datasets.Split.TEST in transformed_ds
-        else None
-    )
-
-    if test_ds is not None:
-        test_ds = test_ds.apply(tf.data.experimental.assert_cardinality(len(ds[datasets.Split.TEST])))
-
-    return train_ds, val_ds, test_ds, label2id
-
-
-logger = logging.getLogger(__name__)
-
-
-@dataclass
-class DataTrainingArguments:
-    """
-    Arguments pertaining to what data we are going to input our model for training and eval.
-
-    Using `HfArgumentParser` we can turn this class
-    into argparse arguments to be able to specify them on
-    the command line.
-    """
-
-    label_column_id: int = field(metadata={"help": "Which column contains the label"})
-    train_file: str = field(default=None, metadata={"help": "The path of the training file"})
-    dev_file: Optional[str] = field(default=None, metadata={"help": "The path of the development file"})
-    test_file: Optional[str] = field(default=None, metadata={"help": "The path of the test file"})
-    max_seq_length: int = field(
-        default=128,
-        metadata={
-            "help": (
-                "The maximum total input sequence length after tokenization. Sequences longer "
-                "than this will be truncated, sequences shorter will be padded."
-            )
-        },
-    )
-    overwrite_cache: bool = field(
-        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
-    )
-
-
-@dataclass
-class ModelArguments:
-    """
-    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
-    """
-
-    model_name_or_path: str = field(
-        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
-    )
-    config_name: Optional[str] = field(
-        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
-    )
-    tokenizer_name: Optional[str] = field(
-        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
-    )
-    use_fast: bool = field(default=False, metadata={"help": "Set this flag to use fast tokenization."})
-    # If you want to tweak more attributes on your tokenizer, you should do it in a distinct script,
-    # or just modify its tokenizer_config.json.
-    cache_dir: Optional[str] = field(
-        default=None,
-        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
-    )
-
-
-def main():
-    # See all possible arguments in src/transformers/training_args.py
-    # or by passing the --help flag to this script.
-    # We now keep distinct sets of args, for a cleaner separation of concerns.
-    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
-    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
-
-    if (
-        os.path.exists(training_args.output_dir)
-        and os.listdir(training_args.output_dir)
-        and training_args.do_train
-        and not training_args.overwrite_output_dir
-    ):
-        raise ValueError(
-            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use"
-            " --overwrite_output_dir to overcome."
-        )
-
-    # Setup logging
-    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
-        datefmt="%m/%d/%Y %H:%M:%S",
-        level=logging.INFO,
-    )
-    logger.info(
-        f"n_replicas: {training_args.n_replicas}, distributed training: {bool(training_args.n_replicas > 1)}, "
-        f"16-bits training: {training_args.fp16}"
-    )
-    logger.info(f"Training/evaluation parameters {training_args}")
-
-    # Load pretrained model and tokenizer
-    #
-    # Distributed training:
-    # The .from_pretrained methods guarantee that only one local process can concurrently
-    # download model & vocab.
-
-    tokenizer = AutoTokenizer.from_pretrained(
-        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
-        cache_dir=model_args.cache_dir,
-    )
-
-    train_dataset, eval_dataset, test_ds, label2id = get_tfds(
-        train_file=data_args.train_file,
-        eval_file=data_args.dev_file,
-        test_file=data_args.test_file,
-        tokenizer=tokenizer,
-        label_column_id=data_args.label_column_id,
-        max_seq_length=data_args.max_seq_length,
-    )
-
-    config = AutoConfig.from_pretrained(
-        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
-        num_labels=len(label2id),
-        label2id=label2id,
-        id2label={id: label for label, id in label2id.items()},
-        finetuning_task="text-classification",
-        cache_dir=model_args.cache_dir,
-    )
-
-    with training_args.strategy.scope():
-        model = TFAutoModelForSequenceClassification.from_pretrained(
-            model_args.model_name_or_path,
-            from_pt=bool(".bin" in model_args.model_name_or_path),
-            config=config,
-            cache_dir=model_args.cache_dir,
-        )
-
-    def compute_metrics(p: EvalPrediction) -> Dict:
-        preds = np.argmax(p.predictions, axis=1)
-
-        return {"acc": (preds == p.label_ids).mean()}
-
-    # Initialize our Trainer
-    trainer = TFTrainer(
-        model=model,
-        args=training_args,
-        train_dataset=train_dataset,
-        eval_dataset=eval_dataset,
-        compute_metrics=compute_metrics,
-    )
-
-    # Training
-    if training_args.do_train:
-        trainer.train()
-        trainer.save_model()
-        tokenizer.save_pretrained(training_args.output_dir)
-
-    # Evaluation
-    results = {}
-    if training_args.do_eval:
-        logger.info("*** Evaluate ***")
-        result = trainer.evaluate()
-        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
-
-        with open(output_eval_file, "w") as writer:
-            logger.info("***** Eval results *****")
-
-            for key, value in result.items():
-                logger.info(f"  {key} = {value}")
-                writer.write(f"{key} = {value}\n")
-
-            results.update(result)
-
-    return results
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/legacy/token-classification/run_tf_ner.py
+++ b/examples/legacy/token-classification/run_tf_ner.py
@@ -1,310 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-# Copyright 2018 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Fine-tuning the library models for named entity recognition."""
-
-
-import logging
-import os
-from dataclasses import dataclass, field
-from importlib import import_module
-from typing import Dict, List, Optional, Tuple
-
-import numpy as np
-from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
-from utils_ner import Split, TFTokenClassificationDataset, TokenClassificationTask
-
-from transformers import (
-    AutoConfig,
-    AutoTokenizer,
-    EvalPrediction,
-    HfArgumentParser,
-    TFAutoModelForTokenClassification,
-    TFTrainer,
-    TFTrainingArguments,
-)
-from transformers.utils import logging as hf_logging
-
-
-hf_logging.set_verbosity_info()
-hf_logging.enable_default_handler()
-hf_logging.enable_explicit_format()
-
-
-logger = logging.getLogger(__name__)
-
-
-@dataclass
-class ModelArguments:
-    """
-    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
-    """
-
-    model_name_or_path: str = field(
-        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
-    )
-    config_name: Optional[str] = field(
-        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
-    )
-    task_type: Optional[str] = field(
-        default="NER", metadata={"help": "Task type to fine tune in training (e.g. NER, POS, etc)"}
-    )
-    tokenizer_name: Optional[str] = field(
-        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
-    )
-    use_fast: bool = field(default=False, metadata={"help": "Set this flag to use fast tokenization."})
-    # If you want to tweak more attributes on your tokenizer, you should do it in a distinct script,
-    # or just modify its tokenizer_config.json.
-    cache_dir: Optional[str] = field(
-        default=None,
-        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
-    )
-
-
-@dataclass
-class DataTrainingArguments:
-    """
-    Arguments pertaining to what data we are going to input our model for training and eval.
-    """
-
-    data_dir: str = field(
-        metadata={"help": "The input data dir. Should contain the .txt files for a CoNLL-2003-formatted task."}
-    )
-    labels: Optional[str] = field(
-        metadata={"help": "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used."}
-    )
-    max_seq_length: int = field(
-        default=128,
-        metadata={
-            "help": (
-                "The maximum total input sequence length after tokenization. Sequences longer "
-                "than this will be truncated, sequences shorter will be padded."
-            )
-        },
-    )
-    overwrite_cache: bool = field(
-        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
-    )
-
-
-def main():
-    # See all possible arguments in src/transformers/training_args.py
-    # or by passing the --help flag to this script.
-    # We now keep distinct sets of args, for a cleaner separation of concerns.
-    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
-    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
-
-    if (
-        os.path.exists(training_args.output_dir)
-        and os.listdir(training_args.output_dir)
-        and training_args.do_train
-        and not training_args.overwrite_output_dir
-    ):
-        raise ValueError(
-            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use"
-            " --overwrite_output_dir to overcome."
-        )
-
-    module = import_module("tasks")
-
-    try:
-        token_classification_task_clazz = getattr(module, model_args.task_type)
-        token_classification_task: TokenClassificationTask = token_classification_task_clazz()
-    except AttributeError:
-        raise ValueError(
-            f"Task {model_args.task_type} needs to be defined as a TokenClassificationTask subclass in {module}. "
-            f"Available tasks classes are: {TokenClassificationTask.__subclasses__()}"
-        )
-
-    # Setup logging
-    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
-        datefmt="%m/%d/%Y %H:%M:%S",
-        level=logging.INFO,
-    )
-    logger.info(
-        "n_replicas: %s, distributed training: %s, 16-bits training: %s",
-        training_args.n_replicas,
-        bool(training_args.n_replicas > 1),
-        training_args.fp16,
-    )
-    logger.info("Training/evaluation parameters %s", training_args)
-
-    # Prepare Token Classification task
-    labels = token_classification_task.get_labels(data_args.labels)
-    label_map: Dict[int, str] = dict(enumerate(labels))
-    num_labels = len(labels)
-
-    # Load pretrained model and tokenizer
-    #
-    # Distributed training:
-    # The .from_pretrained methods guarantee that only one local process can concurrently
-    # download model & vocab.
-
-    config = AutoConfig.from_pretrained(
-        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
-        num_labels=num_labels,
-        id2label=label_map,
-        label2id={label: i for i, label in enumerate(labels)},
-        cache_dir=model_args.cache_dir,
-    )
-    tokenizer = AutoTokenizer.from_pretrained(
-        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
-        cache_dir=model_args.cache_dir,
-        use_fast=model_args.use_fast,
-    )
-
-    with training_args.strategy.scope():
-        model = TFAutoModelForTokenClassification.from_pretrained(
-            model_args.model_name_or_path,
-            from_pt=bool(".bin" in model_args.model_name_or_path),
-            config=config,
-            cache_dir=model_args.cache_dir,
-        )
-
-    # Get datasets
-    train_dataset = (
-        TFTokenClassificationDataset(
-            token_classification_task=token_classification_task,
-            data_dir=data_args.data_dir,
-            tokenizer=tokenizer,
-            labels=labels,
-            model_type=config.model_type,
-            max_seq_length=data_args.max_seq_length,
-            overwrite_cache=data_args.overwrite_cache,
-            mode=Split.train,
-        )
-        if training_args.do_train
-        else None
-    )
-    eval_dataset = (
-        TFTokenClassificationDataset(
-            token_classification_task=token_classification_task,
-            data_dir=data_args.data_dir,
-            tokenizer=tokenizer,
-            labels=labels,
-            model_type=config.model_type,
-            max_seq_length=data_args.max_seq_length,
-            overwrite_cache=data_args.overwrite_cache,
-            mode=Split.dev,
-        )
-        if training_args.do_eval
-        else None
-    )
-
-    def align_predictions(predictions: np.ndarray, label_ids: np.ndarray) -> Tuple[List[int], List[int]]:
-        preds = np.argmax(predictions, axis=2)
-        batch_size, seq_len = preds.shape
-        out_label_list = [[] for _ in range(batch_size)]
-        preds_list = [[] for _ in range(batch_size)]
-
-        for i in range(batch_size):
-            for j in range(seq_len):
-                if label_ids[i, j] != -100:
-                    out_label_list[i].append(label_map[label_ids[i][j]])
-                    preds_list[i].append(label_map[preds[i][j]])
-
-        return preds_list, out_label_list
-
-    def compute_metrics(p: EvalPrediction) -> Dict:
-        preds_list, out_label_list = align_predictions(p.predictions, p.label_ids)
-
-        return {
-            "precision": precision_score(out_label_list, preds_list),
-            "recall": recall_score(out_label_list, preds_list),
-            "f1": f1_score(out_label_list, preds_list),
-        }
-
-    # Initialize our Trainer
-    trainer = TFTrainer(
-        model=model,
-        args=training_args,
-        train_dataset=train_dataset.get_dataset() if train_dataset else None,
-        eval_dataset=eval_dataset.get_dataset() if eval_dataset else None,
-        compute_metrics=compute_metrics,
-    )
-
-    # Training
-    if training_args.do_train:
-        trainer.train()
-        trainer.save_model()
-        tokenizer.save_pretrained(training_args.output_dir)
-
-    # Evaluation
-    results = {}
-    if training_args.do_eval:
-        logger.info("*** Evaluate ***")
-
-        result = trainer.evaluate()
-        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
-
-        with open(output_eval_file, "w") as writer:
-            logger.info("***** Eval results *****")
-
-            for key, value in result.items():
-                logger.info("  %s = %s", key, value)
-                writer.write("%s = %s\n" % (key, value))
-
-            results.update(result)
-
-    # Predict
-    if training_args.do_predict:
-        test_dataset = TFTokenClassificationDataset(
-            token_classification_task=token_classification_task,
-            data_dir=data_args.data_dir,
-            tokenizer=tokenizer,
-            labels=labels,
-            model_type=config.model_type,
-            max_seq_length=data_args.max_seq_length,
-            overwrite_cache=data_args.overwrite_cache,
-            mode=Split.test,
-        )
-
-        predictions, label_ids, metrics = trainer.predict(test_dataset.get_dataset())
-        preds_list, labels_list = align_predictions(predictions, label_ids)
-        report = classification_report(labels_list, preds_list)
-
-        logger.info("\n%s", report)
-
-        output_test_results_file = os.path.join(training_args.output_dir, "test_results.txt")
-
-        with open(output_test_results_file, "w") as writer:
-            writer.write("%s\n" % report)
-
-        # Save predictions
-        output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.txt")
-
-        with open(output_test_predictions_file, "w") as writer:
-            with open(os.path.join(data_args.data_dir, "test.txt"), "r") as f:
-                example_id = 0
-
-                for line in f:
-                    if line.startswith("-DOCSTART-") or line == "" or line == "\n":
-                        writer.write(line)
-
-                        if not preds_list[example_id]:
-                            example_id += 1
-                    elif preds_list[example_id]:
-                        output_line = line.split()[0] + " " + preds_list[example_id].pop(0) + "\n"
-
-                        writer.write(output_line)
-                    else:
-                        logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
-
-    return results
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/pytorch/README.md
+++ b/examples/pytorch/README.md
@@ -226,7 +226,7 @@ wandb.login()

 To enable logging to W&B, include `"wandb"` in the `report_to` of your `TrainingArguments` or script. Or just pass along `--report_to_all` if you have `wandb` installed.

-Whenever you use `Trainer` or `TFTrainer` classes, your losses, evaluation metrics, model topology and gradients (for `Trainer` only) will automatically be logged.
+Whenever you use the `Trainer` class, your losses, evaluation metrics, model topology and gradients will automatically be logged.

 Advanced configuration is possible by setting environment variables:

--- a/examples/tensorflow/README.md
+++ b/examples/tensorflow/README.md
@@ -15,7 +15,7 @@ limitations under the License.

 # Examples

-This folder contains actively maintained examples of the use of 🤗 Transformers organized into different ML tasks. All examples in this folder are **TensorFlow** examples and are written using native Keras rather than classes like `TFTrainer`, which we now consider deprecated. If you've previously only used 🤗 Transformers via `TFTrainer`, we highly recommend taking a look at the new style - we think it's a big improvement!
+This folder contains actively maintained examples of the use of 🤗 Transformers organized into different ML tasks. All examples in this folder are **TensorFlow** examples and are written using native Keras. If you've previously only used 🤗 Transformers via `TFTrainer`, we highly recommend taking a look at the new style - we think it's a big improvement!

 In addition, all scripts here now support the [🤗 Datasets](https://github.com/huggingface/datasets) library - you can grab entire datasets just by changing one command-line argument!

--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -4401,7 +4401,6 @@ else:
        "create_optimizer",
    ]
    _import_structure["tf_utils"] = []
-    _import_structure["trainer_tf"] = ["TFTrainer"]


 try:
@@ -8560,9 +8559,6 @@ if TYPE_CHECKING:
            create_optimizer,
        )

-        # Trainer
-        from .trainer_tf import TFTrainer
-
    try:
        if not (
            is_librosa_available()
--- a/src/transformers/trainer_tf.py
+++ b/src/transformers/trainer_tf.py
@ -1,801 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tensorflow trainer class."""
-
-import datetime
-import math
-import os
-import warnings
-from typing import Callable, Dict, Optional, Tuple
-
-from .utils import ENV_VARS_TRUE_VALUES
-
-
-# Integrations must be imported before ML frameworks:
-# isort: off
-from .integrations import (
-    is_comet_available,
-    is_wandb_available,
-)
-
-# isort: on
-
-import numpy as np
-import tensorflow as tf
-from tensorflow.python.distribute.values import PerReplica
-
-from .modeling_tf_utils import TFPreTrainedModel
-from .optimization_tf import GradientAccumulator, create_optimizer
-from .trainer_utils import (
-    PREFIX_CHECKPOINT_DIR,
-    EvalPrediction,
-    IntervalStrategy,
-    PredictionOutput,
-    enable_full_determinism,
-    set_seed,
-)
-from .training_args_tf import TFTrainingArguments
-from .utils import logging
-
-
-if is_wandb_available():
-    import wandb
-
-if is_comet_available():
-    import comet_ml
-
-logger = logging.get_logger(__name__)
-
-
-class TFTrainer:
-    """
-    TFTrainer is a simple but feature-complete training and eval loop for TensorFlow, optimized for 🤗 Transformers.
-
-    Args:
-        model ([`TFPreTrainedModel`]):
-            The model to train, evaluate or use for predictions.
-        args ([`TFTrainingArguments`]):
-            The arguments to tweak training.
-        train_dataset ([`~tf.data.Dataset`], *optional*):
-            The dataset to use for training. The dataset should yield tuples of `(features, labels)` where `features`
-            is a dict of input features and `labels` is the labels. If `labels` is a tensor, the loss is calculated by
-            the model by calling `model(features, labels=labels)`. If `labels` is a dict, such as when using a
-            QuestionAnswering head model with multiple targets, the loss is instead calculated by calling
-            `model(features, **labels)`.
-        eval_dataset ([`~tf.data.Dataset`], *optional*):
-            The dataset to use for evaluation. The dataset should yield tuples of `(features, labels)` where `features`
-            is a dict of input features and `labels` is the labels. If `labels` is a tensor, the loss is calculated by
-            the model by calling `model(features, labels=labels)`. If `labels` is a dict, such as when using a
-            QuestionAnswering head model with multiple targets, the loss is instead calculated by calling
-            `model(features, **labels)`.
-        compute_metrics (`Callable[[EvalPrediction], Dict]`, *optional*):
-            The function that will be used to compute metrics at evaluation. Must take a [`EvalPrediction`] and return
-            a dictionary string to metric values.
-        tb_writer (`tf.summary.SummaryWriter`, *optional*):
-            Object to write to TensorBoard.
-        optimizers (`Tuple[tf.keras.optimizers.Optimizer, tf.keras.optimizers.schedules.LearningRateSchedule]`, *optional*):
-            A tuple containing the optimizer and the scheduler to use. The optimizer default to an instance of
-            [`tf.keras.optimizers.Adam`] if `args.weight_decay_rate` is 0 else an instance of [`AdamWeightDecay`]. The
-            scheduler will default to an instance of [`tf.keras.optimizers.schedules.PolynomialDecay`] if
-            `args.num_warmup_steps` is 0 else an instance of [`WarmUp`].
-    """
-
-    def __init__(
-        self,
-        model: TFPreTrainedModel,
-        args: TFTrainingArguments,
-        train_dataset: Optional[tf.data.Dataset] = None,
-        eval_dataset: Optional[tf.data.Dataset] = None,
-        compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
-        tb_writer: Optional[tf.summary.SummaryWriter] = None,
-        optimizers: Tuple[tf.keras.optimizers.Optimizer, tf.keras.optimizers.schedules.LearningRateSchedule] = (
-            None,
-            None,
-        ),
-    ):
-        self.model = model
-        self.args = args
-        self.train_dataset = train_dataset
-        self.eval_dataset = eval_dataset
-        self.compute_metrics = compute_metrics
-        self.optimizer, self.lr_scheduler = optimizers
-        self.gradient_accumulator = GradientAccumulator()
-        self.global_step = 0
-        self.epoch_logging = 0
-        self.eval_loss = tf.keras.metrics.Sum()
-
-        warnings.warn(
-            "The class `TFTrainer` is deprecated and will be removed in version 5 of Transformers. "
-            "We recommend using native Keras instead, by calling methods like `fit()` and `predict()` "
-            "directly on the model object. Detailed examples of the Keras style can be found in our "
-            "examples at https://github.com/huggingface/transformers/tree/main/examples/tensorflow",
-            FutureWarning,
-        )
-
-        if tb_writer is not None:
-            self.tb_writer = tb_writer
-        else:
-            self.tb_writer = tf.summary.create_file_writer(self.args.logging_dir)
-
-        if is_wandb_available():
-            self.setup_wandb()
-        elif os.getenv("WANDB_DISABLED", "").upper() not in ENV_VARS_TRUE_VALUES:
-            logger.info(
-                "You are instantiating a Trainer but W&B is not installed. To use wandb logging, "
-                "run `pip install wandb && wandb login` see https://docs.wandb.com/huggingface."
-            )
-
-        if is_comet_available():
-            self.setup_comet()
-        elif os.environ.get("COMET_MODE") != "DISABLED":
-            logger.info(
-                "To use comet_ml logging, run `pip/conda install comet_ml` "
-                "see https://www.comet.ml/docs/python-sdk/huggingface/"
-            )
-
-        enable_full_determinism(self.args.seed) if self.args.full_determinism else set_seed(self.args.seed)
-
-    def get_train_tfdataset(self) -> tf.data.Dataset:
-        """
-        Returns the training [`~tf.data.Dataset`].
-
-        Subclass and override this method if you want to inject some custom behavior.
-        """
-        if self.train_dataset is None:
-            raise ValueError("Trainer: training requires a train_dataset.")
-
-        self.total_train_batch_size = self.args.train_batch_size * self.args.gradient_accumulation_steps
-        self.num_train_examples = self.train_dataset.cardinality().numpy()
-
-        if self.num_train_examples < 0:
-            raise ValueError("The training dataset must have an asserted cardinality")
-
-        ds = (
-            self.train_dataset.repeat()
-            .shuffle(self.num_train_examples, seed=self.args.seed)
-            .batch(self.total_train_batch_size, drop_remainder=self.args.dataloader_drop_last)
-            .prefetch(tf.data.experimental.AUTOTUNE)
-        )
-
-        return self.args.strategy.experimental_distribute_dataset(ds)
-
-    def get_eval_tfdataset(self, eval_dataset: Optional[tf.data.Dataset] = None) -> tf.data.Dataset:
-        """
-        Returns the evaluation [`~tf.data.Dataset`].
-
-        Args:
-            eval_dataset ([`~tf.data.Dataset`], *optional*):
-                If provided, will override *self.eval_dataset*. The dataset should yield tuples of `(features, labels)`
-                where `features` is a dict of input features and `labels` is the labels. If `labels` is a tensor, the
-                loss is calculated by the model by calling `model(features, labels=labels)`. If `labels` is a dict,
-                such as when using a QuestionAnswering head model with multiple targets, the loss is instead calculated
-                by calling `model(features, **labels)`.
-
-        Subclass and override this method if you want to inject some custom behavior.
-        """
-        if eval_dataset is None and self.eval_dataset is None:
-            raise ValueError("Trainer: evaluation requires an eval_dataset.")
-
-        eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
-        num_examples = eval_dataset.cardinality().numpy()
-
-        if num_examples < 0:
-            raise ValueError("The training dataset must have an asserted cardinality")
-
-        approx = math.floor if self.args.dataloader_drop_last else math.ceil
-        steps = approx(num_examples / self.args.eval_batch_size)
-        ds = (
-            eval_dataset.repeat()
-            .batch(self.args.eval_batch_size, drop_remainder=self.args.dataloader_drop_last)
-            .prefetch(tf.data.experimental.AUTOTUNE)
-        )
-
-        return self.args.strategy.experimental_distribute_dataset(ds), steps, num_examples
-
-    def get_test_tfdataset(self, test_dataset: tf.data.Dataset) -> tf.data.Dataset:
-        """
-        Returns a test [`~tf.data.Dataset`].
-
-        Args:
-            test_dataset ([`~tf.data.Dataset`]):
-                The dataset to use. The dataset should yield tuples of `(features, labels)` where `features` is a dict
-                of input features and `labels` is the labels. If `labels` is a tensor, the loss is calculated by the
-                model by calling `model(features, labels=labels)`. If `labels` is a dict, such as when using a
-                QuestionAnswering head model with multiple targets, the loss is instead calculated by calling
-                `model(features, **labels)`.
-
-        Subclass and override this method if you want to inject some custom behavior.
-        """
-
-        num_examples = test_dataset.cardinality().numpy()
-
-        if num_examples < 0:
-            raise ValueError("The training dataset must have an asserted cardinality")
-
-        steps = math.ceil(num_examples / self.args.eval_batch_size)
-        ds = test_dataset.batch(self.args.eval_batch_size).prefetch(tf.data.experimental.AUTOTUNE)
-
-        return self.args.strategy.experimental_distribute_dataset(ds), steps, num_examples
-
-    def create_optimizer_and_scheduler(self, num_training_steps: int):
-        """
-        Setup the optimizer and the learning rate scheduler.
-
-        We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
-        TFTrainer's init through `optimizers`, or subclass and override this method.
-        """
-        if not self.optimizer and not self.lr_scheduler:
-            warmup_steps = (
-                self.args.warmup_steps
-                if self.args.warmup_steps > 0
-                else math.ceil(num_training_steps * self.args.warmup_ratio)
-            )
-
-            self.optimizer, self.lr_scheduler = create_optimizer(
-                self.args.learning_rate,
-                num_training_steps,
-                warmup_steps,
-                adam_beta1=self.args.adam_beta1,
-                adam_beta2=self.args.adam_beta2,
-                adam_epsilon=self.args.adam_epsilon,
-                weight_decay_rate=self.args.weight_decay,
-                power=self.args.poly_power,
-            )
-
-    def setup_wandb(self):
-        """
-        Setup the optional Weights & Biases (`wandb`) integration.
-
-        One can subclass and override this method to customize the setup if needed. Find more information `here
-        <https://docs.wandb.com/huggingface>`__. You can also override the following environment variables:
-
-        Environment:
-            WANDB_PROJECT:
-                (Optional): str - "huggingface" by default, set this to a custom string to store results in a different
-                project.
-            WANDB_DISABLED:
-                (Optional): boolean - defaults to false, set to "true" to disable wandb entirely.
-        """
-
-        logger.info('Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"')
-        combined_dict = {**self.model.config.to_dict(), **self.args.to_sanitized_dict()}
-        wandb.init(project=os.getenv("WANDB_PROJECT", "huggingface"), config=combined_dict, name=self.args.run_name)
-
-    def setup_comet(self):
-        """
-        Setup the optional Comet.ml integration.
-
-        Environment:
-            COMET_MODE:
-                (Optional): str - "OFFLINE", "ONLINE", or "DISABLED"
-            COMET_PROJECT_NAME:
-                (Optional): str - Comet.ml project name for experiments
-            COMET_OFFLINE_DIRECTORY:
-                (Optional): str - folder to use for saving offline experiments when `COMET_MODE` is "OFFLINE"
-
-        For a number of configurable items in the environment, see `here
-        <https://www.comet.ml/docs/python-sdk/advanced/#comet-configuration-variables>`__
-        """
-        comet_mode = os.getenv("COMET_MODE", "ONLINE").upper()
-        args = {"project_name": os.getenv("COMET_PROJECT_NAME", "huggingface")}
-        experiment = None
-        if comet_mode == "ONLINE":
-            experiment = comet_ml.Experiment(**args)
-            logger.info("Automatic Comet.ml online logging enabled")
-        elif comet_mode == "OFFLINE":
-            args["offline_directory"] = os.getenv("COMET_OFFLINE_DIRECTORY", "./")
-            experiment = comet_ml.OfflineExperiment(**args)
-            logger.info("Automatic Comet.ml offline logging enabled; use `comet upload` when finished")
-        if experiment is not None:
-            experiment._set_model_graph(self.model, framework="transformers")
-            experiment._log_parameters(self.args, prefix="args/", framework="transformers")
-            experiment._log_parameters(self.model.config, prefix="config/", framework="transformers")
-
-    def prediction_loop(
-        self,
-        dataset: tf.data.Dataset,
-        steps: int,
-        num_examples: int,
-        description: str,
-        prediction_loss_only: Optional[bool] = None,
-    ) -> PredictionOutput:
-        """
-        Prediction/evaluation loop, shared by [`~TFTrainer.evaluate`] and [`~TFTrainer.predict`].
-
-        Works both with or without labels.
-        """
-
-        prediction_loss_only = (
-            prediction_loss_only if prediction_loss_only is not None else self.args.prediction_loss_only
-        )
-
-        logger.info(f"***** Running {description} *****")
-        logger.info(f"  Num examples in dataset = {num_examples}")
-        if description == "Evaluation":
-            logger.info(f"  Num examples in used in evaluation = {self.args.eval_batch_size * steps}")
-        logger.info(f"  Batch size = {self.args.eval_batch_size}")
-
-        label_ids: np.ndarray = None
-        preds: np.ndarray = None
-        self.eval_loss.reset_states()
-
-        # Reset the past mems state at the beginning of the evaluation if necessary.
-        if self.args.past_index >= 0:
-            self._past = None
-
-        for step, batch in enumerate(dataset):
-            logits = self.distributed_prediction_steps(batch)
-            _, labels = batch
-
-            if not prediction_loss_only:
-                if isinstance(logits, tuple):
-                    logits = logits[0]
-
-                if isinstance(labels, tuple):
-                    labels = labels[0]
-
-                if self.args.n_replicas > 1:
-                    for val in logits.values:
-                        if preds is None:
-                            preds = val.numpy()
-                        else:
-                            preds = np.append(preds, val.numpy(), axis=0)
-
-                    for val in labels.values:
-                        if label_ids is None:
-                            label_ids = val.numpy()
-                        else:
-                            label_ids = np.append(label_ids, val.numpy(), axis=0)
-                else:
-                    if preds is None:
-                        preds = logits.numpy()
-                    else:
-                        preds = np.append(preds, logits.numpy(), axis=0)
-
-                    if label_ids is None:
-                        label_ids = labels.numpy()
-                    else:
-                        label_ids = np.append(label_ids, labels.numpy(), axis=0)
-
-                if step == steps - 1:
-                    break
-
-        if self.compute_metrics is not None and preds is not None and label_ids is not None:
-            metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids))
-        else:
-            metrics = {}
-
-        metrics["eval_loss"] = self.eval_loss.result().numpy() / steps
-
-        for key in list(metrics.keys()):
-            if not key.startswith("eval_"):
-                metrics[f"eval_{key}"] = metrics.pop(key)
-
-        if self.args.past_index and hasattr(self, "_past"):
-            # Clean the state at the end of training
-            delattr(self, "_past")
-
-        return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics)
-
-    def log(self, logs: Dict[str, float]) -> None:
-        """
-        Log `logs` on the various objects watching training.
-
-        Subclass and override this method to inject custom behavior.
-
-        Args:
-            logs (`Dict[str, float]`):
-                The values to log.
-        """
-        logs["epoch"] = self.epoch_logging
-
-        if self.tb_writer:
-            with self.tb_writer.as_default():
-                for k, v in logs.items():
-                    tf.summary.scalar(k, v, step=self.global_step)
-            self.tb_writer.flush()
-
-        if is_wandb_available():
-            wandb.log(logs, step=self.global_step)
-
-        if is_comet_available():
-            experiment = comet_ml.config.get_global_experiment()
-            if experiment is not None:
-                experiment._log_metrics(
-                    logs, step=self.global_step, epoch=self.epoch_logging, framework="transformers"
-                )
-
-        output = {**logs, **{"step": self.global_step}}
-
-        logger.info(output)
-
-    def evaluate(self, eval_dataset: Optional[tf.data.Dataset] = None) -> Dict[str, float]:
-        """
-        Run evaluation and returns metrics.
-
-        The calling script will be responsible for providing a method to compute metrics, as they are task-dependent
-        (pass it to the init `compute_metrics` argument).
-
-        Args:
-            eval_dataset ([`~tf.data.Dataset`], *optional*):
-                Pass a dataset if you wish to override `self.eval_dataset`. The dataset should yield tuples of
-                `(features, labels)` where `features` is a dict of input features and `labels` is the labels. If
-                `labels` is a tensor, the loss is calculated by the model by calling `model(features, labels=labels)`.
-                If `labels` is a dict, such as when using a QuestionAnswering head model with multiple targets, the
-                loss is instead calculated by calling `model(features, **labels)`.
-
-        Returns:
-            A dictionary containing the evaluation loss and the potential metrics computed from the predictions.
-        """
-        eval_ds, steps, num_examples = self.get_eval_tfdataset(eval_dataset)
-
-        output = self.prediction_loop(eval_ds, steps, num_examples, description="Evaluation")
-        logs = {**output.metrics}
-        logs["epoch"] = self.epoch_logging
-
-        self.log(logs)
-
-        return output.metrics
-
-    def prediction_step(
-        self, features: tf.Tensor, labels: tf.Tensor, nb_instances_in_global_batch: tf.Tensor
-    ) -> tf.Tensor:
-        """
-        Compute the prediction on features and update the loss with labels.
-
-        Subclass and override to inject some custom behavior.
-        """
-        per_example_loss, logits = self.run_model(features, labels, False)
-        scaled_loss = per_example_loss / tf.cast(nb_instances_in_global_batch, dtype=per_example_loss.dtype)
-
-        self.eval_loss.update_state(scaled_loss)
-
-        return logits
-
-    @tf.function
-    def distributed_prediction_steps(self, batch):
-        nb_instances_in_batch = self._compute_nb_instances(batch)
-        inputs = self._get_step_inputs(batch, nb_instances_in_batch)
-
-        logits = self.args.strategy.run(self.prediction_step, inputs)
-
-        return logits
-
-    def train(self) -> None:
-        """
-        Train method to train the model.
-        """
-        train_ds = self.get_train_tfdataset()
-
-        if self.args.debug:
-            tf.summary.trace_on(graph=True, profiler=True)
-
-        self.gradient_accumulator.reset()
-
-        num_update_steps_per_epoch = self.num_train_examples / self.total_train_batch_size
-
-        # In fact, ``self.args.dataloader_drop_last`` has no effect in `trainer_tf.py`, because
-        # the dataset is repeated before being batched.
-        # It has the effect only when TPU is used which requires explicit tensor shape in order to make
-        # the gradient accumulation implementation work.
-        approx = math.floor if self.args.dataloader_drop_last else math.ceil
-        num_update_steps_per_epoch = approx(num_update_steps_per_epoch)
-
-        # At least one update for each epoch.
-        num_update_steps_per_epoch = max(num_update_steps_per_epoch, 1)
-        self.steps_per_epoch = num_update_steps_per_epoch
-
-        if self.args.max_steps > 0:
-            t_total = self.args.max_steps
-            epochs = (self.args.max_steps // self.steps_per_epoch) + int(
-                self.args.max_steps % self.steps_per_epoch > 0
-            )
-        else:
-            t_total = self.steps_per_epoch * self.args.num_train_epochs
-            epochs = self.args.num_train_epochs
-
-        # Since ``self.args.num_train_epochs`` can be `float`, we make ``epochs`` be a `float` always.
-        epochs = float(epochs)
-
-        with self.args.strategy.scope():
-            self.create_optimizer_and_scheduler(num_training_steps=t_total)
-            folder = os.path.join(self.args.output_dir, PREFIX_CHECKPOINT_DIR)
-            ckpt = tf.train.Checkpoint(optimizer=self.optimizer, model=self.model)
-            self.model.ckpt_manager = tf.train.CheckpointManager(ckpt, folder, max_to_keep=self.args.save_total_limit)
-
-            iterations = self.optimizer.iterations
-            epochs_trained = 0
-            steps_trained_in_current_epoch = 0
-            if self.model.ckpt_manager.latest_checkpoint:
-                logger.info(
-                    f"Checkpoint file {self.model.ckpt_manager.latest_checkpoint} found and restoring from checkpoint"
-                )
-                ckpt.restore(self.model.ckpt_manager.latest_checkpoint).expect_partial()
-
-                self.global_step = iterations.numpy()
-
-                epochs_trained = self.global_step // self.steps_per_epoch
-                steps_trained_in_current_epoch = self.global_step % self.steps_per_epoch
-
-                logger.info("  Continuing training from checkpoint, will skip to saved global_step")
-                logger.info(f"  Continuing training from epoch {epochs_trained}")
-                logger.info(f"  Continuing training from global step {self.global_step}")
-                logger.info(f"  Will skip the first {steps_trained_in_current_epoch} steps in the first epoch")
-
-            tf.summary.experimental.set_step(self.global_step)
-
-            with self.tb_writer.as_default():
-                tf.summary.text("args", self.args.to_json_string())
-
-            self.tb_writer.flush()
-
-            logger.info("***** Running training *****")
-            logger.info(f"  Num examples = {self.num_train_examples}")
-            # TODO: We might want to print a more precise ``epochs`` if self.args.max_steps > 0 ?
-            logger.info(f"  Num Epochs = {epochs}")
-            logger.info(f"  Instantaneous batch size per device = {self.args.per_device_train_batch_size}")
-            logger.info(
-                f"  Total train batch size (w. parallel, distributed & accumulation) = {self.total_train_batch_size}"
-            )
-            logger.info(f"  Gradient Accumulation steps = {self.args.gradient_accumulation_steps}")
-            logger.info(f"  Steps per epoch = {self.steps_per_epoch}")
-            logger.info(f"  Total optimization steps = {t_total}")
-
-            self.train_loss = tf.keras.metrics.Sum()
-            start_time = datetime.datetime.now()
-
-            for epoch_iter in range(epochs_trained, int(epochs)):
-                # Reset the past mems state at the beginning of each epoch if necessary.
-                if self.args.past_index >= 0:
-                    self._past = None
-
-                for step, batch in enumerate(train_ds):
-                    # Skip past any already trained steps if resuming training
-                    if steps_trained_in_current_epoch > 0:
-                        steps_trained_in_current_epoch -= 1
-                        continue
-
-                    self.distributed_training_steps(batch)
-
-                    self.global_step = iterations.numpy()
-                    self.epoch_logging = epoch_iter + (step + 1) / self.steps_per_epoch
-
-                    training_loss = self.train_loss.result() / (step + 1)
-
-                    if self.args.debug:
-                        logs = {}
-                        logs["loss"] = training_loss.numpy()
-                        logs["epoch"] = self.epoch_logging
-
-                        self.log(logs)
-
-                    if self.global_step == 1 and self.args.debug:
-                        with self.tb_writer.as_default():
-                            tf.summary.trace_export(
-                                name="training", step=self.global_step, profiler_outdir=self.args.logging_dir
-                            )
-
-                    if (
-                        self.args.eval_steps > 0
-                        and self.args.evaluation_strategy == IntervalStrategy.STEPS
-                        and self.global_step % self.args.eval_steps == 0
-                    ):
-                        self.evaluate()
-
-                    if (self.args.logging_steps > 0 and self.global_step % self.args.logging_steps == 0) or (
-                        self.global_step == 1 and self.args.logging_first_step
-                    ):
-                        logs = {}
-                        logs["loss"] = training_loss.numpy()
-                        logs["learning_rate"] = self.lr_scheduler(self.global_step).numpy()
-                        logs["epoch"] = self.epoch_logging
-
-                        self.log(logs)
-
-                    if self.args.save_steps > 0 and self.global_step % self.args.save_steps == 0:
-                        ckpt_save_path = self.model.ckpt_manager.save()
-
-                        logger.info(f"Saving checkpoint for step {self.global_step} at {ckpt_save_path}")
-
-                    if self.args.max_steps > 0 and self.global_step >= t_total:
-                        break
-
-                    if self.global_step % self.steps_per_epoch == 0:
-                        break
-
-                self.train_loss.reset_states()
-
-                if self.args.max_steps > 0 and self.global_step >= self.args.max_steps:
-                    break
-
-            end_time = datetime.datetime.now()
-
-            logger.info(f"Training took: {str(end_time - start_time)}")
-
-        if self.args.past_index and hasattr(self, "_past"):
-            # Clean the state at the end of training
-            delattr(self, "_past")
-
-    def training_step(self, features, labels, nb_instances_in_global_batch):
-        """
-        Perform a training step on features and labels.
-
-        Subclass and override to inject some custom behavior.
-        """
-        per_example_loss, _ = self.run_model(features, labels, True)
-        scaled_loss = per_example_loss / tf.cast(nb_instances_in_global_batch, dtype=per_example_loss.dtype)
-        gradients = tf.gradients(scaled_loss, self.model.trainable_variables)
-        gradients = [
-            g if g is not None else tf.zeros_like(v) for g, v in zip(gradients, self.model.trainable_variables)
-        ]
-
-        if self.args.gradient_accumulation_steps > 1:
-            self.gradient_accumulator(gradients)
-
-        self.train_loss.update_state(scaled_loss)
-
-        if self.args.gradient_accumulation_steps == 1:
-            return gradients
-
-    def apply_gradients(self, features, labels, nb_instances_in_global_batch):
-        if self.args.gradient_accumulation_steps == 1:
-            gradients = self.training_step(features, labels, nb_instances_in_global_batch)
-
-            self.optimizer.apply_gradients(list(zip(gradients, self.model.trainable_variables)))
-        else:
-            for _ in tf.range(self.args.gradient_accumulation_steps):
-                reduced_features = {
-                    k: ft[: self.args.train_batch_size // self.args.n_replicas] for k, ft in features.items()
-                }
-
-                if tf.is_tensor(labels):
-                    reduced_labels = labels[: self.args.train_batch_size // self.args.n_replicas]
-                elif isinstance(labels, dict):
-                    reduced_labels = {
-                        k: lbl[: self.args.train_batch_size // self.args.n_replicas] for k, lbl in labels.items()
-                    }
-                else:
-                    raise ValueError("The labels must be either a tf.Tensor or a dict.")
-
-                self.training_step(reduced_features, reduced_labels, nb_instances_in_global_batch)
-
-                features = {
-                    k: tf.concat(
-                        [ft[self.args.train_batch_size // self.args.n_replicas :], reduced_features[k]],
-                        axis=0,
-                    )
-                    for k, ft in features.items()
-                }
-
-                if tf.is_tensor(labels):
-                    labels = tf.concat(
-                        [labels[self.args.train_batch_size // self.args.n_replicas :], reduced_labels], axis=0
-                    )
-                elif isinstance(labels, dict):
-                    labels = {
-                        k: tf.concat(
-                            [lbl[self.args.train_batch_size // self.args.n_replicas :], reduced_labels[k]],
-                            axis=0,
-                        )
-                        for k, lbl in labels.items()
-                    }
-                else:
-                    raise ValueError("The labels must be either a tf.Tensor or a dict.")
-
-            gradients = self.gradient_accumulator.gradients
-            gradients = [
-                (tf.clip_by_value(grad, -self.args.max_grad_norm, self.args.max_grad_norm)) for grad in gradients
-            ]
-
-            self.optimizer.apply_gradients(list(zip(gradients, self.model.trainable_variables)))
-            self.gradient_accumulator.reset()
-
-    @tf.function
-    def distributed_training_steps(self, batch):
-        with self.args.strategy.scope():
-            nb_instances_in_batch = self._compute_nb_instances(batch)
-            inputs = self._get_step_inputs(batch, nb_instances_in_batch)
-
-            self.args.strategy.run(self.apply_gradients, inputs)
-
-    @staticmethod
-    def _compute_nb_instances(batch):
-        labels = batch[-1]
-        if isinstance(labels, PerReplica):
-            labels = tf.concat(labels.values, axis=0)
-
-        nb_instances = tf.reduce_sum(tf.cast(labels != -100, dtype=tf.int32))
-
-        return nb_instances
-
-    @staticmethod
-    def _get_step_inputs(batch, nb_instances):
-        features, labels = batch
-
-        if isinstance(labels, PerReplica):
-            # need to make a `PerReplica` objects for ``nb_instances``
-            nb_instances = PerReplica([nb_instances] * len(labels.values))
-
-        step_inputs = (features, labels, nb_instances)
-
-        return step_inputs
-
-    def run_model(self, features, labels, training):
-        """
-        Computes the loss of the given features and labels pair.
-
-        Subclass and override this method if you want to inject some custom behavior.
-
-        Args:
-            features (`tf.Tensor`): A batch of input features.
-            labels (`tf.Tensor`): A batch of labels.
-            training (`bool`): Whether or not to run the model in training mode.
-
-        Returns:
-            A tuple of two `tf.Tensor`: The loss and logits.
-        """
-
-        if self.args.past_index >= 0 and getattr(self, "_past", None) is not None:
-            features["mems"] = self._past
-
-        if isinstance(labels, (dict)):
-            outputs = self.model(features, training=training, **labels)[:2]
-        else:
-            outputs = self.model(features, labels=labels, training=training)[:2]
-
-        loss, logits = outputs[:2]
-
-        if self.args.past_index >= 0:
-            self._past = outputs[self.args.past_index]
-
-        return loss, logits
-
-    def predict(self, test_dataset: tf.data.Dataset) -> PredictionOutput:
-        """
-        Run prediction and returns predictions and potential metrics.
-
-        Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method
-        will also return metrics, like in `evaluate()`.
-
-        Args:
-            test_dataset ([`~tf.data.Dataset`]):
-                Dataset to run the predictions on. The dataset should yield tuples of `(features, labels)` where
-                `features` is a dict of input features and `labels` is the labels. If `labels` is a tensor, the loss is
-                calculated by the model by calling `model(features, labels=labels)`. If `labels` is a dict, such as
-                when using a QuestionAnswering head model with multiple targets, the loss is instead calculated by
-                calling `model(features, **labels)`
-
-        Returns: *NamedTuple* A namedtuple with the following keys:
-
-            - predictions (`np.ndarray`): The predictions on `test_dataset`.
-            - label_ids (`np.ndarray`, *optional*): The labels (if the dataset contained some).
-            - metrics (`Dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset contained
-              labels).
-        """
-        test_ds, steps, num_examples = self.get_test_tfdataset(test_dataset)
-
-        return self.prediction_loop(test_ds, steps, num_examples, description="Prediction")
-
-    def save_model(self, output_dir: Optional[str] = None):
-        """
-        Will save the model, so you can reload it using `from_pretrained()`.
-        """
-        output_dir = output_dir if output_dir is not None else self.args.output_dir
-
-        logger.info(f"Saving model in {output_dir}")
-
-        if not isinstance(self.model, TFPreTrainedModel):
-            raise ValueError("Trainer.model appears to not be a PreTrainedModel")
-
-        self.model.save_pretrained(output_dir)
--- a/src/transformers/trainer_utils.py
+++ b/src/transformers/trainer_utils.py
@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Utilities for the Trainer and TFTrainer class. Should be independent from PyTorch and TensorFlow.
+PyTorch-independent utilities for the Trainer class.
 """

 import copy
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@ -379,8 +379,6 @@ class TrainingArguments:
            set to warn or lower (default), `False` otherwise.
        remove_unused_columns (`bool`, *optional*, defaults to `True`):
            Whether or not to automatically remove the columns unused by the model forward method.
-
-            (Note that this behavior is not implemented for [`TFTrainer`] yet.)
        label_names (`List[str]`, *optional*):
            The list of keys in your dictionary of inputs that correspond to the labels.

--- a/src/transformers/utils/dummy_tf_objects.py
+++ b/src/transformers/utils/dummy_tf_objects.py
@ -2993,10 +2993,3 @@ class WarmUp(metaclass=DummyObject):

 def create_optimizer(*args, **kwargs):
    requires_backends(create_optimizer, ["tf"])
-
-
-class TFTrainer(metaclass=DummyObject):
-    _backends = ["tf"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["tf"])
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@ -944,7 +944,6 @@ DEPRECATED_OBJECTS = [
    "xnli_output_modes",
    "xnli_processors",
    "xnli_tasks_num_labels",
-    "TFTrainer",
    "TFTrainingArguments",
 ]

--- a/utils/not_doctested.txt
+++ b/utils/not_doctested.txt
@ -965,7 +965,6 @@ src/transformers/trainer.py
 src/transformers/trainer_callback.py
 src/transformers/trainer_pt_utils.py
 src/transformers/trainer_seq2seq.py
-src/transformers/trainer_tf.py
 src/transformers/trainer_utils.py
 src/transformers/training_args.py
 src/transformers/training_args_seq2seq.py