diff --git a/src/transformers/activations.py b/src/transformers/activations.py
index 15f0397535e..2dab2fb32cd 100644
--- a/src/transformers/activations.py
+++ b/src/transformers/activations.py
@@ -16,7 +16,6 @@ import math
 from collections import OrderedDict
 
 import torch
-from packaging import version
 from torch import Tensor, nn
 
 from .utils import logging
@@ -34,14 +33,6 @@ class PytorchGELUTanh(nn.Module):
     match due to rounding errors.
     """
 
-    def __init__(self):
-        super().__init__()
-        if version.parse(torch.__version__) < version.parse("1.12.0"):
-            raise ImportError(
-                f"You are using torch=={torch.__version__}, but torch>=1.12.0 is required to use "
-                "PytorchGELUTanh. Please upgrade torch."
-            )
-
     def forward(self, input: Tensor) -> Tensor:
         return nn.functional.gelu(input, approximate="tanh")
 
@@ -145,10 +136,7 @@ class MishActivation(nn.Module):
 
     def __init__(self):
         super().__init__()
-        if version.parse(torch.__version__) < version.parse("1.9.0"):
-            self.act = self._mish_python
-        else:
-            self.act = nn.functional.mish
+        self.act = nn.functional.mish
 
     def _mish_python(self, input: Tensor) -> Tensor:
         return input * torch.tanh(nn.functional.softplus(input))
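
[Illustrative aside, not part of the patch] With the version guards above removed, both activation paths rely on operators that every torch release the library still supports provides natively. A minimal sanity check, assuming torch >= 2.1 is installed:

import torch
from torch import nn

x = torch.randn(4)

# PytorchGELUTanh.forward now always calls the fused tanh approximation.
gelu_tanh = nn.functional.gelu(x, approximate="tanh")

# MishActivation.act is now always the native op; the pure-Python formula
# x * tanh(softplus(x)) survives only as the private _mish_python helper.
assert torch.allclose(nn.functional.mish(x), x * torch.tanh(nn.functional.softplus(x)), atol=1e-6)
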
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index bd27041a403..226ca6ee616 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -1500,7 +1500,6 @@ class ModuleUtilsMixin:
         seq_ids = torch.arange(seq_length, device=device)
         causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
         # in case past_key_values are used we need to add a prefix ones mask to the causal mask
-        # causal and attention masks must have same type with pytorch version < 1.3
         causal_mask = causal_mask.to(attention_mask.dtype)
 
         if causal_mask.shape[1] < attention_mask.shape[1]:
diff --git a/src/transformers/models/blip/modeling_blip_text.py b/src/transformers/models/blip/modeling_blip_text.py
index db8ad939725..15087677e26 100644
--- a/src/transformers/models/blip/modeling_blip_text.py
+++ b/src/transformers/models/blip/modeling_blip_text.py
@@ -633,7 +633,6 @@ class BlipTextModel(BlipTextPreTrainedModel):
                 seq_ids = torch.arange(seq_length, device=device)
                 causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
                 # in case past_key_values are used we need to add a prefix ones mask to the causal mask
-                # causal and attention masks must have same type with pytorch version < 1.3
                 causal_mask = causal_mask.to(attention_mask.dtype)
 
                 if causal_mask.shape[1] < attention_mask.shape[1]:
diff --git a/src/transformers/models/code_llama/tokenization_code_llama_fast.py b/src/transformers/models/code_llama/tokenization_code_llama_fast.py
index c9d4c34b86a..f6a17ebc6d1 100644
--- a/src/transformers/models/code_llama/tokenization_code_llama_fast.py
+++ b/src/transformers/models/code_llama/tokenization_code_llama_fast.py
@@ -20,11 +20,8 @@ from tokenizers import normalizers, processors
 
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
 from ...utils import is_sentencepiece_available, logging
-from ...utils.versions import require_version
 
 
-require_version("tokenizers>=0.13.3")
-
 if is_sentencepiece_available():
     from .tokenization_code_llama import CodeLlamaTokenizer
 else:
diff --git a/src/transformers/models/cohere/tokenization_cohere_fast.py b/src/transformers/models/cohere/tokenization_cohere_fast.py
index c8b0f6d3fed..d679f0c4e3a 100644
--- a/src/transformers/models/cohere/tokenization_cohere_fast.py
+++ b/src/transformers/models/cohere/tokenization_cohere_fast.py
@@ -23,11 +23,8 @@ from tokenizers import processors
 from ...tokenization_utils_base import BatchEncoding
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
 from ...utils import logging
-from ...utils.versions import require_version
 
 
-require_version("tokenizers>=0.13.3")
-
 logger = logging.get_logger(__name__)
 
 VOCAB_FILES_NAMES = {"tokenizer_file": "tokenizer.json"}
diff --git a/src/transformers/models/gemma/tokenization_gemma_fast.py b/src/transformers/models/gemma/tokenization_gemma_fast.py
index cb15e47d30a..24e2c90c307 100644
--- a/src/transformers/models/gemma/tokenization_gemma_fast.py
+++ b/src/transformers/models/gemma/tokenization_gemma_fast.py
@@ -20,11 +20,8 @@ from tokenizers import processors
 
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
 from ...utils import is_sentencepiece_available, logging
-from ...utils.versions import require_version
 
 
-require_version("tokenizers>=0.13.3")
-
 if is_sentencepiece_available():
     from .tokenization_gemma import GemmaTokenizer
 else:
diff --git a/src/transformers/models/gpt_neo/modeling_gpt_neo.py b/src/transformers/models/gpt_neo/modeling_gpt_neo.py
index 9151590b926..2f723e4698f 100755
--- a/src/transformers/models/gpt_neo/modeling_gpt_neo.py
+++ b/src/transformers/models/gpt_neo/modeling_gpt_neo.py
@@ -42,7 +42,6 @@ from ...utils import (
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
     is_torch_flex_attn_available,
-    is_torch_fx_available,
     logging,
 )
 from .configuration_gpt_neo import GPTNeoConfig
@@ -60,8 +59,7 @@ if is_flash_attn_available():
 
 # This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
 # It means that the function will not be traced through and simply appear as a node in the graph.
-if is_torch_fx_available():
-    _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)
+_prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)
 
 logger = logging.get_logger(__name__)
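
[Illustrative aside, not part of the patch] The gpt_neo hunk above (and the phimoe hunk below) now registers `_prepare_4d_causal_attention_mask` as an FX leaf unconditionally. A minimal sketch of what `torch.fx.wrap` does, using a made-up helper name:

import torch
import torch.fx


def _build_causal_mask(length: int) -> torch.Tensor:
    # Data-dependent construction like this is awkward to trace through symbolically.
    return torch.tril(torch.ones(length, length))


# Same idiom as in the diff: the helper becomes a leaf, i.e. a single
# call_function node in the traced graph rather than being traced through.
_build_causal_mask = torch.fx.wrap(_build_causal_mask)


class TinyModel(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x * _build_causal_mask(x.size(-1))


print(torch.fx.symbolic_trace(TinyModel()).graph)
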
diff --git a/src/transformers/models/llama/tokenization_llama_fast.py b/src/transformers/models/llama/tokenization_llama_fast.py
index 417a2078d27..c348322f2b0 100644
--- a/src/transformers/models/llama/tokenization_llama_fast.py
+++ b/src/transformers/models/llama/tokenization_llama_fast.py
@@ -20,11 +20,8 @@ from tokenizers import processors
 
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
 from ...utils import is_sentencepiece_available, logging
-from ...utils.versions import require_version
 
 
-require_version("tokenizers>=0.13.3")
-
 if is_sentencepiece_available():
     from .tokenization_llama import LlamaTokenizer
 else:
diff --git a/src/transformers/models/phimoe/modeling_phimoe.py b/src/transformers/models/phimoe/modeling_phimoe.py
index 7e94c27cf9b..0769b91909f 100644
--- a/src/transformers/models/phimoe/modeling_phimoe.py
+++ b/src/transformers/models/phimoe/modeling_phimoe.py
@@ -42,7 +42,6 @@ from ...utils import (
     replace_return_docstrings,
 )
 from ...utils.deprecation import deprecate_kwarg
-from ...utils.import_utils import is_torch_fx_available
 from .configuration_phimoe import PhimoeConfig
 
 
@@ -51,8 +50,7 @@ if is_flash_attn_available():
 
 # This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
 # It means that the function will not be traced through and simply appear as a node in the graph.
-if is_torch_fx_available():
-    _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)
+_prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/vilt/modeling_vilt.py b/src/transformers/models/vilt/modeling_vilt.py
index 00dc6f5ce7e..1aa529c26dc 100755
--- a/src/transformers/models/vilt/modeling_vilt.py
+++ b/src/transformers/models/vilt/modeling_vilt.py
@@ -171,7 +171,7 @@ class ViltEmbeddings(nn.Module):
             select = torch.cat(select, dim=0)
             x = x[select[:, 0], select[:, 1]].view(batch_size, -1, num_channels)
             x_mask = x_mask[select[:, 0], select[:, 1]].view(batch_size, -1)
-            # `patch_index` should be on the same device as `select` (for torch>=1.13), which is ensured at definition time.
+            # `patch_index` should be on the same device as `select`, which is ensured at definition time.
             patch_index = patch_index[select[:, 0], select[:, 1]].view(batch_size, -1, 2)
             pos_embed = pos_embed[select[:, 0], select[:, 1]].view(batch_size, -1, num_channels)
diff --git a/src/transformers/optimization.py b/src/transformers/optimization.py
index ba9d9920c1c..98e07f78f71 100644
--- a/src/transformers/optimization.py
+++ b/src/transformers/optimization.py
@@ -25,7 +25,6 @@ from torch.optim.lr_scheduler import LambdaLR, ReduceLROnPlateau
 from .trainer_pt_utils import LayerWiseDummyOptimizer, LayerWiseDummyScheduler
 from .trainer_utils import SchedulerType
 from .utils import logging
-from .utils.versions import require_version
 
 
 logger = logging.get_logger(__name__)
@@ -701,7 +700,6 @@ class Adafactor(Optimizer):
         relative_step=True,
         warmup_init=False,
     ):
-        require_version("torch>=1.5.0")  # add_ with alpha
         if lr is not None and relative_step:
             raise ValueError("Cannot combine manual `lr` and `relative_step=True` options")
         if warmup_init and not relative_step:
diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index 18b2a9527d2..49ea8ef1777 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -138,7 +138,6 @@ from .utils import (
     is_tokenizers_available,
     is_torch_available,
     is_torch_bf16_available_on_device,
-    is_torch_bf16_cpu_available,
     is_torch_bf16_gpu_available,
     is_torch_deterministic,
     is_torch_fp16_available_on_device,
@@ -1073,14 +1072,6 @@ def require_torch_bf16_gpu(test_case):
     )(test_case)
 
 
-def require_torch_bf16_cpu(test_case):
-    """Decorator marking a test that requires torch>=1.10, using CPU."""
-    return unittest.skipUnless(
-        is_torch_bf16_cpu_available(),
-        "test requires torch>=1.10, using CPU",
-    )(test_case)
-
-
 def require_deterministic_for_xpu(test_case):
     if is_torch_xpu_available():
         return unittest.skipUnless(is_torch_deterministic(), "test requires torch to use deterministic algorithms")(
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 20e8d3389bf..af2cae600b1 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -164,7 +164,6 @@ from .utils import (
     is_sagemaker_dp_enabled,
     is_sagemaker_mp_enabled,
     is_schedulefree_available,
-    is_torch_compile_available,
     is_torch_hpu_available,
     is_torch_mlu_available,
     is_torch_mps_available,
@@ -257,7 +256,7 @@ if is_accelerate_available("0.28.0"):
 
 def _is_peft_model(model):
     if is_peft_available():
-        classes_to_check = (PeftModel,) if is_peft_available() else ()
+        classes_to_check = (PeftModel,)
         # Here we also check if the model is an instance of `PeftMixedModel` introduced in peft>=0.7.0: https://github.com/huggingface/transformers/pull/28321
         if version.parse(importlib.metadata.version("peft")) >= version.parse("0.7.0"):
             from peft import PeftMixedModel
@@ -797,10 +796,6 @@ class Trainer:
         # very last
         self._memory_tracker.stop_and_update_metrics()
 
-        # torch.compile
-        if args.torch_compile and not is_torch_compile_available():
-            raise RuntimeError("Using torch.compile requires PyTorch 2.0 or higher.")
-
         self.is_fsdp_xla_v2_enabled = args.fsdp_config.get("xla_fsdp_v2", False)
         if self.is_fsdp_xla_v2_enabled:
             if not IS_XLA_FSDPV2_POST_2_2:
@@ -1987,7 +1982,7 @@ class Trainer:
         if self.accelerator.unwrap_model(model) is not model:
             return model
 
-        # Mixed precision training with apex (torch < 1.6)
+        # Mixed precision training with apex
         if self.use_apex and training:
             model, self.optimizer = amp.initialize(model, self.optimizer, opt_level=self.args.fp16_opt_level)
 
@@ -3739,7 +3734,7 @@ class Trainer:
             torch.musa.empty_cache()
         elif is_torch_npu_available():
             torch.npu.empty_cache()
-        elif is_torch_mps_available(min_version="2.0"):
+        elif is_torch_mps_available():
             torch.mps.empty_cache()
         elif is_torch_hpu_available():
             logger.warning(
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index bd3426614c8..65fd93a79ea 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -44,7 +44,6 @@ from .utils import (
     is_sagemaker_dp_enabled,
     is_sagemaker_mp_enabled,
     is_torch_available,
-    is_torch_bf16_cpu_available,
     is_torch_bf16_gpu_available,
     is_torch_hpu_available,
     is_torch_mlu_available,
@@ -1161,7 +1160,6 @@ class TrainingArguments:
             "help": (
                 "Number of batches loaded in advance by each worker. "
                 "2 means there will be a total of 2 * num_workers batches prefetched across all workers. "
-                "Default is 2 for PyTorch < 2.0.0 and otherwise None."
             )
         },
     )
@@ -1681,7 +1679,7 @@ class TrainingArguments:
             self.half_precision_backend = self.fp16_backend
 
         if self.bf16 or self.bf16_full_eval:
-            if self.use_cpu and not is_torch_bf16_cpu_available() and not is_torch_xla_available():
+            if self.use_cpu and not is_torch_available() and not is_torch_xla_available():
                 # cpu
                 raise ValueError("Your setup doesn't support bf16/(cpu, tpu, neuroncore). You need torch>=1.10")
             elif not self.use_cpu:
diff --git a/src/transformers/utils/fx.py b/src/transformers/utils/fx.py
index 6ab0c45d996..4063d90c6aa 100755
--- a/src/transformers/utils/fx.py
+++ b/src/transformers/utils/fx.py
@@ -61,10 +61,7 @@ from ..models.auto.modeling_auto import (
 )
 from .import_utils import (
     ENV_VARS_TRUE_VALUES,
-    TORCH_FX_REQUIRED_VERSION,
-    get_torch_version,
     is_peft_available,
-    is_torch_fx_available,
 )
 
 
@@ -891,12 +888,6 @@ class HFTracer(Tracer):
     def __init__(self, autowrap_modules=(math,), autowrap_functions=()):
         super().__init__(autowrap_modules=autowrap_modules, autowrap_functions=autowrap_functions)
 
-        if not is_torch_fx_available():
-            raise ImportError(
-                f"Found an incompatible version of torch. Found version {get_torch_version()}, but only version "
-                f"{TORCH_FX_REQUIRED_VERSION} is supported."
-            )
-
     def _generate_dummy_input(
         self, model: "PreTrainedModel", input_name: str, shape: list[int], input_names: list[str]
     ) -> dict[str, torch.Tensor]:
diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py
index 75c88dd019e..87a43692b2f 100644
--- a/src/transformers/utils/import_utils.py
+++ b/src/transformers/utils/import_utils.py
@@ -222,6 +222,10 @@ _torch_version = "N/A"
 _torch_available = False
 if USE_TORCH in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TF not in ENV_VARS_TRUE_VALUES:
     _torch_available, _torch_version = _is_package_available("torch", return_version=True)
+    if _torch_available:
+        _torch_available = version.parse(_torch_version) >= version.parse("2.1.0")
+        if not _torch_available:
+            logger.warning(f"Disabling PyTorch because PyTorch >= 2.1 is required but found {_torch_version}")
 else:
     logger.info("Disabling PyTorch because USE_TF is set")
     _torch_available = False
@@ -310,15 +314,6 @@ if USE_JAX in ENV_VARS_TRUE_AND_AUTO_VALUES:
     _jax_version = _flax_version = "N/A"
 
 
-_torch_fx_available = False
-if _torch_available:
-    torch_version = version.parse(_torch_version)
-    _torch_fx_available = (torch_version.major, torch_version.minor) >= (
-        TORCH_FX_REQUIRED_VERSION.major,
-        TORCH_FX_REQUIRED_VERSION.minor,
-    )
-
-
 _torch_xla_available = False
 if USE_TORCH_XLA in ENV_VARS_TRUE_VALUES:
     _torch_xla_available, _torch_xla_version = _is_package_available("torch_xla", return_version=True)
@@ -526,19 +521,8 @@ def is_torch_bf16_gpu_available():
     return torch.cuda.is_available() and torch.cuda.is_bf16_supported()
 
 
-def is_torch_bf16_cpu_available():
-    if not is_torch_available():
-        return False
-
-    import torch
-
-    try:
-        # multiple levels of AttributeError depending on the pytorch version so do them all in one check
-        _ = torch.cpu.amp.autocast
-    except AttributeError:
-        return False
-
-    return True
+def is_torch_bf16_cpu_available() -> bool:
+    return is_torch_available()
 
 
 def is_torch_bf16_available():
@@ -618,16 +602,11 @@ def is_torch_tf32_available():
         return False
     if torch.cuda.get_device_properties(torch.cuda.current_device()).major < 8:
         return False
-    if int(torch.version.cuda.split(".")[0]) < 11:
-        return False
-    if version.parse(version.parse(torch.__version__).base_version) < version.parse("1.7"):
-        return False
-
     return True
 
 
 def is_torch_fx_available():
-    return _torch_fx_available
+    return is_torch_available()
 
 
 def is_peft_available():
@@ -832,21 +811,11 @@ def is_habana_gaudi1():
 
 
 def is_torchdynamo_available():
-    if not is_torch_available():
-        return False
-
-    return True
+    return is_torch_available()
 
 
 def is_torch_compile_available():
-    if not is_torch_available():
-        return False
-
-    import torch
-
-    # We don't do any version check here to support nighlies marked as 1.14. Ultimately needs to check version against
-    # 2.0 but let's do it later.
-    return hasattr(torch, "compile")
+    return is_torch_available()
 
 
 def is_torchdynamo_compiling():
@@ -979,10 +948,10 @@ def is_torch_xpu_available(check_device=False):
         return False
 
     torch_version = version.parse(_torch_version)
-    if torch_version.major < 2 or (torch_version.major == 2 and torch_version.minor < 6):
+    if torch_version.major == 2 and torch_version.minor < 6:
         if is_ipex_available():
             import intel_extension_for_pytorch  # noqa: F401
-        elif torch_version.major < 2 or (torch_version.major == 2 and torch_version.minor < 4):
+        elif torch_version.major == 2 and torch_version.minor < 4:
             return False
 
     import torch
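
[Illustrative aside, not part of the patch] The net effect of the import_utils.py hunks above is that torch only counts as available when it meets the new 2.1 floor; helpers that used to carry their own version checks now simply defer to that gate. A standalone sketch of the same logic (the helper name is ours, not the library's):

import importlib.metadata

from packaging import version


def torch_meets_minimum(minimum: str = "2.1.0") -> bool:
    # Missing torch, or torch older than the floor, both count as "unavailable".
    try:
        torch_version = importlib.metadata.version("torch")
    except importlib.metadata.PackageNotFoundError:
        return False
    return version.parse(torch_version) >= version.parse(minimum)
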
diff --git a/tests/fsdp/test_fsdp.py b/tests/fsdp/test_fsdp.py
index 68309d94ea5..96c1860860f 100644
--- a/tests/fsdp/test_fsdp.py
+++ b/tests/fsdp/test_fsdp.py
@@ -323,7 +323,6 @@ class TrainerIntegrationFSDP(TestCasePlus, TrainerIntegrationCommon):
 
     @require_torch_multi_accelerator
     @slow
-    @require_fsdp
     @require_fsdp_v2_version
     @require_accelerate_fsdp2
     def test_accelerate_fsdp2_integration(self):
diff --git a/tests/models/bert/test_modeling_bert.py b/tests/models/bert/test_modeling_bert.py
index 20d7b602385..a6aac8e3829 100644
--- a/tests/models/bert/test_modeling_bert.py
+++ b/tests/models/bert/test_modeling_bert.py
@@ -510,7 +510,6 @@ class BertModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
         self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
 
     def test_model_as_decoder_with_default_input_mask(self):
-        # This regression test was failing with PyTorch < 1.3
         (
             config,
             input_ids,
diff --git a/tests/models/bert_generation/test_modeling_bert_generation.py b/tests/models/bert_generation/test_modeling_bert_generation.py
index 04a774ae8b0..e639f31073a 100644
--- a/tests/models/bert_generation/test_modeling_bert_generation.py
+++ b/tests/models/bert_generation/test_modeling_bert_generation.py
@@ -273,7 +273,6 @@ class BertGenerationEncoderTest(ModelTesterMixin, GenerationTesterMixin, Pipelin
         self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
 
     def test_model_as_decoder_with_default_input_mask(self):
-        # This regression test was failing with PyTorch < 1.3
         (
             config,
             input_ids,
diff --git a/tests/models/big_bird/test_modeling_big_bird.py b/tests/models/big_bird/test_modeling_big_bird.py
index e43c076bec8..bdab0f73b65 100644
--- a/tests/models/big_bird/test_modeling_big_bird.py
+++ b/tests/models/big_bird/test_modeling_big_bird.py
@@ -506,7 +506,6 @@ class BigBirdModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
         self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
 
     def test_model_as_decoder_with_default_input_mask(self):
-        # This regression test was failing with PyTorch < 1.3
         (
             config,
             input_ids,
diff --git a/tests/models/chinese_clip/test_modeling_chinese_clip.py b/tests/models/chinese_clip/test_modeling_chinese_clip.py
index 9442be5462c..520ff2af3dd 100644
--- a/tests/models/chinese_clip/test_modeling_chinese_clip.py
+++ b/tests/models/chinese_clip/test_modeling_chinese_clip.py
@@ -354,7 +354,6 @@ class ChineseCLIPTextModelTest(ModelTesterMixin, unittest.TestCase):
         self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
 
     def test_model_as_decoder_with_default_input_mask(self):
-        # This regression test was failing with PyTorch < 1.3
         (
             config,
             input_ids,
diff --git a/tests/models/data2vec/test_modeling_data2vec_text.py b/tests/models/data2vec/test_modeling_data2vec_text.py
index fa9a53fb816..acb18b3d8e8 100644
--- a/tests/models/data2vec/test_modeling_data2vec_text.py
+++ b/tests/models/data2vec/test_modeling_data2vec_text.py
@@ -409,7 +409,6 @@ class Data2VecTextModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTes
         self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
 
     def test_model_as_decoder_with_default_input_mask(self):
-        # This regression test was failing with PyTorch < 1.3
         (
             config,
             input_ids,
diff --git a/tests/models/ernie/test_modeling_ernie.py b/tests/models/ernie/test_modeling_ernie.py
index aec833a7f4d..7e99ba8e81d 100644
--- a/tests/models/ernie/test_modeling_ernie.py
+++ b/tests/models/ernie/test_modeling_ernie.py
@@ -492,7 +492,6 @@ class ErnieModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
         self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
 
     def test_model_as_decoder_with_default_input_mask(self):
-        # This regression test was failing with PyTorch < 1.3
         (
             config,
             input_ids,
diff --git a/tests/models/gpt_neox/test_modeling_gpt_neox.py b/tests/models/gpt_neox/test_modeling_gpt_neox.py
index a9d7e84dd08..33c79f2a7b1 100644
--- a/tests/models/gpt_neox/test_modeling_gpt_neox.py
+++ b/tests/models/gpt_neox/test_modeling_gpt_neox.py
@@ -306,7 +306,6 @@ class GPTNeoXModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
         self.model_tester.create_and_check_model_as_decoder(config, input_ids, input_mask)
 
     def test_model_as_decoder_with_default_input_mask(self):
-        # This regression test was failing with PyTorch < 1.3
         config, input_ids, input_mask, token_labels = self.model_tester.prepare_config_and_inputs_for_decoder()
 
         input_mask = None
diff --git a/tests/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py b/tests/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py
index d8d20278630..168a0f2eebf 100644
--- a/tests/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py
+++ b/tests/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py
@@ -223,7 +223,6 @@ class GPTNeoXModelJapaneseTest(ModelTesterMixin, GenerationTesterMixin, Pipeline
         self.model_tester.create_and_check_model_as_decoder(config, input_ids, input_mask)
 
     def test_model_as_decoder_with_default_input_mask(self):
-        # This regression test was failing with PyTorch < 1.3
         config, input_ids, input_mask, token_labels = self.model_tester.prepare_config_and_inputs_for_decoder()
 
         input_mask = None
diff --git a/tests/models/hubert/test_modeling_hubert.py b/tests/models/hubert/test_modeling_hubert.py
index 800a35ab4ad..c4e65cfa5c6 100644
--- a/tests/models/hubert/test_modeling_hubert.py
+++ b/tests/models/hubert/test_modeling_hubert.py
@@ -23,7 +23,6 @@ import pytest
 
 from transformers import HubertConfig, is_torch_available
 from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device
-from transformers.utils import is_torch_fx_available
 
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import (
@@ -48,8 +47,7 @@ if is_torch_available():
     )
     from transformers.models.hubert.modeling_hubert import _compute_mask_indices
 
-if is_torch_fx_available():
-    from transformers.utils.fx import symbolic_trace
+from transformers.utils.fx import symbolic_trace
 
 
 class HubertModelTester:
@@ -438,8 +436,8 @@ class HubertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
             # TODO: fix it
             self.skipTest(reason="torch 2.1 breaks torch fx tests for wav2vec2/hubert.")
 
-        if not is_torch_fx_available() or not self.fx_compatible:
-            self.skipTest(reason="torch fx is not available or not compatible with this model")
+        if not self.fx_compatible:
+            self.skipTest(reason="torch fx is not compatible with this model")
 
         configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
         configs_no_init.return_dict = False
with this model") + if not self.fx_compatible: + self.skipTest(reason="torch fx is not compatible with this model") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.return_dict = False diff --git a/tests/models/mt5/test_modeling_mt5.py b/tests/models/mt5/test_modeling_mt5.py index 185bd149175..7da319e1963 100644 --- a/tests/models/mt5/test_modeling_mt5.py +++ b/tests/models/mt5/test_modeling_mt5.py @@ -27,7 +27,7 @@ from transformers.testing_utils import ( slow, torch_device, ) -from transformers.utils import is_torch_fx_available +from transformers.utils.fx import symbolic_trace from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester @@ -35,9 +35,6 @@ from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_ten from ...test_pipeline_mixin import PipelineTesterMixin -if is_torch_fx_available(): - from transformers.utils.fx import symbolic_trace - if is_torch_available(): import torch import torch.nn.functional as F @@ -598,8 +595,8 @@ class MT5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, return False def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=False): - if not is_torch_fx_available() or not self.fx_compatible: - self.skipTest(reason="torch.fx is not available or not compatible with this model") + if not self.fx_compatible: + self.skipTest(reason="torch.fx is not compatible with this model") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.return_dict = False diff --git a/tests/models/rembert/test_modeling_rembert.py b/tests/models/rembert/test_modeling_rembert.py index 66eeab10181..e7804c59799 100644 --- a/tests/models/rembert/test_modeling_rembert.py +++ b/tests/models/rembert/test_modeling_rembert.py @@ -416,7 +416,6 @@ class RemBertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) def test_model_as_decoder_with_default_input_mask(self): - # This regression test was failing with PyTorch < 1.3 ( config, input_ids, diff --git a/tests/models/roberta/test_modeling_roberta.py b/tests/models/roberta/test_modeling_roberta.py index d92cfbbb13e..4f4d93b07f4 100644 --- a/tests/models/roberta/test_modeling_roberta.py +++ b/tests/models/roberta/test_modeling_roberta.py @@ -417,7 +417,6 @@ class RobertaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) def test_model_as_decoder_with_default_input_mask(self): - # This regression test was failing with PyTorch < 1.3 ( config, input_ids, diff --git a/tests/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py b/tests/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py index 4de46795960..7bb0de874d9 100644 --- a/tests/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py +++ b/tests/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py @@ -421,7 +421,6 @@ class RobertaPreLayerNormModelTest(ModelTesterMixin, GenerationTesterMixin, Pipe # Copied from tests.models.roberta.test_modeling_roberta.RobertaModelTest.test_model_as_decoder_with_default_input_mask def test_model_as_decoder_with_default_input_mask(self): - # This regression test was failing with PyTorch < 1.3 ( config, input_ids, diff --git a/tests/models/roc_bert/test_modeling_roc_bert.py b/tests/models/roc_bert/test_modeling_roc_bert.py index 
diff --git a/tests/models/roformer/test_modeling_roformer.py b/tests/models/roformer/test_modeling_roformer.py
index 7dca92ddf4c..3b94cac79ea 100644
--- a/tests/models/roformer/test_modeling_roformer.py
+++ b/tests/models/roformer/test_modeling_roformer.py
@@ -433,7 +433,6 @@ class RoFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
         self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
 
     def test_model_as_decoder_with_default_input_mask(self):
-        # This regression test was failing with PyTorch < 1.3
         (
             config,
             input_ids,
diff --git a/tests/models/t5/test_modeling_t5.py b/tests/models/t5/test_modeling_t5.py
index 4a2bb0232c1..c608a2dbdd5 100644
--- a/tests/models/t5/test_modeling_t5.py
+++ b/tests/models/t5/test_modeling_t5.py
@@ -32,7 +32,8 @@ from transformers.testing_utils import (
     slow,
     torch_device,
 )
-from transformers.utils import cached_property, is_torch_fx_available
+from transformers.utils import cached_property
+from transformers.utils.fx import symbolic_trace
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
@@ -40,10 +41,6 @@ from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_ten
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
-if is_torch_fx_available():
-    from transformers.utils.fx import symbolic_trace
-
-
 if is_torch_available():
     import torch
     import torch.nn.functional as F
@@ -603,8 +600,8 @@ class T5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
             return False
 
     def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=False):
-        if not is_torch_fx_available() or not self.fx_compatible:
-            self.skipTest(reason="torch.fx is not available or not compatible with this model")
+        if not self.fx_compatible:
+            self.skipTest(reason="torch.fx is not compatible with this model")
 
         configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
         configs_no_init.return_dict = False
diff --git a/tests/models/umt5/test_modeling_umt5.py b/tests/models/umt5/test_modeling_umt5.py
index f3ffa8051ee..46a263cf799 100644
--- a/tests/models/umt5/test_modeling_umt5.py
+++ b/tests/models/umt5/test_modeling_umt5.py
@@ -27,7 +27,7 @@ from transformers.testing_utils import (
     slow,
     torch_device,
 )
-from transformers.utils import is_torch_fx_available
+from transformers.utils.fx import symbolic_trace
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
@@ -35,10 +35,6 @@ from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_ten
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
-if is_torch_fx_available():
-    from transformers.utils.fx import symbolic_trace
-
-
 if is_torch_available():
     import torch
     import torch.nn.functional as F
@@ -300,8 +296,8 @@ class UMT5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
             return False
 
     def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=False):
-        if not is_torch_fx_available() or not self.fx_compatible:
-            self.skipTest(reason="torch fx is not available or not compatible with this model")
+        if not self.fx_compatible:
+            self.skipTest(reason="torch fx is not compatible with this model")
 
         configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
         configs_no_init.return_dict = False
self.skipTest(reason="torch fx is not available or not compatible with this model") + if not self.fx_compatible: + self.skipTest(reason="torch fx is not compatible with this model") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.return_dict = False diff --git a/tests/models/wav2vec2/test_modeling_wav2vec2.py b/tests/models/wav2vec2/test_modeling_wav2vec2.py index edafba90ca7..c47d4855ccd 100644 --- a/tests/models/wav2vec2/test_modeling_wav2vec2.py +++ b/tests/models/wav2vec2/test_modeling_wav2vec2.py @@ -42,7 +42,6 @@ from transformers.testing_utils import ( slow, torch_device, ) -from transformers.utils import is_torch_fx_available from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( @@ -90,8 +89,7 @@ if is_pyctcdecode_available(): from transformers.models.wav2vec2_with_lm import processing_wav2vec2_with_lm -if is_torch_fx_available(): - from transformers.utils.fx import symbolic_trace +from transformers.utils.fx import symbolic_trace def _test_wav2vec2_with_lm_invalid_pool(in_queue, out_queue, timeout): @@ -716,8 +714,8 @@ class Wav2Vec2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase # TODO: fix it self.skipTest(reason="torch 2.1 breaks torch fx tests for wav2vec2/hubert.") - if not is_torch_fx_available() or not self.fx_compatible: - self.skipTest(reason="torch fx not available or not compatible with this model") + if not self.fx_compatible: + self.skipTest(reason="torch fx is not compatible with this model") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.return_dict = False diff --git a/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py b/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py index 513277c77c9..543f4ef841d 100644 --- a/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py +++ b/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py @@ -425,7 +425,6 @@ class XLMRobertaXLModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTes self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) def test_model_as_decoder_with_default_input_mask(self): - # This regression test was failing with PyTorch < 1.3 ( config, input_ids, diff --git a/tests/models/xmod/test_modeling_xmod.py b/tests/models/xmod/test_modeling_xmod.py index 754384534ba..8a0c90cd1fc 100644 --- a/tests/models/xmod/test_modeling_xmod.py +++ b/tests/models/xmod/test_modeling_xmod.py @@ -420,7 +420,6 @@ class XmodModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) def test_model_as_decoder_with_default_input_mask(self): - # This regression test was failing with PyTorch < 1.3 ( config, input_ids, diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index be65971c95f..9bea39cecd5 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -101,7 +101,6 @@ from transformers.utils import ( is_accelerate_available, is_torch_bf16_available_on_device, is_torch_fp16_available_on_device, - is_torch_fx_available, is_torch_sdpa_available, ) from transformers.utils.generic import ContextManagers @@ -125,8 +124,8 @@ if is_torch_available(): from transformers.modeling_utils import load_state_dict, no_init_weights from transformers.pytorch_utils import id_tensor_storage -if is_torch_fx_available(): - from transformers.utils.fx import _FX_SUPPORTED_MODELS_WITH_KV_CACHE, symbolic_trace +from transformers.utils.fx import 
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index c2d279509fc..e9f11bf7294 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -99,7 +99,6 @@ from transformers.testing_utils import (
     require_torch_tensorrt_fx,
     require_torch_tf32,
     require_torch_up_to_2_accelerators,
-    require_torchdynamo,
     require_vision,
     require_wandb,
     run_first,
@@ -3994,10 +3993,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
 
     @require_non_xpu
     @require_torch_non_multi_gpu
-    @require_torchdynamo
     @require_torch_tensorrt_fx
     def test_torchdynamo_full_eval(self):
-        import torchdynamo
+        from torch import _dynamo as torchdynamo
 
         # torchdynamo at the moment doesn't support DP/DDP, therefore require a single gpu
         n_gpus = get_gpu_count()
@@ -4017,30 +4015,35 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
             del trainer
 
             # 2. TorchDynamo eager
-            trainer = get_regression_trainer(a=a, b=b, eval_len=eval_len, torchdynamo="eager", output_dir=tmp_dir)
+            trainer = get_regression_trainer(
+                a=a, b=b, eval_len=eval_len, torch_compile_backend="eager", output_dir=tmp_dir
+            )
             metrics = trainer.evaluate()
             self.assertAlmostEqual(metrics["eval_loss"], original_eval_loss)
             del trainer
             torchdynamo.reset()
 
             # 3. TorchDynamo nvfuser
-            trainer = get_regression_trainer(a=a, b=b, eval_len=eval_len, torchdynamo="nvfuser", output_dir=tmp_dir)
+            trainer = get_regression_trainer(
+                a=a, b=b, eval_len=eval_len, torch_compile_backend="nvfuser", output_dir=tmp_dir
+            )
             metrics = trainer.evaluate()
             self.assertAlmostEqual(metrics["eval_loss"], original_eval_loss)
             torchdynamo.reset()
 
             # 4. TorchDynamo fx2trt
-            trainer = get_regression_trainer(a=a, b=b, eval_len=eval_len, torchdynamo="fx2trt", output_dir=tmp_dir)
+            trainer = get_regression_trainer(
+                a=a, b=b, eval_len=eval_len, torch_compile_backend="fx2trt", output_dir=tmp_dir
+            )
             metrics = trainer.evaluate()
             self.assertAlmostEqual(metrics["eval_loss"], original_eval_loss)
             torchdynamo.reset()
 
-    @unittest.skip(reason="torch 2.0.0 gives `ModuleNotFoundError: No module named 'torchdynamo'`.")
     @require_torch_non_multi_gpu
-    @require_torchdynamo
+    @require_torch_gpu
     def test_torchdynamo_memory(self):
         # torchdynamo at the moment doesn't support DP/DDP, therefore require a single gpu
-        import torchdynamo
+        from torch import _dynamo as torchdynamo
 
         class CustomTrainer(Trainer):
             def compute_loss(self, model, inputs, return_outputs=False):
@@ -4085,7 +4088,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         with tempfile.TemporaryDirectory() as tmp_dir:
             a = torch.ones(1024, 1024, device="cuda", requires_grad=True)
             a.grad = None
-            args = TrainingArguments(output_dir=tmp_dir, torchdynamo="nvfuser")
+            args = TrainingArguments(output_dir=tmp_dir, torch_compile_backend="nvfuser")
             trainer = CustomTrainer(model=mod, args=args)
             # warmup
             for _ in range(10):
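
[Illustrative aside, not part of the patch] The updated tests above lean on the dynamo module that ships inside torch and on the `torch_compile_backend` training argument, instead of the removed standalone `torchdynamo` package and `torchdynamo=...` argument; roughly:

from torch import _dynamo as torchdynamo

from transformers import TrainingArguments

# `torch_compile_backend` supersedes the old `torchdynamo=...` argument.
args = TrainingArguments(output_dir="/tmp/out", torch_compile_backend="eager")

# Clear dynamo's compilation caches between experiments, as the tests do.
torchdynamo.reset()
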
diff --git a/tests/trainer/test_trainer_fsdp.py b/tests/trainer/test_trainer_fsdp.py
index 193846ad2cf..690ebd9d80d 100644
--- a/tests/trainer/test_trainer_fsdp.py
+++ b/tests/trainer/test_trainer_fsdp.py
@@ -21,7 +21,6 @@ from transformers.testing_utils import (
     get_torch_dist_unique_port,
     require_accelerate,
     require_fp8,
-    require_fsdp,
     require_torch_multi_accelerator,
     run_first,
     torch_device,
@@ -68,7 +67,6 @@ if is_torch_available():
 class TestFSDPTrainer(TestCasePlus):
     @require_torch_multi_accelerator
     @require_accelerate
-    @require_fsdp
     @run_first
     def test_trainer(self):
         output_dir = self.get_auto_remove_tmp_dir()
@@ -95,7 +93,6 @@ class TestFSDPTrainerFP8(TestCasePlus):
 
     @require_torch_multi_accelerator
     @require_accelerate
-    @require_fsdp
     @require_fp8
     @run_first
     def test_trainer(self):
@@ -125,7 +122,6 @@ class TestFSDPTrainerWrap(TestCasePlus):
 
     @require_torch_multi_accelerator
     @require_accelerate
-    @require_fsdp
     @run_first
     def test_trainer(self):
         output_dir = self.get_auto_remove_tmp_dir()
diff --git a/tests/utils/test_modeling_utils.py b/tests/utils/test_modeling_utils.py
index b75d1187086..b96678f114d 100644
--- a/tests/utils/test_modeling_utils.py
+++ b/tests/utils/test_modeling_utils.py
@@ -81,7 +81,6 @@ from transformers.utils.import_utils import (
     is_tf_available,
     is_torch_npu_available,
     is_torch_sdpa_available,
-    is_torchdynamo_available,
 )
 
 
@@ -1483,8 +1482,6 @@ class ModelUtilsTest(TestCasePlus):
             model.warn_if_padding_and_no_attention_mask(input_ids, attention_mask=None)
             self.assertIn("You may ignore this warning if your `pad_token_id`", cl.out)
 
-        if not is_torchdynamo_available():
-            self.skipTest(reason="torchdynamo is not available")
         with self.subTest("Ensure that the warning code is skipped when compiling with torchdynamo."):
             logger.warning_once.cache_clear()
             from torch._dynamo import config, testing
require_version("python>=3.9.0") # not matching requirements for req in ["python>9.9.9", "python<3.0.0"]: