diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 1cf0f88ad6f..39a9c562349 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -1377,7 +1377,6 @@ else:
             "LogitNormalization",
             "LogitsProcessor",
             "LogitsProcessorList",
-            "LogitsWarper",
             "MaxLengthCriteria",
             "MaxTimeCriteria",
             "MinLengthLogitsProcessor",
@@ -6460,7 +6459,6 @@ if TYPE_CHECKING:
             LogitNormalization,
             LogitsProcessor,
             LogitsProcessorList,
-            LogitsWarper,
             MaxLengthCriteria,
             MaxTimeCriteria,
             MinLengthLogitsProcessor,
diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py
index ad497581c08..e616adbe679 100644
--- a/src/transformers/cache_utils.py
+++ b/src/transformers/cache_utils.py
@@ -63,17 +63,6 @@ class Cache(torch.nn.Module):
         # TODO: deprecate this function in favor of `cache_position`
         raise NotImplementedError("Make sure to implement `get_seq_length` in a subclass.")
 
-    # Deprecate in favor of max-cache-shape because we want to be specifc by what we mean with "max_length"
-    # Prev some cache objects didn't have "max_length" (SlidingWindowCache or SinkCache) because the cache object technically handles
-    # infinite amount of tokens. In the codebase what we really need to check is the max capacity of certain cache instances, so
-    # we change naming to be more explicit
-    def get_max_length(self) -> Optional[int]:
-        logger.warning_once(
-            "`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. "
-            "Calling `get_max_cache()` will raise error from v4.48"
-        )
-        return self.get_max_cache_shape()
-
     def get_max_cache_shape(self) -> Optional[int]:
         """Returns the maximum sequence length (i.e. max capacity) of the cache object"""
         raise NotImplementedError("Make sure to implement `get_max_cache_shape` in a subclass.")
diff --git a/src/transformers/generation/__init__.py b/src/transformers/generation/__init__.py
index d3eb10c1e6b..ea39e8a10b8 100644
--- a/src/transformers/generation/__init__.py
+++ b/src/transformers/generation/__init__.py
@@ -68,7 +68,6 @@ else:
         "LogitNormalization",
         "LogitsProcessor",
         "LogitsProcessorList",
-        "LogitsWarper",
         "MinLengthLogitsProcessor",
         "MinNewTokensLengthLogitsProcessor",
         "MinPLogitsWarper",
@@ -89,7 +88,6 @@ else:
         "WatermarkLogitsProcessor",
     ]
     _import_structure["stopping_criteria"] = [
-        "MaxNewTokensCriteria",
         "MaxLengthCriteria",
         "MaxTimeCriteria",
         "ConfidenceCriteria",
@@ -230,7 +228,6 @@ if TYPE_CHECKING:
            LogitNormalization,
            LogitsProcessor,
            LogitsProcessorList,
-           LogitsWarper,
            MinLengthLogitsProcessor,
            MinNewTokensLengthLogitsProcessor,
            MinPLogitsWarper,
@@ -254,7 +251,6 @@ if TYPE_CHECKING:
            ConfidenceCriteria,
            EosTokenCriteria,
            MaxLengthCriteria,
-           MaxNewTokensCriteria,
            MaxTimeCriteria,
            StoppingCriteria,
            StoppingCriteriaList,
diff --git a/src/transformers/generation/logits_process.py b/src/transformers/generation/logits_process.py
index 39a38f9139e..7351abb1199 100644
--- a/src/transformers/generation/logits_process.py
+++ b/src/transformers/generation/logits_process.py
@@ -52,22 +52,6 @@ class LogitsProcessor:
         )
 
 
-class LogitsWarper:
-    """Abstract base class for all logit warpers that can be applied during generation with multinomial sampling."""
-
-    def __init__(self):
-        logger.warning_once(
-            "`LogitsWarper` is deprecated and will be removed in v4.48. Your class should inherit `LogitsProcessor` "
-            "instead, which has the same properties and interface."
-        )
-
-    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
-    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
-        raise NotImplementedError(
-            f"{self.__class__} is an abstract class. Only classes inheriting this class can be called."
-        )
-
-
 class LogitsProcessorList(list):
     """
     This class can be used to create a list of [`LogitsProcessor`] to subsequently process a `scores` input tensor.
diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py
index beed2430b44..e4814ce4e7c 100755
--- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py
+++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py
@@ -467,28 +467,6 @@ class GPTNeoXAttention(nn.Module):
         return target_dtype
 
 
-# TODO Remove in deprecation cycle
-class GPTNeoXFlashAttention2(GPTNeoXAttention):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-        logger.warning_once(
-            "The `GPTNeoXFlashAttention2` class is deprecated in favor of simply modifying the `config._attn_implementation`"
-            "attribute of the `GPTNeoXAttention` class! It will be removed in v4.48"
-        )
-
-
-# TODO Remove in deprecation cycle
-class GPTNeoXSdpaAttention(GPTNeoXAttention):
-    def __init__(self, config, layer_idx=None):
-        super().__init__(config, layer_idx=layer_idx)
-
-        logger.warning_once(
-            "The `GPTNeoXSdpaAttention` class is deprecated in favor of simply modifying the `config._attn_implementation`"
-            "attribute of the `GPTNeoXAttention` class! It will be removed in v4.48"
-        )
-
-
 # Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->GPTNeoX
 class GPTNeoXRotaryEmbedding(nn.Module):
     def __init__(self, config: GPTNeoXConfig, device=None):
@@ -600,14 +578,6 @@ class GPTNeoXMLP(nn.Module):
         return hidden_states
 
 
-GPT_NEOX_ATTENTION_CLASSES = {
-    "eager": GPTNeoXAttention,
-    "flash_attention_2": GPTNeoXFlashAttention2,
-    "sdpa": GPTNeoXSdpaAttention,
-    "flex_attention": GPTNeoXAttention,
-}
-
-
 class GPTNeoXLayer(nn.Module):
     def __init__(self, config, layer_idx):
         super().__init__()
@@ -616,7 +586,7 @@ class GPTNeoXLayer(nn.Module):
         self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         self.post_attention_dropout = nn.Dropout(config.hidden_dropout)
         self.post_mlp_dropout = nn.Dropout(config.hidden_dropout)
-        self.attention = GPT_NEOX_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
+        self.attention = GPTNeoXAttention(config, layer_idx)
         self.mlp = GPTNeoXMLP(config)
 
     def forward(
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index bac6220a718..843ff871da5 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -352,13 +352,6 @@ class LogitsProcessorList(metaclass=DummyObject):
         requires_backends(self, ["torch"])
 
 
-class LogitsWarper(metaclass=DummyObject):
-    _backends = ["torch"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-
 class MaxLengthCriteria(metaclass=DummyObject):
     _backends = ["torch"]
 
diff --git a/utils/check_docstrings.py b/utils/check_docstrings.py
index a63ca59690f..e588307690b 100644
--- a/utils/check_docstrings.py
+++ b/utils/check_docstrings.py
@@ -70,7 +70,6 @@ OBJECTS_TO_IGNORE = [
     # Deprecated
     "InputExample",
    "InputFeatures",
-    "LogitsWarper",
     # Signature is *args/**kwargs
     "TFSequenceSummary",
     "TFBertTokenizer",
diff --git a/utils/check_repo.py b/utils/check_repo.py
index 7f3e0c66d55..d35bf27420c 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -946,7 +946,6 @@ DEPRECATED_OBJECTS = [
     "LineByLineTextDataset",
     "LineByLineWithRefDataset",
     "LineByLineWithSOPTextDataset",
-    "LogitsWarper",
     "NerPipeline",
     "PretrainedBartModel",
     "PretrainedFSMTModel",
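
Migration sketch for downstream code that still touches the removed objects. This is not part of the diff: `MyTemperatureWarper` and the checkpoint name are illustrative placeholders; only the `LogitsProcessor.__call__` signature, `get_max_cache_shape()`, and backend selection via `attn_implementation` come from the changes above.

import torch
from transformers import AutoModelForCausalLM, LogitsProcessor

# Previously `class MyTemperatureWarper(LogitsWarper)`: `LogitsWarper` is gone,
# and `LogitsProcessor` exposes the same `__call__(input_ids, scores)` interface.
class MyTemperatureWarper(LogitsProcessor):
    def __init__(self, temperature: float):
        self.temperature = temperature

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        # Rescale the next-token logits before sampling.
        return scores / self.temperature

# Cache capacity: `cache.get_max_length()` no longer exists; query the maximum
# cache shape instead (None signals a cache without a fixed capacity):
#     max_capacity = cache.get_max_cache_shape()

# GPT-NeoX attention backends: the per-backend classes (`GPTNeoXFlashAttention2`,
# `GPTNeoXSdpaAttention`) are gone; select the backend through the config instead.
model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-70m", attn_implementation="sdpa")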