Improve greedy search memory usage (#32895)

Do not call torch.repeat_interleave if expand_size is 1
regisss 2024-08-22 16:37:44 +02:00 committed by GitHub
parent bf97d4aa6d
commit 99d67f1a09

@@ -617,6 +617,10 @@ class GenerationMixin:
         **model_kwargs,
     ) -> Tuple[torch.LongTensor, Dict[str, Any]]:
         """Expands tensors from [batch_size, ...] to [batch_size * expand_size, ...]"""
+        # Do not call torch.repeat_interleave if expand_size is 1 because it clones
+        # the input tensor and thus requires more memory although no change is applied
+        if expand_size == 1:
+            return input_ids, model_kwargs
 
         def _expand_dict_for_generation(dict_to_expand):
             for key in dict_to_expand:
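
For context, a minimal sketch (not part of the patch) of the behavior the new guard avoids: torch.repeat_interleave always materializes a fresh tensor, even when the repeat count is 1, so calling it with expand_size == 1 duplicates input_ids in memory without changing its contents. The tensor shapes and names below are illustrative assumptions.

import torch

# A batch of token ids, as produced during greedy search (shape is arbitrary here).
input_ids = torch.randint(0, 32000, (8, 1024), dtype=torch.long)

# repeat_interleave with repeats=1 is a content-wise no-op but still allocates a copy.
expanded = input_ids.repeat_interleave(1, dim=0)
print(torch.equal(expanded, input_ids))             # True: identical values
print(expanded.data_ptr() == input_ids.data_ptr())  # False: separate allocation

# With the early return, expand_size == 1 hands back the original tensors untouched,
# so no extra copy of input_ids (or of the tensors in model_kwargs) is created.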