Aligning modeling code for GPT2 to work with vLLM (fallback) (#36934)

* aligning for vllm

* using input shape rather than attn outputs

* remove demo

* revert Conv1D

* style

* style

* Update src/transformers/models/gpt2/modeling_gpt2.py

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* fix copies

* Apply suggestions from code review

Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>

* adding docs about vllm

* chore: style

---------

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Aritra Roy Gosthipaty 2025-05-02 13:25:16 +05:30 committed by GitHub
parent e94a4807df
commit 8a0a508f2b
3 changed files with 13 additions and 0 deletions


@@ -73,6 +73,12 @@ echo -e "Hello, I'm a language model" | transformers run --task text-generation
</hfoption>
</hfoptions>

You can also serve the model with vLLM using the `transformers` backend.
```
vllm serve openai-community/gpt2 --model-impl transformers
```
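Once the server is up, it exposes vLLM's OpenAI-compatible API. The snippet below is a minimal sketch, assuming the default `http://localhost:8000/v1` endpoint and the `openai` Python client.

```py
from openai import OpenAI

# vLLM's server speaks the OpenAI API; no real key is needed by default.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.completions.create(
    model="openai-community/gpt2",
    prompt="Hello, I'm a language model",
    max_tokens=30,
)
print(completion.choices[0].text)
```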
Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.

The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to 4-bits.
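A minimal sketch of 4-bit loading, assuming `BitsAndBytesConfig(load_in_4bit=True)` with an fp16 compute dtype (the exact settings shown in the docs may differ):

```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Quantize only the weights to 4-bit; computation runs in float16.
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained(
    "openai-community/gpt2",
    quantization_config=quantization_config,
    device_map="auto",
)

inputs = tokenizer("Hello, I'm a language model", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=30)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```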


@@ -396,6 +396,7 @@ class DecisionTransformerGPT2Block(nn.Module):
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
        **kwargs,
    ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
        residual = hidden_states
        hidden_states = self.ln_1(hidden_states)
@@ -407,6 +408,7 @@ class DecisionTransformerGPT2Block(nn.Module):
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            **kwargs,
        )
        # residual connection
        hidden_states = attn_output + residual


@@ -401,6 +401,7 @@ class GPT2Block(nn.Module):
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
        **kwargs,
    ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
        residual = hidden_states
        hidden_states = self.ln_1(hidden_states)
@@ -412,6 +413,7 @@ class GPT2Block(nn.Module):
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            **kwargs,
        )
        # residual connection
        hidden_states = attn_output + residual
@@ -567,6 +569,7 @@ class GPT2PreTrainedModel(PreTrainedModel):
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_attention_backend = True
    _supports_cache_class = True
    _supports_static_cache = True
@@ -903,6 +906,7 @@ class GPT2Model(GPT2PreTrainedModel):
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
@@ -1046,6 +1050,7 @@ class GPT2Model(GPT2PreTrainedModel):
                encoder_attention_mask=encoder_attention_mask,
                use_cache=use_cache,
                output_attentions=output_attentions,
                **kwargs,
            )
            hidden_states = outputs[0]
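
Forwarding `**kwargs` through `GPT2Model` and its blocks, together with `_supports_attention_backend = True`, is what lets an external integration (vLLM's transformers backend, or any registered attention function) drive the attention computation. Below is a minimal sketch of that path using transformers' `AttentionInterface` registry, assuming a recent transformers release where it is available; the wrapper name is arbitrary.

```py
from transformers import AttentionInterface, AutoModelForCausalLM, AutoTokenizer
from transformers.integrations.sdpa_attention import sdpa_attention_forward

def logged_sdpa(*args, **kwargs):
    # Custom backend: delegate to SDPA, but show that the hook is reached.
    print("custom attention backend called")
    return sdpa_attention_forward(*args, **kwargs)

# Register the function and select it by name through attn_implementation.
AttentionInterface.register("logged_sdpa", logged_sdpa)

model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2", attn_implementation="logged_sdpa")
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")

inputs = tokenizer("Hello, I'm a language model", return_tensors="pt")
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=20)[0], skip_special_tokens=True))
```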