Aligning modeling code for GPT2 to work with vLLM (fallback) (#36934)
* aligning for vllm
* using input shape rather than attn outputs
* remove demo
* revert Conv1D
* style
* style
* Update src/transformers/models/gpt2/modeling_gpt2.py
  Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
* fix copies
* Apply suggestions from code review
  Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
* adding docs about vllm
* chore: style

---------

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
parent e94a4807df
commit 8a0a508f2b
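For context, the vLLM fallback this change aligns with can also be exercised from Python. A minimal sketch, assuming a vLLM release whose `LLM` constructor accepts the `model_impl` argument (that argument comes from vLLM, not from this diff):

```
# Sketch: run GPT-2 through vLLM while forcing the Transformers model implementation.
# Assumes a vLLM version that accepts `model_impl`; verify against your installed release.
from vllm import LLM, SamplingParams

llm = LLM(model="openai-community/gpt2", model_impl="transformers")
params = SamplingParams(max_tokens=32, temperature=0.8)

outputs = llm.generate(["Hello, I'm a language model"], params)
for out in outputs:
    print(out.outputs[0].text)
```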
@@ -73,6 +73,12 @@ echo -e "Hello, I'm a language model" | transformers run --task text-generation
 </hfoption>
 </hfoptions>
 
+One can also serve the model using vLLM with the `transformers` backend.
+
+```
+vllm serve openai-community/gpt2 --model-impl transformers
+```
+
 Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
 
 The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to 4-bits.
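The `vllm serve` command added to the docs exposes an OpenAI-compatible HTTP API. A small client sketch, assuming vLLM's default host and port (`http://localhost:8000`) and the standard `/v1/completions` route, none of which are part of this PR:

```
# Sketch: query the server started with `vllm serve openai-community/gpt2 --model-impl transformers`.
# Assumes the default OpenAI-compatible endpoint at http://localhost:8000/v1.
import requests

resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": "openai-community/gpt2",
        "prompt": "Hello, I'm a language model",
        "max_tokens": 32,
    },
    timeout=30,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])
```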
@@ -396,6 +396,7 @@ class DecisionTransformerGPT2Block(nn.Module):
         encoder_attention_mask: Optional[torch.FloatTensor] = None,
         use_cache: Optional[bool] = False,
         output_attentions: Optional[bool] = False,
+        **kwargs,
     ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
         residual = hidden_states
         hidden_states = self.ln_1(hidden_states)
@@ -407,6 +408,7 @@ class DecisionTransformerGPT2Block(nn.Module):
             head_mask=head_mask,
             use_cache=use_cache,
             output_attentions=output_attentions,
+            **kwargs,
         )
         # residual connection
         hidden_states = attn_output + residual
@@ -401,6 +401,7 @@ class GPT2Block(nn.Module):
         encoder_attention_mask: Optional[torch.FloatTensor] = None,
         use_cache: Optional[bool] = False,
         output_attentions: Optional[bool] = False,
+        **kwargs,
     ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
         residual = hidden_states
         hidden_states = self.ln_1(hidden_states)
@@ -412,6 +413,7 @@ class GPT2Block(nn.Module):
             head_mask=head_mask,
             use_cache=use_cache,
             output_attentions=output_attentions,
+            **kwargs,
         )
         # residual connection
         hidden_states = attn_output + residual
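The `**kwargs` threaded through both block classes exist so that arguments specific to a particular attention implementation can travel from the top-level forward pass down to `self.attn` without the block having to know about them. A stripped-down sketch of the pattern (the names `ToyBlock` and `toy_attention` are hypothetical, not the real GPT-2 classes):

```
# Sketch of the kwargs pass-through pattern applied in the hunks above.
import torch
from torch import nn


def toy_attention(query, key, value, attention_mask=None, **kwargs):
    # Backend-specific arguments arrive via **kwargs and can be consumed here
    # without the calling block needing to understand them.
    return torch.nn.functional.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask)


class ToyBlock(nn.Module):
    def forward(self, hidden_states, attention_mask=None, **kwargs):
        q = k = v = hidden_states  # placeholder projections
        # The block forwards whatever extra kwargs it was given straight to the attention call.
        return toy_attention(q, k, v, attention_mask=attention_mask, **kwargs)
```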
@@ -567,6 +569,7 @@ class GPT2PreTrainedModel(PreTrainedModel):
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
     _supports_sdpa = True
+    _supports_attention_backend = True
     _supports_cache_class = True
     _supports_static_cache = True
 
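`_supports_attention_backend = True` advertises that GPT-2 can run with an attention function supplied from outside the modeling file, which is what lets vLLM's Transformers fallback inject its own attention. A rough sketch of plugging in a custom attention function, assuming the `ALL_ATTENTION_FUNCTIONS` registry available in recent transformers releases (verify the exact registration API against your installed version; `sketch_attention` is an illustrative name):

```
# Sketch: register a custom attention function and load GPT-2 with it.
# Assumes transformers exposes ALL_ATTENTION_FUNCTIONS; newer releases also offer AttentionInterface.register.
import torch
from transformers import AutoModelForCausalLM
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS


def sketch_attention(module, query, key, value, attention_mask, scaling=None, dropout=0.0, **kwargs):
    # query/key/value: (batch, num_heads, seq_len, head_dim)
    out = torch.nn.functional.scaled_dot_product_attention(
        query, key, value, attn_mask=attention_mask, dropout_p=dropout, scale=scaling
    )
    # The modeling code expects (batch, seq_len, num_heads, head_dim) back, plus optional weights.
    return out.transpose(1, 2).contiguous(), None


ALL_ATTENTION_FUNCTIONS["sketch_attention"] = sketch_attention
model = AutoModelForCausalLM.from_pretrained(
    "openai-community/gpt2", attn_implementation="sketch_attention"
)
```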
@@ -903,6 +906,7 @@ class GPT2Model(GPT2PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -1046,6 +1050,7 @@ class GPT2Model(GPT2PreTrainedModel):
                 encoder_attention_mask=encoder_attention_mask,
                 use_cache=use_cache,
                 output_attentions=output_attentions,
+                **kwargs,
             )
 
             hidden_states = outputs[0]
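Since the new `**kwargs` are purely pass-through, plain transformers usage is unaffected. A quick sanity check with the stock SDPA implementation (nothing below is specific to this change):

```
# Sketch: standard forward pass through GPT-2; the added **kwargs change nothing here.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2", attn_implementation="sdpa")

inputs = tokenizer("Hello, I'm a language model", return_tensors="pt")
with torch.no_grad():
    out = model(**inputs)
print(out.logits.shape)  # (batch_size, sequence_length, vocab_size)
```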