diff --git a/docs/source/en/model_doc/gpt2.md b/docs/source/en/model_doc/gpt2.md
index cd2f388e69c..d67aabfa832 100644
--- a/docs/source/en/model_doc/gpt2.md
+++ b/docs/source/en/model_doc/gpt2.md
@@ -73,6 +73,12 @@ echo -e "Hello, I'm a language model" | transformers run --task text-generation
 </hfoption>
 </hfoptions>
 
+One can also serve the model using vLLM with the `transformers` backend.
+
+```
+vllm serve openai-community/gpt2 --model-impl transformers
+```
+
 Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
 
 The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to 4-bits.
diff --git a/src/transformers/models/decision_transformer/modeling_decision_transformer.py b/src/transformers/models/decision_transformer/modeling_decision_transformer.py
index 22501ee5086..ab2a3024052 100755
--- a/src/transformers/models/decision_transformer/modeling_decision_transformer.py
+++ b/src/transformers/models/decision_transformer/modeling_decision_transformer.py
@@ -396,6 +396,7 @@ class DecisionTransformerGPT2Block(nn.Module):
         encoder_attention_mask: Optional[torch.FloatTensor] = None,
         use_cache: Optional[bool] = False,
         output_attentions: Optional[bool] = False,
+        **kwargs,
     ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
         residual = hidden_states
         hidden_states = self.ln_1(hidden_states)
@@ -407,6 +408,7 @@ class DecisionTransformerGPT2Block(nn.Module):
             head_mask=head_mask,
             use_cache=use_cache,
             output_attentions=output_attentions,
+            **kwargs,
         )
         # residual connection
         hidden_states = attn_output + residual
diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py
index 314fba427a7..0af4d990655 100644
--- a/src/transformers/models/gpt2/modeling_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_gpt2.py
@@ -401,6 +401,7 @@ class GPT2Block(nn.Module):
         encoder_attention_mask: Optional[torch.FloatTensor] = None,
         use_cache: Optional[bool] = False,
         output_attentions: Optional[bool] = False,
+        **kwargs,
     ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
         residual = hidden_states
         hidden_states = self.ln_1(hidden_states)
@@ -412,6 +413,7 @@ class GPT2Block(nn.Module):
             head_mask=head_mask,
             use_cache=use_cache,
             output_attentions=output_attentions,
+            **kwargs,
         )
         # residual connection
         hidden_states = attn_output + residual
@@ -567,6 +569,7 @@ class GPT2PreTrainedModel(PreTrainedModel):
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
     _supports_sdpa = True
+    _supports_attention_backend = True
     _supports_cache_class = True
     _supports_static_cache = True
 
@@ -903,6 +906,7 @@ class GPT2Model(GPT2PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -1046,6 +1050,7 @@ class GPT2Model(GPT2PreTrainedModel):
                     encoder_attention_mask=encoder_attention_mask,
                     use_cache=use_cache,
                     output_attentions=output_attentions,
+                    **kwargs,
                 )
 
             hidden_states = outputs[0]
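
Not part of the patch itself, but a quick illustration of what the new `**kwargs` plumbing and the `_supports_attention_backend = True` flag are for: keyword arguments now flow from `GPT2Model` through `GPT2Block` down to the attention call, so a custom attention function registered through `AttentionInterface` can be dispatched by GPT-2. The sketch below assumes a transformers version that exports `AttentionInterface` (roughly v4.48+); the name `probe_attention` and its plain eager implementation are illustrative and not something this diff adds.

```python
import torch
from transformers import AttentionInterface, AutoModelForCausalLM, AutoTokenizer


def probe_attention(module, query, key, value, attention_mask, scaling=None, **kwargs):
    # Plain eager attention. query/key/value arrive as (batch, num_heads, seq_len, head_dim);
    # extra arguments such as head_mask or dropout are absorbed (and ignored) via **kwargs.
    if scaling is None:
        scaling = query.size(-1) ** -0.5
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    if attention_mask is not None:
        # The mask is an additive float mask; slice it to the key length, as the eager path does.
        attn_weights = attn_weights + attention_mask[..., : key.shape[-2]]
    attn_weights = torch.softmax(attn_weights, dim=-1)
    attn_output = torch.matmul(attn_weights, value)
    # Hand back (batch, seq_len, num_heads, head_dim) so the model can reshape for the output projection.
    return attn_output.transpose(1, 2).contiguous(), attn_weights


AttentionInterface.register("probe_attention", probe_attention)

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2", attn_implementation="probe_attention")

inputs = tokenizer("Hello, I'm a language model", return_tensors="pt")
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=20)[0], skip_special_tokens=True))
```

Registering by name keeps the swap opt-in at load time via `attn_implementation`, with no change to the modeling code; the added `**kwargs` are what keep that path open end to end.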