Mirror of https://github.com/huggingface/transformers.git
Add accelerate support for M2M100 (#19912)

* add `accelerate` support for M2M100
* fix device set nit
parent c766a2d70a
commit d56d723fad
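In practice, this change is what lets an M2M100 checkpoint be loaded through accelerate's big-model utilities, e.g. with `device_map="auto"`. A minimal usage sketch, assuming a CUDA machine, recent `transformers`/`accelerate` installs, and the public `facebook/m2m100_418M` checkpoint; the snippet is illustrative and not taken from this commit:

import torch
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
# device_map="auto" lets accelerate spread the layers over the available
# devices; it relies on the _no_split_modules hint added by this commit.
model = M2M100ForConditionalGeneration.from_pretrained(
    "facebook/m2m100_418M", device_map="auto", torch_dtype=torch.float16
)

tokenizer.src_lang = "en"
inputs = tokenizer("Life is like a box of chocolates.", return_tensors="pt").to(model.device)
generated = model.generate(**inputs, forced_bos_token_id=tokenizer.get_lang_id("fr"))
print(tokenizer.batch_decode(generated, skip_special_tokens=True))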
@@ -532,6 +532,7 @@ class M2M100PreTrainedModel(PreTrainedModel):
     config_class = M2M100Config
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
+    _no_split_modules = ["M2M100Attention"]
 
     def _init_weights(self, module):
         std = self.config.init_std
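The `_no_split_modules` attribute tells accelerate which submodules must stay whole when a device map is computed. Roughly how that hint is consumed, sketched with accelerate's public helpers (simplified; inside `from_pretrained` this happens automatically, and the memory budget below is a hypothetical value for the sketch):

from accelerate import infer_auto_device_map, init_empty_weights
from transformers import AutoConfig, M2M100ForConditionalGeneration

config = AutoConfig.from_pretrained("facebook/m2m100_418M")
with init_empty_weights():
    # Build the model skeleton on the meta device, without real weights.
    model = M2M100ForConditionalGeneration(config)

# M2M100Attention blocks must not be cut in half across devices, so the class
# names listed in _no_split_modules are forwarded as no_split_module_classes.
device_map = infer_auto_device_map(
    model,
    no_split_module_classes=model._no_split_modules,  # ["M2M100Attention"]
    max_memory={0: "2GiB", "cpu": "8GiB"},  # hypothetical budget
)
print(device_map)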
@@ -693,10 +694,10 @@ class M2M100Encoder(M2M100PreTrainedModel):
         self.max_source_positions = config.max_position_embeddings
         self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
 
+        self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx)
+
         if embed_tokens is not None:
-            self.embed_tokens = embed_tokens
-        else:
-            self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx)
+            self.embed_tokens.weight = embed_tokens.weight
 
         self.embed_positions = M2M100SinusoidalPositionalEmbedding(
             config.max_position_embeddings,
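This constructor change matters for accelerate because the encoder now always owns an `nn.Embedding` submodule, and a shared `embed_tokens` is tied by assigning its weight rather than by swapping the module in. A toy illustration of the pattern (not the modeling code itself):

from torch import nn

shared = nn.Embedding(10, 4, padding_idx=1)

class ToyEncoder(nn.Module):
    def __init__(self, embed_tokens=None):
        super().__init__()
        # The submodule always exists, so hooks and device maps see a stable module tree.
        self.embed_tokens = nn.Embedding(10, 4, padding_idx=1)
        if embed_tokens is not None:
            # Share the parameter instead of replacing the module.
            self.embed_tokens.weight = embed_tokens.weight

enc = ToyEncoder(shared)
assert enc.embed_tokens.weight is shared.weight  # same tensor, weights stay tied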
@@ -777,6 +778,7 @@ class M2M100Encoder(M2M100PreTrainedModel):
             inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
 
         embed_pos = self.embed_positions(input_ids, inputs_embeds)
+        embed_pos = embed_pos.to(inputs_embeds.device)
 
         hidden_states = inputs_embeds + embed_pos
         hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
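The one-line addition keeps the forward pass working when the model is split across devices: `embed_positions` can return its output on a different GPU than `inputs_embeds`, and the addition would otherwise fail with a device mismatch. A stand-alone illustration of the pattern, assuming a machine with two CUDA devices:

import torch

# Stand-ins for what the encoder computes; under a device map the positional
# embedding module may live on a different GPU than the token embeddings.
inputs_embeds = torch.randn(1, 5, 16, device="cuda:0")
embed_pos = torch.randn(1, 5, 16, device="cuda:1")

embed_pos = embed_pos.to(inputs_embeds.device)  # the line this hunk adds
hidden_states = inputs_embeds + embed_pos       # both operands now on cuda:0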
@@ -868,10 +870,10 @@ class M2M100Decoder(M2M100PreTrainedModel):
         self.max_target_positions = config.max_position_embeddings
         self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
 
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
+
         if embed_tokens is not None:
-            self.embed_tokens = embed_tokens
-        else:
-            self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
+            self.embed_tokens.weight = embed_tokens.weight
 
         self.embed_positions = M2M100SinusoidalPositionalEmbedding(
             config.max_position_embeddings,
@@ -1010,6 +1012,7 @@ class M2M100Decoder(M2M100PreTrainedModel):
 
         # embed positions
         positions = self.embed_positions(input_ids, inputs_embeds, past_key_values_length)
+        positions = positions.to(inputs_embeds.device)
 
         hidden_states = inputs_embeds + positions
 
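After loading with a device map, the placement accelerate chose can be inspected, which is a quick way to confirm that whole M2M100Attention blocks ended up on a single device. A short check, with the same checkpoint assumption as the loading sketch above:

from transformers import M2M100ForConditionalGeneration

model = M2M100ForConditionalGeneration.from_pretrained(
    "facebook/m2m100_418M", device_map="auto"
)
# hf_device_map records where accelerate placed each module.
for module_name, device in model.hf_device_map.items():
    print(f"{module_name} -> {device}")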