Mirror of https://github.com/huggingface/transformers.git
fix transposition in model conversion and memory initialization
commit b9c77b98d5
parent 009101de12
@@ -93,8 +93,6 @@ def build_tf_to_pytorch_map(model, config):
     # Relative positioning biases
     if config.untie_r:
-        layer_str = "transformer/r_r_bias"
-        layer_str_2 = "transformer/r_w_bias"
         r_r_list = []
         r_w_list = []
         for b in model.layers:
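For context, a minimal sketch of how the untied relative-position biases around this hunk can be gathered into per-layer lists and registered under single TF checkpoint names. The helper name and the `dec_attn.r_r_bias` / `dec_attn.r_w_bias` attribute paths are assumptions for illustration, not the repository's exact code.

```python
# Hypothetical sketch (not verbatim from the repo): collect the per-layer
# relative-position biases so one stacked TF variable can later be split
# across them. The dec_attn.* attribute names are assumptions.
def map_relative_biases(model, config, tf_to_pt_map):
    if config.untie_r:
        r_r_list = [b.dec_attn.r_r_bias for b in model.layers]
        r_w_list = [b.dec_attn.r_w_bias for b in model.layers]
    else:
        r_r_list = [model.r_r_bias]
        r_w_list = [model.r_w_bias]
    tf_to_pt_map.update({
        "transformer/r_r_bias": r_r_list,
        "transformer/r_w_bias": r_w_list,
    })
    return tf_to_pt_map
```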
@@ -158,7 +156,7 @@ def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
         array = tf_weights[name]
         # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
         # which are not required for using pretrained model
-        if 'kernel' in name or 'proj_W' in name:
+        if 'kernel' in name or 'proj' in name:
             array = np.transpose(array)
         if ('r_r_bias' in name or 'r_w_bias' in name) and len(pointer) > 1:
             # Here we will split the TF weigths
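The widened `'proj'` match and the `np.transpose` call come down to a layout difference: TF dense kernels are stored as (in_features, out_features), whereas `torch.nn.Linear.weight` is (out_features, in_features). A small self-contained illustration; the concrete sizes are arbitrary:

```python
import numpy as np
import torch

# A TF dense kernel is saved as (in_features, out_features); torch.nn.Linear
# keeps its weight as (out_features, in_features), hence the transpose.
tf_kernel = np.random.randn(512, 2048).astype(np.float32)      # (in, out)
linear = torch.nn.Linear(512, 2048)                            # weight: (2048, 512)
linear.weight.data = torch.from_numpy(np.transpose(tf_kernel))
assert tuple(linear.weight.shape) == (2048, 512)
```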
@@ -447,10 +447,10 @@ class RelPartialLearnableMultiHeadAttn(RelMultiHeadAttn):
         if attn_mask is not None and attn_mask.any().item():
             if attn_mask.dim() == 2:
                 attn_score = attn_score.float().masked_fill(
-                    attn_mask[None,:,:,None], -float('inf')).type_as(attn_score)
+                    attn_mask[None,:,:,None], -1e30).type_as(attn_score)
             elif attn_mask.dim() == 3:
                 attn_score = attn_score.float().masked_fill(
-                    attn_mask[:,:,:,None], -float('inf')).type_as(attn_score)
+                    attn_mask[:,:,:,None], -1e30).type_as(attn_score)

         # [qlen x klen x bsz x n_head]
         attn_prob = F.softmax(attn_score, dim=1)
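One plausible reason for swapping `-float('inf')` for `-1e30` (the commit message itself does not say): if every position in a row is masked, filling with `-inf` makes the subsequent softmax return NaN, while a large finite negative keeps it well defined. A runnable sketch:

```python
import torch
import torch.nn.functional as F

# With a fully masked row, softmax over -inf scores yields NaNs; -1e30 keeps
# the result finite (a meaningless but harmless uniform distribution).
scores = torch.zeros(1, 4)
mask = torch.ones(1, 4, dtype=torch.bool)

print(F.softmax(scores.masked_fill(mask, -float('inf')), dim=-1))  # nan, nan, ...
print(F.softmax(scores.masked_fill(mask, -1e30), dim=-1))          # 0.25 each
```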
@@ -947,12 +947,13 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
         self.mem_len = mem_len
         self.ext_len = ext_len

-    def init_mems(self):
+    def init_mems(self, data):
         if self.mem_len > 0:
             mems = []
             param = next(self.parameters())
             for i in range(self.n_layer+1):
-                empty = torch.empty(0, dtype=param.dtype, device=param.device)
+                empty = torch.zeros(self.mem_len, data.size(1), self.config.d_model,
+                                    dtype=param.dtype, device=param.device)
                 mems.append(empty)

             return mems
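Standalone sketch of what the patched `init_mems` produces: `n_layer + 1` zero tensors of shape `(mem_len, batch_size, d_model)` sized from the incoming batch, instead of the previous length-0 tensors. The concrete sizes below are made up for illustration.

```python
import torch

n_layer, mem_len, d_model = 4, 8, 16
data = torch.randint(0, 100, (32, 2))        # (seq_len, batch_size) token ids

# One memory tensor per layer plus one extra entry, pre-filled with zeros
# rather than left empty.
mems = [torch.zeros(mem_len, data.size(1), d_model) for _ in range(n_layer + 1)]
print(len(mems), mems[0].shape)              # 5 torch.Size([8, 2, 16])
```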
@@ -1081,7 +1082,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
         # So, have to initialize size(0) mems inside the model forward.
         # Moreover, have to return new_mems to allow nn.DataParallel to piece
         # them together.
-        if not mems: mems = self.init_mems()
+        if not mems: mems = self.init_mems(data)

         hidden, new_mems = self._forward(data, mems=mems)
         if target is None:
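A toy module, not the `TransfoXLModel` API, illustrating the flow the comments above describe: memory is created inside `forward` (sized from the batch), updated, and returned so a wrapper like `nn.DataParallel` can gather and re-feed it. All names and sizes here are illustrative.

```python
import torch
import torch.nn as nn

class ToySegmentModel(nn.Module):
    def __init__(self, mem_len=8, d_model=16):
        super().__init__()
        self.mem_len = mem_len
        self.d_model = d_model
        self.proj = nn.Linear(d_model, d_model)

    def init_mems(self, data):
        # sized from the incoming batch, as in the patched init_mems
        return [torch.zeros(self.mem_len, data.size(1), self.d_model)]

    def forward(self, data, mems=None):
        if not mems:
            mems = self.init_mems(data)
        hidden = self.proj(data)
        # keep the most recent mem_len steps as the new memory
        new_mems = [torch.cat([mems[0], hidden], dim=0)[-self.mem_len:]]
        return hidden, new_mems

model = ToySegmentModel()
seg = torch.randn(4, 2, 16)                               # (seq_len, batch_size, d_model)
hidden, mems = model(seg)                                 # memory initialized inside forward
hidden, mems = model(torch.randn(4, 2, 16), mems=mems)    # carried across segments
print(hidden.shape, mems[0].shape)                        # (4, 2, 16) and (8, 2, 16)
```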