From 009ee86a192ff6d1f0b3d0fd81d497887e073afd Mon Sep 17 00:00:00 2001
From: thomwolf
Date: Sun, 17 Feb 2019 23:57:23 +0100
Subject: [PATCH] fix tests - bump up version

---
 pytorch_pretrained_bert/__init__.py        |  2 +-
 pytorch_pretrained_bert/modeling_gpt2.py   | 36 ++++++++------
 pytorch_pretrained_bert/modeling_openai.py |  2 +-
 setup.py                                   |  2 +-
 tests/modeling_gpt2_test.py                |  9 ++--
 tests/tokenization_gpt2_test.py            | 56 ----------------------
 6 files changed, 28 insertions(+), 79 deletions(-)
 delete mode 100644 tests/tokenization_gpt2_test.py

diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_pretrained_bert/__init__.py
index 466dbb57e9e..eeb8392728e 100644
--- a/pytorch_pretrained_bert/__init__.py
+++ b/pytorch_pretrained_bert/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.5.1"
+__version__ = "0.6.0"
 from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer
 from .tokenization_openai import OpenAIGPTTokenizer
 from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py
index 4289d1d02e5..6a2c161946e 100644
--- a/pytorch_pretrained_bert/modeling_gpt2.py
+++ b/pytorch_pretrained_bert/modeling_gpt2.py
@@ -64,20 +64,24 @@ def load_tf_weights_in_gpt2(model, gpt2_checkpoint_path):
         print("Loading TF weight {} with shape {}".format(name, shape))
         array = tf.train.load_variable(tf_path, name)
         names.append(name)
-        arrays.append(array)
+        arrays.append(array.squeeze())
 
     for name, array in zip(names, arrays):
+        name = name[6:]  # skip "model/"
         name = name.split('/')
         pointer = model
         for m_name in name:
-            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
-                l = re.split(r'_(\d+)', m_name)
+            if re.fullmatch(r'[A-Za-z]+\d+', m_name):
+                l = re.split(r'(\d+)', m_name)
             else:
                 l = [m_name]
             if l[0] == 'w' or l[0] == 'g':
                 pointer = getattr(pointer, 'weight')
             elif l[0] == 'b':
                 pointer = getattr(pointer, 'bias')
+            elif l[0] == 'wpe' or l[0] == 'wte':
+                pointer = getattr(pointer, l[0])
+                pointer = getattr(pointer, 'weight')
             else:
                 pointer = getattr(pointer, l[0])
             if len(l) >= 2:
@@ -107,7 +111,7 @@ class GPT2Config(object):
 
     def __init__(
         self,
-        vocab_size_or_config_json_file=40478,
+        vocab_size_or_config_json_file=50257,
         n_positions=1024,
         n_ctx=1024,
         n_embd=768,
@@ -273,10 +277,10 @@ class Block(nn.Module):
         self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)
         self.mlp = MLP(4 * nx, config)
 
-    def forward(self, x, past):
+    def forward(self, x, past=None):
         a, present = self.attn(self.ln_1(x), past=past)
         x = x + a
-        m = self.mlp(self.ln_2(c))
+        m = self.mlp(self.ln_2(x))
         x = x + m
         return x, present
 
@@ -522,8 +526,12 @@ class GPT2Model(GPT2PreTrainedModel):
 
         self.apply(self.init_weights)
 
-    def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None):
-        past_length = 0 if past is None else past[0][0].size(-2)
+    def forward(self, input_ids, position_ids=None, token_type_ids=None, pasts=None):
+        if pasts is None:
+            past_length = 0
+            pasts = [None] * len(self.h)
+        else:
+            past_length = pasts[0][0].size(-2)
         if position_ids is None:
             position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long, device=input_ids.device)
             position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
@@ -541,8 +549,8 @@
             token_type_embeds = 0
         hidden_states = inputs_embeds + position_embeds + token_type_embeds
         presents = []
-        for block in self.h:
-            hidden_states, present = block(hidden_states)
+        for block, past in zip(self.h, pasts):
+            hidden_states, present = block(hidden_states, past)
             presents.append(present)
         hidden_states = self.ln_f(hidden_states)
         output_shape = input_shape + (hidden_states.size(-1),)
@@ -599,8 +607,8 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         """
         self.lm_head.set_embeddings_weights(self.transformer.wte.weight)
 
-    def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None):
-        hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, past)
+    def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, pasts=None):
+        hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, pasts)
         lm_logits = self.lm_head(hidden_states)
         if lm_labels is not None:
             loss_fct = CrossEntropyLoss(ignore_index=-1)
@@ -665,8 +673,8 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
         """
         self.lm_head.set_embeddings_weights(self.transformer.wte.weight)
 
-    def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None, position_ids=None, past=None):
-        hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, past)
+    def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None, position_ids=None, pasts=None):
+        hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, pasts)
         lm_logits = self.lm_head(hidden_states)
         mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
         losses = []
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index 60bf546c8c6..fb86148d7d7 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -56,7 +56,7 @@ def load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path):
     init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
     init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)]
 
-    # Thsi as used when we had a single embedding matrix for positions and tokens
+    # This was used when we had a single embedding matrix for positions and tokens
     # init_params[0] = np.concatenate([init_params[1], init_params[0]], 0)
     # del init_params[1]
     init_params = [arr.squeeze() for arr in init_params]
diff --git a/setup.py b/setup.py
index f3762b6b949..4070e35aa8d 100644
--- a/setup.py
+++ b/setup.py
@@ -38,7 +38,7 @@ from setuptools import find_packages, setup
 
 setup(
     name="pytorch_pretrained_bert",
-    version="0.5.1",
+    version="0.6.0",
     author="Thomas Wolf, Victor Sanh, Tim Rault, Google AI Language Team Authors, Open AI team Authors",
     author_email="thomas@huggingface.co",
     description="PyTorch version of Google AI BERT model with script to load Google pre-trained models",
diff --git a/tests/modeling_gpt2_test.py b/tests/modeling_gpt2_test.py
index 79889bebf51..12a539c44b6 100644
--- a/tests/modeling_gpt2_test.py
+++ b/tests/modeling_gpt2_test.py
@@ -38,7 +38,6 @@ class GPT2ModelTest(unittest.TestCase):
                      use_token_type_ids=True,
                      use_labels=True,
                      vocab_size=99,
-                     n_special=1,
                      n_positions=33,
                      n_embd=32,
                      n_layer=5,
@@ -56,7 +55,6 @@
             self.use_token_type_ids = use_token_type_ids
             self.use_labels = use_labels
             self.vocab_size = vocab_size
-            self.n_special = n_special
             self.n_positions = n_positions
             self.n_embd = n_embd
             self.n_layer = n_layer
@@ -76,7 +74,7 @@
 
             token_type_ids = None
             if self.use_token_type_ids:
-                total_voc = self.vocab_size + self.n_special
+                total_voc = self.vocab_size
                 token_type_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc)
 
             mc_labels = None
@@ -90,7 +88,6 @@
             config = GPT2Config(
                 vocab_size_or_config_json_file=self.vocab_size,
                 n_positions=self.n_positions,
-                n_special=self.n_special,
                 n_embd=self.n_embd,
                 n_layer=self.n_layer,
                 n_head=self.n_head,
@@ -130,7 +127,7 @@
             return outputs
 
         def check_gpt2_lm_head_output(self, result):
-            total_voc = self.n_special + self.vocab_size
+            total_voc = self.vocab_size
             self.parent.assertListEqual(
                 list(result["lm_logits"].size()),
                 [self.batch_size, self.n_choices, self.seq_length, total_voc])
@@ -157,7 +154,7 @@
             return outputs
 
         def check_gpt2_double_heads_output(self, result):
-            total_voc = self.n_special + self.vocab_size
+            total_voc = self.vocab_size
             self.parent.assertListEqual(
                 list(result["lm_logits"].size()),
                 [self.batch_size, self.n_choices, self.seq_length, total_voc])
diff --git a/tests/tokenization_gpt2_test.py b/tests/tokenization_gpt2_test.py
deleted file mode 100644
index 4bff2b6b351..00000000000
--- a/tests/tokenization_gpt2_test.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import absolute_import, division, print_function, unicode_literals
-
-import os
-import unittest
-import json
-
-from pytorch_pretrained_bert.tokenization_gpt2 import GPT2Tokenizer
-
-
-class GPT2TokenizationTest(unittest.TestCase):
-
-    def test_full_tokenizer(self):
-        """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
-        vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
-                 "w", "r", "t",
-                 "lo", "low", "er",
-                 "low", "lowest", "newer", "wider"]
-        vocab_tokens = dict(zip(vocab, range(len(vocab))))
-        merges = ["#version: 0.2", "l o", "lo w", "e r", ""]
-        with open("/tmp/openai_tokenizer_vocab_test.json", "w") as fp:
-            json.dump(vocab_tokens, fp)
-            vocab_file = fp.name
-        with open("/tmp/openai_tokenizer_merges_test.txt", "w") as fp:
-            fp.write("\n".join(merges))
-            merges_file = fp.name
-
-        tokenizer = GPT2Tokenizer(vocab_file, merges_file)
-        os.remove(vocab_file)
-        os.remove(merges_file)
-
-        text = "lower"
-        bpe_tokens = ["low", "er"]
-        tokens = tokenizer.tokenize(text)
-        self.assertListEqual(tokens, bpe_tokens)
-
-        input_tokens = tokens
-        input_bpe_tokens = [14, 15, 20]
-        self.assertListEqual(
-            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
-
-if __name__ == '__main__':
-    unittest.main()
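
Note: the snippet below is not part of the patch. It is a minimal usage sketch of the renamed per-layer `pasts` cache for incremental decoding, assuming the package's usual `from_pretrained('gpt2')` loaders and that `GPT2LMHeadModel.forward` returns `(lm_logits, presents)` when `lm_labels` is None, as in the code above.

# Usage sketch (assumption, not part of the patch): greedy incremental decoding
# with the new `pasts` keyword. Assumes `from_pretrained('gpt2')` works as in the
# rest of the 0.6.0 release and that GPT2LMHeadModel.forward returns
# (lm_logits, presents) when lm_labels is None.
import torch

from pytorch_pretrained_bert.tokenization_gpt2 import GPT2Tokenizer
from pytorch_pretrained_bert.modeling_gpt2 import GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.eval()

tokens = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("The quick brown fox"))
input_ids = torch.tensor([tokens])

pasts = None  # no cached keys/values on the first call
with torch.no_grad():
    for _ in range(10):
        # `presents` (one stacked key/value tensor per block) is fed back as
        # `pasts`, so only the newest token needs to be passed on later steps.
        lm_logits, pasts = model(input_ids, pasts=pasts)
        next_token = torch.argmax(lm_logits[0, -1, :]).view(1, 1)
        input_ids = next_token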