# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run OpenAI GPT on the ROCStories dataset."""
import argparse
import os
import random
import logging

import numpy as np
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle

# These helpers come from the original pytorch-openai-transformer-lm repository
# (https://github.com/huggingface/pytorch-openai-transformer-lm) and must be on
# the PYTHONPATH: the script below still calls into them.
from analysis import rocstories as rocstories_analysis
from datasets import rocstories
from model_pytorch import DoubleHeadModel, load_openai_pretrained_model
from text_utils import TextEncoder
from utils import (encode_dataset, iter_data,
                   ResultLogger, make_path)
from loss import MultipleChoiceLossCompute

# The optimizer is taken from pytorch_pretrained_bert rather than the original
# opt.py; it exposes the same constructor arguments used below.
from pytorch_pretrained_bert.optimization_openai import OpenAIAdam

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)


def transform_roc(X1, X2, X3):
    """Pack each (story, ending 1, ending 2) triple into the two-choice input
    and mask tensors expected by the double heads model."""
    n_batch = len(X1)
    xmb = np.zeros((n_batch, 2, n_ctx, 2), dtype=np.int32)
    mmb = np.zeros((n_batch, 2, n_ctx), dtype=np.float32)
    start = encoder['_start_']
    delimiter = encoder['_delimiter_']
    for i, (x1, x2, x3) in enumerate(zip(X1, X2, X3)):
        x12 = [start] + x1[:max_len] + [delimiter] + x2[:max_len] + [clf_token]
        x13 = [start] + x1[:max_len] + [delimiter] + x3[:max_len] + [clf_token]
        l12 = len(x12)
        l13 = len(x13)
        xmb[i, 0, :l12, 0] = x12
        xmb[i, 1, :l13, 0] = x13
        mmb[i, 0, :l12] = 1
        mmb[i, 1, :l13] = 1
    # Position information that is added to the input embeddings in the TransformerModel
    xmb[:, :, :, 1] = np.arange(n_vocab + n_special, n_vocab + n_special + n_ctx)
    return xmb, mmb

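# Illustration of the layout transform_roc produces for one example (token ids
# are made up for the sketch; with the released 40478-token GPT vocabulary the
# three special tokens get ids 40478-40480):
#
#   x1 = [10, 11] (story)    x2 = [20] (ending 1)    x3 = [30] (ending 2)
#   xmb[i, 0, :6, 0] = [_start_, 10, 11, _delimiter_, 20, _classify_]
#   xmb[i, 1, :6, 0] = [_start_, 10, 11, _delimiter_, 30, _classify_]
#   mmb[i, 0, :6] = mmb[i, 1, :6] = 1, everything beyond stays 0 (padding)
#
# xmb[..., 1] carries position ids offset by n_vocab + n_special (see the note
# on the shared embedding matrix further below).
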
def iter_apply(Xs, Ms, Ys):
    # fns = [lambda x: np.concatenate(x, 0), lambda x: float(np.sum(x))]
    logits = []
    cost = 0
    with torch.no_grad():
        dh_model.eval()
        for xmb, mmb, ymb in iter_data(Xs, Ms, Ys, n_batch=n_batch_train, truncate=False, verbose=True):
            n = len(xmb)
            XMB = torch.tensor(xmb, dtype=torch.long).to(device)
            YMB = torch.tensor(ymb, dtype=torch.long).to(device)
            MMB = torch.tensor(mmb).to(device)
            _, clf_logits = dh_model(XMB)
            clf_logits *= n
            clf_losses = compute_loss_fct(XMB, YMB, MMB, clf_logits, only_return_losses=True)
            clf_losses *= n
            logits.append(clf_logits.to("cpu").numpy())
            cost += clf_losses.sum().item()
        logits = np.concatenate(logits, 0)
    return logits, cost


def iter_predict(Xs, Ms):
    logits = []
    with torch.no_grad():
        dh_model.eval()
        for xmb, mmb in iter_data(Xs, Ms, n_batch=n_batch_train, truncate=False, verbose=True):
            XMB = torch.tensor(xmb, dtype=torch.long).to(device)
            MMB = torch.tensor(mmb).to(device)
            _, clf_logits = dh_model(XMB)
            logits.append(clf_logits.to("cpu").numpy())
        logits = np.concatenate(logits, 0)
    return logits


def log(save_dir, desc):
    global best_score
    print("Logging")
    tr_logits, tr_cost = iter_apply(trX[:n_valid], trM[:n_valid], trY[:n_valid])
    va_logits, va_cost = iter_apply(vaX, vaM, vaY)
    tr_cost = tr_cost / len(trY[:n_valid])
    va_cost = va_cost / n_valid
    tr_acc = accuracy_score(trY[:n_valid], np.argmax(tr_logits, 1)) * 100.
    va_acc = accuracy_score(vaY, np.argmax(va_logits, 1)) * 100.
    logger.log(n_epochs=n_epochs, n_updates=n_updates, tr_cost=tr_cost, va_cost=va_cost,
               tr_acc=tr_acc, va_acc=va_acc)
    print('%d %d %.3f %.3f %.2f %.2f' % (n_epochs, n_updates, tr_cost, va_cost, tr_acc, va_acc))
    if submit:
        score = va_acc
        if score > best_score:
            best_score = score
            path = os.path.join(save_dir, desc, 'best_params')
            torch.save(dh_model.state_dict(), make_path(path))


def predict(dataset, submission_dir):
    filename = filenames[dataset]
    pred_fn = pred_fns[dataset]
    label_decoder = label_decoders[dataset]
    predictions = pred_fn(iter_predict(teX, teM))
    if label_decoder is not None:
        predictions = [label_decoder[prediction] for prediction in predictions]
    path = os.path.join(submission_dir, filename)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'w') as f:
        f.write('{}\t{}\n'.format('index', 'prediction'))
        for i, prediction in enumerate(predictions):
            f.write('{}\t{}\n'.format(i, prediction))


def run_epoch():
    global n_updates
    for xmb, mmb, ymb in iter_data(*shuffle(trX, trM, trYt, random_state=np.random),
                                   n_batch=n_batch_train, truncate=True, verbose=True):
        dh_model.train()
        XMB = torch.tensor(xmb, dtype=torch.long).to(device)
        YMB = torch.tensor(ymb, dtype=torch.long).to(device)
        MMB = torch.tensor(mmb).to(device)
        lm_logits, clf_logits = dh_model(XMB)
        # compute_loss_fct runs backward() and the optimizer step internally;
        # see the illustrative sketch after this function.
        compute_loss_fct(XMB, YMB, MMB, clf_logits, lm_logits)
        n_updates += 1
        if n_updates in [1000, 2000, 4000, 8000, 16000, 32000] and n_epochs == 0:
            log(save_dir, desc)

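# A self-contained sketch (never called by this script) of the training loss
# that MultipleChoiceLossCompute computes: cross-entropy over the two choices
# plus an auxiliary language-modeling term weighted by lm_coef. Names and
# shapes are illustrative; the real implementation in loss.py normalizes the
# LM loss per choice sequence rather than with one global masked mean.
def _loss_sketch(clf_logits, clf_targets, lm_logits, lm_targets, lm_mask, lm_coef=0.5):
    # clf_logits: (n_batch, 2), clf_targets: (n_batch,)
    # lm_logits: (n_tokens, vocab), lm_targets: (n_tokens,), lm_mask: (n_tokens,)
    clf_losses = nn.CrossEntropyLoss(reduce=False)(clf_logits, clf_targets)
    lm_losses = nn.CrossEntropyLoss(reduce=False)(lm_logits, lm_targets)
    lm_losses = (lm_losses * lm_mask).sum() / lm_mask.sum()
    return clf_losses.sum() + lm_coef * lm_losses
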
argmax = lambda x: np.argmax(x, 1)

pred_fns = {
    'rocstories': argmax,
}

filenames = {
    'rocstories': 'ROCStories.tsv',
}

label_decoders = {
    'rocstories': None,
}

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--desc', type=str, help="Description")
    parser.add_argument('--dataset', type=str)
    parser.add_argument('--log_dir', type=str, default='log/')
    parser.add_argument('--save_dir', type=str, default='save/')
    parser.add_argument('--data_dir', type=str, default='data/')
    parser.add_argument('--submission_dir', type=str, default='submission/')
    parser.add_argument('--submit', action='store_true')
    parser.add_argument('--analysis', action='store_true')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--n_iter', type=int, default=3)
    parser.add_argument('--n_batch', type=int, default=8)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--lr', type=float, default=6.25e-5)
    parser.add_argument('--lr_warmup', type=float, default=0.002)
    parser.add_argument('--n_ctx', type=int, default=512)
    parser.add_argument('--n_embd', type=int, default=768)
    parser.add_argument('--n_head', type=int, default=12)
    parser.add_argument('--n_layer', type=int, default=12)
    parser.add_argument('--embd_pdrop', type=float, default=0.1)
    parser.add_argument('--attn_pdrop', type=float, default=0.1)
    parser.add_argument('--resid_pdrop', type=float, default=0.1)
    parser.add_argument('--clf_pdrop', type=float, default=0.1)
    parser.add_argument('--l2', type=float, default=0.01)
    parser.add_argument('--vector_l2', action='store_true')
    parser.add_argument('--opt', type=str, default='adam')
    parser.add_argument('--afn', type=str, default='gelu')
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--encoder_path', type=str, default='model/encoder_bpe_40000.json')
    parser.add_argument('--bpe_path', type=str, default='model/vocab_40000.bpe')
    parser.add_argument('--n_transfer', type=int, default=12)
    parser.add_argument('--lm_coef', type=float, default=0.5)
    parser.add_argument('--b1', type=float, default=0.9)
    parser.add_argument('--b2', type=float, default=0.999)
    parser.add_argument('--e', type=float, default=1e-8)
    parser.add_argument('--n_valid', type=int, default=374)
    args = parser.parse_args()
    print(args)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Constants
    submit = args.submit
    dataset = args.dataset
    n_ctx = args.n_ctx
    save_dir = args.save_dir
    desc = args.desc
    data_dir = args.data_dir
    log_dir = args.log_dir
    submission_dir = args.submission_dir

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)

    logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)), **args.__dict__)
    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    print("Encoding dataset...")
    ((trX1, trX2, trX3, trY),
     (vaX1, vaX2, vaX3, vaY),
     (teX1, teX2, teX3)) = encode_dataset(*rocstories(data_dir, n_valid=args.n_valid),
                                          encoder=text_encoder)
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3
    max_len = n_ctx // 2 - 2
    n_ctx = min(max(
        [len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(trX1, trX2, trX3)]
        + [len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(vaX1, vaX2, vaX3)]
        + [len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(teX1, teX2, teX3)]
    ) + 3, n_ctx)
    vocab = n_vocab + n_special + n_ctx
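    # Note: the TransformerModel in model_pytorch.py uses one shared embedding
    # matrix for tokens, special tokens, and positions, and sums the token and
    # position embeddings per slot; position ids therefore occupy the index
    # range [n_vocab + n_special, n_vocab + n_special + n_ctx), and `vocab`
    # above is the full size of that embedding matrix.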
    trX, trM = transform_roc(trX1, trX2, trX3)
    vaX, vaM = transform_roc(vaX1, vaX2, vaX3)
    if submit:
        teX, teM = transform_roc(teX1, teX2, teX3)

    n_train = len(trY)
    n_valid = len(vaY)
    n_batch_train = args.n_batch * max(n_gpu, 1)
    n_updates_total = (n_train // n_batch_train) * args.n_iter

    dh_model = DoubleHeadModel(args, clf_token, 'multiple_choice', vocab, n_ctx)

    criterion = nn.CrossEntropyLoss(reduce=False)
    model_opt = OpenAIAdam(dh_model.parameters(),
                           lr=args.lr,
                           schedule=args.lr_schedule,
                           warmup=args.lr_warmup,
                           t_total=n_updates_total,
                           b1=args.b1,
                           b2=args.b2,
                           e=args.e,
                           l2=args.l2,
                           vector_l2=args.vector_l2,
                           max_grad_norm=args.max_grad_norm)
    compute_loss_fct = MultipleChoiceLossCompute(criterion,
                                                 criterion,
                                                 args.lm_coef,
                                                 model_opt)
    load_openai_pretrained_model(dh_model.transformer, n_ctx=n_ctx, n_special=n_special)

    dh_model.to(device)
    dh_model = nn.DataParallel(dh_model)

    n_updates = 0
    n_epochs = 0
    if dataset != 'stsb':
        trYt = trY
    if submit:
        path = os.path.join(save_dir, desc, 'best_params')
        torch.save(dh_model.state_dict(), make_path(path))
    best_score = 0
    for i in range(args.n_iter):
        print("running epoch", i)
        run_epoch()
        n_epochs += 1
        log(save_dir, desc)
    if submit:
        path = os.path.join(save_dir, desc, 'best_params')
        dh_model.load_state_dict(torch.load(path))
        predict(dataset, args.submission_dir)
        if args.analysis:
            rocstories_analysis(data_dir, os.path.join(args.submission_dir, 'ROCStories.tsv'),
                                os.path.join(log_dir, 'rocstories.jsonl'))
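# Example invocation (a sketch: the script filename and data path are
# assumptions; the flags mirror the original pytorch-openai-transformer-lm
# training script):
#
#   python train.py --dataset rocstories --desc rocstories --submit --analysis \
#       --data_dir <path to the ROCStories data>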