# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" OpenAI GPT model fine-tuning script.
|
|
Adapted from https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/train.py
|
|
It self adapted from https://github.com/openai/finetune-transformer-lm/blob/master/train.py
|
|
|
|
This script with default values fine-tunes and evaluate a pretrained OpenAI GPT on the RocStories dataset
|
|
"""
|
|
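
# Illustrative invocation (a sketch: the flags match this script's argument parser,
# while the dataset path and output directory are placeholders):
#
#   python run_openai_gpt.py \
#     --model_name openai-gpt \
#     --do_train \
#     --do_eval \
#     --dataset_path data/cloze_test_val__spring2016.csv \
#     --output_dir ../log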

import argparse
import os
import csv
import random
import logging
from tqdm import tqdm, trange

import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle

from pytorch_pretrained_bert import OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer, OpenAIAdam

# Helpers from the original OpenAI fine-tuning codebase (not shipped with this package);
# the standalone helper functions kept below still reference some of them.
# from analysis import rocstories as rocstories_analysis
# from datasets import rocstories
# from model_pytorch import DoubleHeadModel, load_openai_pretrained_model
# from opt import OpenAIAdam
# from text_utils import TextEncoder
# from utils import (encode_dataset, iter_data,
#                    ResultLogger, make_path)
# from loss import MultipleChoiceLossCompute

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

# NOTE: the helper functions below (iter_apply, iter_predict, log, predict, run_epoch)
# are kept from the original OpenAI fine-tuning script. They rely on globals
# (dh_model, compute_loss_fct, iter_data, n_batch_train, device, ...) that are not
# defined in the __main__ block of this file.
def iter_apply(Xs, Ms, Ys):
    # fns = [lambda x: np.concatenate(x, 0), lambda x: float(np.sum(x))]
    logits = []
    cost = 0
    with torch.no_grad():
        dh_model.eval()
        for xmb, mmb, ymb in iter_data(Xs, Ms, Ys, n_batch=n_batch_train, truncate=False, verbose=True):
            n = len(xmb)
            XMB = torch.tensor(xmb, dtype=torch.long).to(device)
            YMB = torch.tensor(ymb, dtype=torch.long).to(device)
            MMB = torch.tensor(mmb).to(device)
            _, clf_logits = dh_model(XMB)
            clf_logits *= n
            clf_losses = compute_loss_fct(XMB, YMB, MMB, clf_logits, only_return_losses=True)
            clf_losses *= n
            logits.append(clf_logits.to("cpu").numpy())
            cost += clf_losses.sum().item()
        logits = np.concatenate(logits, 0)
    return logits, cost


def iter_predict(Xs, Ms):
    logits = []
    with torch.no_grad():
        dh_model.eval()
        for xmb, mmb in iter_data(Xs, Ms, n_batch=n_batch_train, truncate=False, verbose=True):
            n = len(xmb)
            XMB = torch.tensor(xmb, dtype=torch.long).to(device)
            MMB = torch.tensor(mmb).to(device)
            _, clf_logits = dh_model(XMB)
            logits.append(clf_logits.to("cpu").numpy())
        logits = np.concatenate(logits, 0)
    return logits


def log(save_dir, desc):
    global best_score
    print("Logging")
    tr_logits, tr_cost = iter_apply(trX[:n_valid], trM[:n_valid], trY[:n_valid])
    va_logits, va_cost = iter_apply(vaX, vaM, vaY)
    tr_cost = tr_cost / len(trY[:n_valid])
    va_cost = va_cost / n_valid
    tr_acc = accuracy_score(trY[:n_valid], np.argmax(tr_logits, 1)) * 100.
    va_acc = accuracy_score(vaY, np.argmax(va_logits, 1)) * 100.
    # In the original script `logger` was a ResultLogger instance; the stdlib
    # logging.Logger used in this file does not accept these keyword arguments.
    logger.log(n_epochs=n_epochs, n_updates=n_updates, tr_cost=tr_cost, va_cost=va_cost, tr_acc=tr_acc, va_acc=va_acc)
    print('%d %d %.3f %.3f %.2f %.2f' % (n_epochs, n_updates, tr_cost, va_cost, tr_acc, va_acc))
    if submit:
        score = va_acc
        if score > best_score:
            best_score = score
            path = os.path.join(save_dir, desc, 'best_params')
            torch.save(dh_model.state_dict(), make_path(path))


def predict(dataset, submission_dir):
    filename = filenames[dataset]
    pred_fn = pred_fns[dataset]
    label_decoder = label_decoders[dataset]
    predictions = pred_fn(iter_predict(teX, teM))
    if label_decoder is not None:
        predictions = [label_decoder[prediction] for prediction in predictions]
    path = os.path.join(submission_dir, filename)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'w') as f:
        f.write('{}\t{}\n'.format('index', 'prediction'))
        for i, prediction in enumerate(predictions):
            f.write('{}\t{}\n'.format(i, prediction))


def run_epoch():
    global n_updates
    for xmb, mmb, ymb in iter_data(*shuffle(trX, trM, trYt, random_state=np.random),
                                   n_batch=n_batch_train, truncate=True, verbose=True):
        dh_model.train()
        XMB = torch.tensor(xmb, dtype=torch.long).to(device)
        YMB = torch.tensor(ymb, dtype=torch.long).to(device)
        MMB = torch.tensor(mmb).to(device)
        lm_logits, clf_logits = dh_model(XMB)
        compute_loss_fct(XMB, YMB, MMB, clf_logits, lm_logits)
        n_updates += 1
        if n_updates in [1000, 2000, 4000, 8000, 16000, 32000] and n_epochs == 0:
            log(save_dir, desc)


def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)
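
# Note: accuracy() returns the *number* of correct predictions, not a ratio;
# the evaluation loop below divides the running total by the number of examples.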


def load_rocstories_dataset(dataset_path):
    """ Output a list of tuples (story, 1st continuation, 2nd continuation, label) """
    with open(dataset_path, encoding='utf_8') as f:
        f = csv.reader(f)
        output = []
        next(f)  # skip the first line
        for line in tqdm(f):
            output.append((' '.join(line[1:5]), line[5], line[6], int(line[-1]) - 1))
    return output
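
# Illustrative usage (the csv path is a placeholder for the RocStories cloze-test
# file, whose rows contain a story id, four context sentences, two candidate
# endings and a 1/2 answer that is shifted to a 0/1 label here):
#
#   dataset = load_rocstories_dataset('data/cloze_test_val__spring2016.csv')
#   story, cont1, cont2, label = dataset[0]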


def pre_process_dataset(encoded_dataset, input_len, cap_length, start_token, delimiter_token, clf_token):
    """ Build the input tensors for OpenAIGPTDoubleHeadsModel from the encoded dataset.

    Each instance yields two sequences (one per candidate ending), each laid out as:
    [start_token] story [delimiter_token] continuation [clf_token]
    """
    n_batch = len(encoded_dataset)
    input_ids = np.zeros((n_batch, 2, input_len), dtype=np.int64)
    mc_token_mask = np.zeros((n_batch, 2, input_len), dtype=np.int32)
    lm_labels = np.full((n_batch, 2, input_len), fill_value=-1, dtype=np.int64)
    mc_labels = np.zeros((n_batch,), dtype=np.int64)
    for i, (story, cont1, cont2, mc_label) in enumerate(encoded_dataset):
        with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
        with_cont2 = [start_token] + story[:cap_length] + [delimiter_token] + cont2[:cap_length] + [clf_token]
        input_ids[i, 0, :len(with_cont1)] = with_cont1
        input_ids[i, 1, :len(with_cont2)] = with_cont2
        mc_token_mask[i, 0, len(with_cont1) - 1] = 1
        mc_token_mask[i, 1, len(with_cont2) - 1] = 1
        lm_labels[i, 0, :len(with_cont1) - 1] = with_cont1[1:]
        lm_labels[i, 1, :len(with_cont2) - 1] = with_cont2[1:]
        mc_labels[i] = mc_label
    all_inputs = (input_ids, mc_token_mask, lm_labels, mc_labels)
    all_input_tensors = list(torch.tensor(t) for t in all_inputs)
    return all_input_tensors
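
# Toy illustration with made-up token ids (40 = padded input length, 18 = cap on
# story/continuation length, 1/2/3 = the special token ids; all hypothetical):
#
#   encoded = [([10, 11, 12], [13, 14], [15], 0)]   # (story, cont1, cont2, label)
#   ids, mc_mask, lm_lab, mc_lab = pre_process_dataset(encoded, 40, 18, 1, 2, 3)
#   # ids.shape == (1, 2, 40): one instance, two candidate endings, padded to 40
#   # mc_mask[0, 0] marks the position of the _classify_ token of the first ending
#   # mc_lab[0] == 0: the first ending is the correct one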


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument('--do_train', action='store_true', help='Whether to run training.')
    parser.add_argument('--do_eval', action='store_true', help='Whether to run evaluation.')
    parser.add_argument('--dataset_path', type=str, default='',
                        help='Path to the RocStories dataset (csv file).')
    parser.add_argument('--data_dir', type=str, default='data/')
    parser.add_argument('--output_dir', type=str, default='log/',
                        help='Directory where the fine-tuned model and eval results are written.')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=float, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.5)
    parser.add_argument('--n_valid', type=int, default=374)
    args = parser.parse_args()
    print(args)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: %s, n_gpu: %d", device, n_gpu)

    # Load the tokenizer and the model.
    # These loading functions also add new tokens and embeddings, called `special tokens`.
    # The new embeddings will be fine-tuned on the RocStories dataset.
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name, special_tokens=special_tokens)
    special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens))
    model.to(device)
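
    # Each candidate ending is scored from a single sequence laid out as
    #   _start_ <story> _delimiter_ <continuation> _classify_
    # with the position of the _classify_ token recorded in mc_token_mask for the
    # multiple-choice classification head (see pre_process_dataset above).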

    # Load the dataset and prepare the inputs
    logger.info("Encoding dataset...")
    dataset = load_rocstories_dataset(args.dataset_path)
    # The last field of each instance is the integer label; only the three text fields are tokenized and encoded.
    tokenized_dataset = list(list(tokenizer.tokenize(x) for x in instance[:-1]) + [instance[-1]]
                             for instance in dataset)
    encoded_dataset = list(list(tokenizer.convert_tokens_to_ids(x) for x in instance[:-1]) + [instance[-1]]
                           for instance in tokenized_dataset)

    max_input_length = max(len(story) + max(len(cont1), len(cont2)) + 3 for story, cont1, cont2, _ in encoded_dataset)
    max_input_length = min(max_input_length, model.config.n_positions)  # Max size of input for the pre-trained model
    max_sub_part_length = max_input_length // 2 - 2

    # Prepare the dataloader
    dataset_tensors = pre_process_dataset(encoded_dataset, max_input_length, max_sub_part_length, *special_tokens_ids)
    train_data = TensorDataset(*dataset_tensors)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    # Prepare the optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    # Total number of optimization steps over all epochs; the learning rate schedule needs the full count
    num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=args.learning_rate,
                           warmup=args.warmup_proportion,
                           max_grad_norm=args.max_grad_norm,
                           weight_decay=args.weight_decay,
                           t_total=num_train_optimization_steps)
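
    # With the default 'warmup_linear' schedule, OpenAIAdam ramps the learning rate up
    # linearly over the first warmup_proportion * t_total updates and then decays it
    # linearly, which is why t_total above is computed over all training epochs.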

    if args.do_train:
        global_step = 0
        nb_tr_steps = 0
        tr_loss = 0
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_mask, lm_labels, mc_labels = batch
                losses = model(input_ids, mc_token_mask, lm_labels, mc_labels)
                loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                # Update the parameters and reset the gradients after each batch
                optimizer.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                global_step += 1

    # Save the trained model
    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    if args.do_train:
        os.makedirs(args.output_dir, exist_ok=True)
        torch.save(model_to_save.state_dict(), output_model_file)

    # Load the fine-tuned weights back into the model
    model_state_dict = torch.load(output_model_file)
    model.load_state_dict(model_state_dict)
    model.to(device)

    if args.do_eval:
        # Evaluate on the pre-processed RocStories tensors built above. Since this
        # script loads a single dataset file, evaluation runs on the same examples
        # that were used for fine-tuning.
        eval_data = TensorDataset(*dataset_tensors)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_data))
        logger.info("  Batch size = %d", args.eval_batch_size)
        # Run prediction over the full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_mask, lm_labels, mc_labels = batch

            with torch.no_grad():
                # With labels the model returns the (lm, mc) losses; without labels it returns the logits
                _, mc_loss = model(input_ids, mc_token_mask, lm_labels, mc_labels)
                _, mc_logits = model(input_ids, mc_token_mask)

            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)

            eval_loss += mc_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy,
                  'global_step': global_step if args.do_train else 0,
                  'loss': loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))