diff --git a/examples/hans/hans_processors.py b/examples/hans/hans_processors.py index c3cb0bdda4c..ff75a0acd18 100644 --- a/examples/hans/hans_processors.py +++ b/examples/hans/hans_processors.py @@ -18,8 +18,9 @@ import logging import os -from utils_hans import DataProcessor, InputExample, InputFeatures from transformers.file_utils import is_tf_available +from utils_hans import DataProcessor, InputExample, InputFeatures + if is_tf_available(): import tensorflow as tf @@ -27,15 +28,18 @@ if is_tf_available(): logger = logging.getLogger(__name__) -def hans_convert_examples_to_features(examples, tokenizer, - max_length=512, - task=None, - label_list=None, - output_mode=None, - pad_on_left=False, - pad_token=0, - pad_token_segment_id=0, - mask_padding_with_zero=True): +def hans_convert_examples_to_features( + examples, + tokenizer, + max_length=512, + task=None, + label_list=None, + output_mode=None, + pad_on_left=False, + pad_token=0, + pad_token_segment_id=0, + mask_padding_with_zero=True, +): """ Loads a data file into a list of ``InputFeatures`` @@ -82,12 +86,7 @@ def hans_convert_examples_to_features(examples, tokenizer, example = processor.get_example_from_tensor_dict(example) example = processor.tfds_map(example) - inputs = tokenizer.encode_plus( - example.text_a, - example.text_b, - add_special_tokens=True, - max_length=max_length, - ) + inputs = tokenizer.encode_plus(example.text_a, example.text_b, add_special_tokens=True, max_length=max_length,) input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"] # The mask has 1 for real tokens and 0 for padding tokens. Only real @@ -106,8 +105,12 @@ def hans_convert_examples_to_features(examples, tokenizer, token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length) assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length) - assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length) - assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length) + assert len(attention_mask) == max_length, "Error with input length {} vs {}".format( + len(attention_mask), max_length + ) + assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format( + len(token_type_ids), max_length + ) if output_mode == "classification": label = label_map[example.label] if example.label in label_map else 0 @@ -128,28 +131,40 @@ def hans_convert_examples_to_features(examples, tokenizer, logger.info("label: %s (id = %d)" % (example.label, label)) features.append( - InputFeatures(input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - label=label, pairID=pairID)) + InputFeatures( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + label=label, + pairID=pairID, + ) + ) if is_tf_available() and is_tf_dataset: + def gen(): for ex in features: - yield ({'input_ids': ex.input_ids, - 'attention_mask': ex.attention_mask, - 'token_type_ids': ex.token_type_ids}, - ex.label) + yield ( + { + "input_ids": ex.input_ids, + "attention_mask": ex.attention_mask, + "token_type_ids": ex.token_type_ids, + }, + ex.label, + ) - return tf.data.Dataset.from_generator(gen, - ({'input_ids': tf.int32, - 'attention_mask': tf.int32, - 'token_type_ids': tf.int32}, - tf.int64), - ({'input_ids': tf.TensorShape([None]), - 'attention_mask': tf.TensorShape([None]), - 'token_type_ids': tf.TensorShape([None])}, - tf.TensorShape([]))) + return tf.data.Dataset.from_generator( + gen, + ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64), + ( + { + "input_ids": tf.TensorShape([None]), + "attention_mask": tf.TensorShape([None]), + "token_type_ids": tf.TensorShape([None]), + }, + tf.TensorShape([]), + ), + ) return features @@ -159,21 +174,20 @@ class HansProcessor(DataProcessor): def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" - return InputExample(tensor_dict['idx'].numpy(), - tensor_dict['premise'].numpy().decode('utf-8'), - tensor_dict['hypothesis'].numpy().decode('utf-8'), - str(tensor_dict['label'].numpy())) + return InputExample( + tensor_dict["idx"].numpy(), + tensor_dict["premise"].numpy().decode("utf-8"), + tensor_dict["hypothesis"].numpy().decode("utf-8"), + str(tensor_dict["label"].numpy()), + ) def get_train_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "heuristics_train_set.txt")), "train") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "heuristics_train_set.txt")), "train") def get_dev_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "heuristics_evaluation_set.txt")), - "dev") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "heuristics_evaluation_set.txt")), "dev") def get_labels(self): """See base class.""" @@ -188,14 +202,12 @@ class HansProcessor(DataProcessor): guid = "%s-%s" % (set_type, line[0]) text_a = line[5] text_b = line[6] - pairID = line[7][2:] if line[7].startswith('ex') else line[7] - label = line[-1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label, pairID=pairID)) + pairID = line[7][2:] if line[7].startswith("ex") else line[7] + label = line[-1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label, pairID=pairID)) return examples - glue_tasks_num_labels = { "hans": 3, } @@ -207,4 +219,3 @@ glue_processors = { glue_output_modes = { "hans": "classification", } - diff --git a/examples/hans/test_hans.py b/examples/hans/test_hans.py index 242a2b17767..315ad2b2e45 100644 --- a/examples/hans/test_hans.py +++ b/examples/hans/test_hans.py @@ -19,60 +19,72 @@ from __future__ import absolute_import, division, print_function import argparse import glob +import json import logging import os import random -import json import numpy as np import torch -from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, - TensorDataset) +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset from torch.utils.data.distributed import DistributedSampler +from tqdm import tqdm, trange + +from hans_processors import glue_output_modes as output_modes +from hans_processors import glue_processors as processors +from hans_processors import hans_convert_examples_to_features as convert_examples_to_features +from transformers import ( + WEIGHTS_NAME, + AdamW, + AlbertConfig, + AlbertForSequenceClassification, + AlbertTokenizer, + BertConfig, + BertForSequenceClassification, + BertTokenizer, + DistilBertConfig, + DistilBertForSequenceClassification, + DistilBertTokenizer, + RobertaConfig, + RobertaForSequenceClassification, + RobertaTokenizer, + XLMConfig, + XLMForSequenceClassification, + XLMTokenizer, + XLNetConfig, + XLNetForSequenceClassification, + XLNetTokenizer, + get_linear_schedule_with_warmup, +) +from transformers import glue_compute_metrics as compute_metrics + try: from torch.utils.tensorboard import SummaryWriter except: from tensorboardX import SummaryWriter -from tqdm import tqdm, trange -from transformers import (WEIGHTS_NAME, BertConfig, - BertForSequenceClassification, BertTokenizer, - RobertaConfig, - RobertaForSequenceClassification, - RobertaTokenizer, - XLMConfig, XLMForSequenceClassification, - XLMTokenizer, XLNetConfig, - XLNetForSequenceClassification, - XLNetTokenizer, - DistilBertConfig, - DistilBertForSequenceClassification, - DistilBertTokenizer, - AlbertConfig, - AlbertForSequenceClassification, - AlbertTokenizer, - ) -from transformers import AdamW, get_linear_schedule_with_warmup -from transformers import glue_compute_metrics as compute_metrics -from hans_processors import glue_output_modes as output_modes -from hans_processors import glue_processors as processors -from hans_processors import hans_convert_examples_to_features as convert_examples_to_features logger = logging.getLogger(__name__) -ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig, - RobertaConfig, DistilBertConfig)), ()) +ALL_MODELS = sum( + ( + tuple(conf.pretrained_config_archive_map.keys()) + for conf in (BertConfig, XLNetConfig, XLMConfig, RobertaConfig, DistilBertConfig) + ), + (), +) MODEL_CLASSES = { - 'bert': (BertConfig, BertForSequenceClassification, BertTokenizer), - 'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer), - 'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer), - 'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer), - 'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer), - 'albert': (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer) + "bert": (BertConfig, BertForSequenceClassification, BertTokenizer), + "xlnet": (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer), + "xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer), + "roberta": (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer), + "distilbert": (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer), + "albert": (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer), } @@ -100,14 +112,19 @@ def train(args, train_dataset, model, tokenizer): t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) - no_decay = ['bias', 'LayerNorm.weight'] + no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ - {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, - {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": args.weight_decay, + }, + {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, + ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total + ) if args.fp16: try: from apex import amp @@ -121,17 +138,21 @@ def train(args, train_dataset, model, tokenizer): # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], - output_device=args.local_rank, - find_unused_parameters=True) + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True + ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) - logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", - args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) + logger.info( + " Total train batch size (w. parallel, distributed & accumulation) = %d", + args.train_batch_size + * args.gradient_accumulation_steps + * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), + ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) @@ -145,16 +166,16 @@ def train(args, train_dataset, model, tokenizer): for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(args.device) for t in batch) - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - 'labels': batch[3]} - if args.model_type != 'distilbert': - inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids + inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} + if args.model_type != "distilbert": + inputs["token_type_ids"] = ( + batch[2] if args.model_type in ["bert", "xlnet"] else None + ) # XLM, DistilBERT and RoBERTa don't use segment_ids outputs = model(**inputs) loss = outputs[0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: - loss = loss.mean() # mean() to average on multi-gpu parallel training + loss = loss.mean() # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps @@ -178,30 +199,34 @@ def train(args, train_dataset, model, tokenizer): if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: logs = {} - if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well + if ( + args.local_rank == -1 and args.evaluate_during_training + ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): - eval_key = 'eval_{}'.format(key) + eval_key = "eval_{}".format(key) logs[eval_key] = value loss_scalar = (tr_loss - logging_loss) / args.logging_steps learning_rate_scalar = scheduler.get_lr()[0] - logs['learning_rate'] = learning_rate_scalar - logs['loss'] = loss_scalar + logs["learning_rate"] = learning_rate_scalar + logs["loss"] = loss_scalar logging_loss = tr_loss for key, value in logs.items(): tb_writer.add_scalar(key, value, global_step) - #print(json.dumps({**logs, **{'step': global_step}})) + # print(json.dumps({**logs, **{'step': global_step}})) if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint - output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step)) + output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) - torch.save(args, os.path.join(output_dir, 'training_args.bin')) + torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: @@ -220,7 +245,7 @@ def train(args, train_dataset, model, tokenizer): def evaluate(args, model, tokenizer, prefix=""): # Loop to handle MNLI double evaluation (matched, mis-matched) eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,) - eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,) + eval_outputs_dirs = (args.output_dir, args.output_dir + "-MM") if args.task_name == "mnli" else (args.output_dir,) results = {} for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs): @@ -251,11 +276,11 @@ def evaluate(args, model, tokenizer, prefix=""): batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - 'labels': batch[3]} - if args.model_type != 'distilbert': - inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids + inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} + if args.model_type != "distilbert": + inputs["token_type_ids"] = ( + batch[2] if args.model_type in ["bert", "xlnet"] else None + ) # XLM, DistilBERT and RoBERTa don't use segment_ids outputs = model(**inputs) tmp_eval_loss, logits = outputs[:2] @@ -263,11 +288,11 @@ def evaluate(args, model, tokenizer, prefix=""): nb_eval_steps += 1 if preds is None: preds = logits.detach().cpu().numpy() - out_label_ids = inputs['labels'].detach().cpu().numpy() + out_label_ids = inputs["labels"].detach().cpu().numpy() pair_ids = batch[4].detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) - out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0) + out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0) pair_ids = np.append(pair_ids, batch[4].detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps @@ -280,7 +305,7 @@ def evaluate(args, model, tokenizer, prefix=""): with open(output_eval_file, "w") as writer: writer.write("pairID,gld_label\n") for pid, pred in zip(pair_ids, preds): - writer.write('ex' + str(pid) + ',' + label_list[int(pred)] + '\n') + writer.write("ex" + str(pid) + "," + label_list[int(pred)] + "\n") return results @@ -292,11 +317,15 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False): processor = processors[task]() output_mode = output_modes[task] # Load data features from cache or dataset file - cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format( - 'dev' if evaluate else 'train', - list(filter(None, args.model_name_or_path.split('/'))).pop(), - str(args.max_seq_length), - str(task))) + cached_features_file = os.path.join( + args.data_dir, + "cached_{}_{}_{}_{}".format( + "dev" if evaluate else "train", + list(filter(None, args.model_name_or_path.split("/"))).pop(), + str(args.max_seq_length), + str(task), + ), + ) label_list = processor.get_labels() @@ -305,18 +334,21 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False): features = torch.load(cached_features_file) else: logger.info("Creating features from dataset file at %s", args.data_dir) - if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']: + if task in ["mnli", "mnli-mm"] and args.model_type in ["roberta"]: # HACK(label indices are swapped in RoBERTa pretrained model) - label_list[1], label_list[2] = label_list[2], label_list[1] - examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir) - features = convert_examples_to_features(examples, - tokenizer, - label_list=label_list, - max_length=args.max_seq_length, - output_mode=output_mode, - pad_on_left=bool(args.model_type in ['xlnet']), # pad on the left for xlnet - pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], - pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0, + label_list[1], label_list[2] = label_list[2], label_list[1] + examples = ( + processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir) + ) + features = convert_examples_to_features( + examples, + tokenizer, + label_list=label_list, + max_length=args.max_seq_length, + output_mode=output_mode, + pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet + pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], + pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0, ) if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) @@ -335,7 +367,6 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False): all_labels = torch.tensor([f.label for f in features], dtype=torch.float) all_pair_ids = torch.tensor([int(f.pairID) for f in features], dtype=torch.long) - dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels, all_pair_ids) return dataset, label_list @@ -344,90 +375,149 @@ def main(): parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--data_dir", default=None, type=str, required=True, - help="The input data dir. Should contain the .tsv files (or other data files) for the task.") - parser.add_argument("--model_type", default=None, type=str, required=True, - help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) - parser.add_argument("--model_name_or_path", default=None, type=str, required=True, - help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) - parser.add_argument("--task_name", default=None, type=str, required=True, - help="The name of the task to train selected in the list: " + ", ".join(processors.keys())) - parser.add_argument("--output_dir", default=None, type=str, required=True, - help="The output directory where the model predictions and checkpoints will be written.") + parser.add_argument( + "--data_dir", + default=None, + type=str, + required=True, + help="The input data dir. Should contain the .tsv files (or other data files) for the task.", + ) + parser.add_argument( + "--model_type", + default=None, + type=str, + required=True, + help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), + ) + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + required=True, + help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), + ) + parser.add_argument( + "--task_name", + default=None, + type=str, + required=True, + help="The name of the task to train selected in the list: " + ", ".join(processors.keys()), + ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model predictions and checkpoints will be written.", + ) ## Other parameters - parser.add_argument("--config_name", default="", type=str, - help="Pretrained config name or path if not the same as model_name") - parser.add_argument("--tokenizer_name", default="", type=str, - help="Pretrained tokenizer name or path if not the same as model_name") - parser.add_argument("--cache_dir", default="", type=str, - help="Where do you want to store the pre-trained models downloaded from s3") - parser.add_argument("--max_seq_length", default=128, type=int, - help="The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded.") - parser.add_argument("--do_train", action='store_true', - help="Whether to run training.") - parser.add_argument("--do_eval", action='store_true', - help="Whether to run eval on the dev set.") - parser.add_argument("--evaluate_during_training", action='store_true', - help="Rul evaluation during training at each logging step.") - parser.add_argument("--do_lower_case", action='store_true', - help="Set this flag if you are using an uncased model.") + parser.add_argument( + "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" + ) + parser.add_argument( + "--tokenizer_name", + default="", + type=str, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--cache_dir", + default="", + type=str, + help="Where do you want to store the pre-trained models downloaded from s3", + ) + parser.add_argument( + "--max_seq_length", + default=128, + type=int, + help="The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded.", + ) + parser.add_argument("--do_train", action="store_true", help="Whether to run training.") + parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") + parser.add_argument( + "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step." + ) + parser.add_argument( + "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model." + ) - parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for training.") - parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for evaluation.") - parser.add_argument('--gradient_accumulation_steps', type=int, default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.") - parser.add_argument("--learning_rate", default=5e-5, type=float, - help="The initial learning rate for Adam.") - parser.add_argument("--weight_decay", default=0.0, type=float, - help="Weight decay if we apply some.") - parser.add_argument("--adam_epsilon", default=1e-8, type=float, - help="Epsilon for Adam optimizer.") - parser.add_argument("--max_grad_norm", default=1.0, type=float, - help="Max gradient norm.") - parser.add_argument("--num_train_epochs", default=3.0, type=float, - help="Total number of training epochs to perform.") - parser.add_argument("--max_steps", default=-1, type=int, - help="If > 0: set total number of training steps to perform. Override num_train_epochs.") - parser.add_argument("--warmup_steps", default=0, type=int, - help="Linear warmup over warmup_steps.") + parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") + parser.add_argument( + "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation." + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") + parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") + parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument( + "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform." + ) + parser.add_argument( + "--max_steps", + default=-1, + type=int, + help="If > 0: set total number of training steps to perform. Override num_train_epochs.", + ) + parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") - parser.add_argument('--logging_steps', type=int, default=50, - help="Log every X updates steps.") - parser.add_argument('--save_steps', type=int, default=50, - help="Save checkpoint every X updates steps.") - parser.add_argument("--eval_all_checkpoints", action='store_true', - help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number") - parser.add_argument("--no_cuda", action='store_true', - help="Avoid using CUDA when available") - parser.add_argument('--overwrite_output_dir', action='store_true', - help="Overwrite the content of the output directory") - parser.add_argument('--overwrite_cache', action='store_true', - help="Overwrite the cached training and evaluation sets") - parser.add_argument('--seed', type=int, default=42, - help="random seed for initialization") + parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.") + parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.") + parser.add_argument( + "--eval_all_checkpoints", + action="store_true", + help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", + ) + parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") + parser.add_argument( + "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory" + ) + parser.add_argument( + "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" + ) + parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") - parser.add_argument('--fp16', action='store_true', - help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") - parser.add_argument('--fp16_opt_level', type=str, default='O1', - help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." - "See details at https://nvidia.github.io/apex/amp.html") - parser.add_argument("--local_rank", type=int, default=-1, - help="For distributed training: local_rank") - parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") - parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") + parser.add_argument( + "--fp16", + action="store_true", + help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", + ) + parser.add_argument( + "--fp16_opt_level", + type=str, + default="O1", + help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." + "See details at https://nvidia.github.io/apex/amp.html", + ) + parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") + parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") args = parser.parse_args() - if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: - raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir)) + if ( + os.path.exists(args.output_dir) + and os.listdir(args.output_dir) + and args.do_train + and not args.overwrite_output_dir + ): + raise ValueError( + "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format( + args.output_dir + ) + ) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd + print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() @@ -439,16 +529,24 @@ def main(): else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) - torch.distributed.init_process_group(backend='nccl') + torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging - logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) - logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", - args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, + ) + logger.warning( + "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", + args.local_rank, + device, + args.n_gpu, + bool(args.local_rank != -1), + args.fp16, + ) # Set seed set_seed(args) @@ -468,17 +566,23 @@ def main(): args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, - num_labels=num_labels, - finetuning_task=args.task_name, - cache_dir=args.cache_dir if args.cache_dir else None) - tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, - do_lower_case=args.do_lower_case, - cache_dir=args.cache_dir if args.cache_dir else None) - model = model_class.from_pretrained(args.model_name_or_path, - from_tf=bool('.ckpt' in args.model_name_or_path), - config=config, - cache_dir=args.cache_dir if args.cache_dir else None) + config = config_class.from_pretrained( + args.config_name if args.config_name else args.model_name_or_path, + num_labels=num_labels, + finetuning_task=args.task_name, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + tokenizer = tokenizer_class.from_pretrained( + args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, + do_lower_case=args.do_lower_case, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + model = model_class.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + cache_dir=args.cache_dir if args.cache_dir else None, + ) if args.local_rank == 0: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab @@ -487,14 +591,12 @@ def main(): logger.info("Training/evaluation parameters %s", args) - # Training if args.do_train: train_dataset, _ = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False) global_step, tr_loss = train(args, train_dataset, model, tokenizer) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) - # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Create output directory if needed @@ -504,36 +606,39 @@ def main(): logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model - torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) + torch.save(args, os.path.join(args.output_dir, "training_args.bin")) # Load a trained model and vocabulary that you have fine-tuned model = model_class.from_pretrained(args.output_dir) tokenizer = tokenizer_class.from_pretrained(args.output_dir) model.to(args.device) - # Evaluation results = {} if args.do_eval and args.local_rank in [-1, 0]: tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) checkpoints = [args.output_dir] if args.eval_all_checkpoints: - checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) + checkpoints = list( + os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)) + ) logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: - global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" - prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else "" - + global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else "" + prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else "" + model = model_class.from_pretrained(checkpoint) model.to(args.device) result = evaluate(args, model, tokenizer, prefix=prefix) - result = dict((k + '_{}'.format(global_step), v) for k, v in result.items()) + result = dict((k + "_{}".format(global_step), v) for k, v in result.items()) results.update(result) return results diff --git a/examples/hans/utils_hans.py b/examples/hans/utils_hans.py index 1ea2db301b3..a0f908b5ba3 100644 --- a/examples/hans/utils_hans.py +++ b/examples/hans/utils_hans.py @@ -14,10 +14,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -import csv -import sys import copy +import csv import json +import sys + class InputExample(object): """ @@ -32,6 +33,7 @@ class InputExample(object): label: (Optional) string. The label of the example. This should be specified for train and dev examples, but not for test examples. """ + def __init__(self, guid, text_a, text_b=None, label=None, pairID=None): self.guid = guid self.text_a = text_a @@ -117,6 +119,6 @@ class DataProcessor(object): lines = [] for line in reader: if sys.version_info[0] == 2: - line = list(unicode(cell, 'utf-8') for cell in line) + line = list(unicode(cell, "utf-8") for cell in line) lines.append(line) return lines