# transformers/examples/run_swag.py

# 227 lines
# 8.4 KiB
# Python

# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data preparation for running BERT on the SWAG multiple-choice task."""
import pandas as pd
import logging
from pytorch_pretrained_bert.tokenization import BertTokenizer
# Configure the root logger once at import time: timestamped records at INFO
# level and above.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    level=logging.INFO,
)
# Module-level logger used by the conversion routine below.
logger = logging.getLogger(__name__)
class SwagExample(object):
    """One SWAG record: a context, a shared ending prefix and four endings."""

    def __init__(self, swag_id, context_sentence, start_ending,
                 ending_0, ending_1, ending_2, ending_3, label=None):
        """Store the raw fields of a single SWAG example.

        Args:
            swag_id: Identifier of the example.
            context_sentence: The context sentence.
            start_ending: Beginning shared by all candidate endings.
            ending_0: First candidate ending.
            ending_1: Second candidate ending.
            ending_2: Third candidate ending.
            ending_3: Fourth candidate ending.
            label: Optional label of the example; ``None`` when absent.
        """
        self.swag_id = swag_id
        self.context_sentence = context_sentence
        self.start_ending = start_ending
        # Kept in argument order so that endings[i] is ending_i.
        self.endings = [ending_0, ending_1, ending_2, ending_3]
        self.label = label

    def __str__(self):
        """Return the same text as ``__repr__``."""
        return self.__repr__()

    def __repr__(self):
        """Return a comma-separated ``name: value`` summary of the example.

        The ``label`` entry is only present when a label was provided.
        """
        named_values = [
            ("swag_id", self.swag_id),
            ("context_sentence", self.context_sentence),
            ("start_ending", self.start_ending),
        ]
        # Exactly the four candidate endings are reported.
        for position in range(4):
            named_values.append((f"ending_{position}", self.endings[position]))
        if self.label is not None:
            named_values.append(("label", self.label))
        return ", ".join(f"{name}: {value}" for name, value in named_values)
class InputFeatures(object):
    """Per-choice model inputs for one multiple-choice example."""

    def __init__(self, example_id, choices_features, label):
        """Repackage per-choice tuples as dictionaries.

        Args:
            example_id: Identifier of the originating example.
            choices_features: Iterable of 4-tuples
                ``(tokens, input_ids, input_mask, segment_ids)``, one per
                choice. The first element (tokens) is discarded.
            label: Label of the example, stored as given.
        """
        self.example_id = example_id

        per_choice = []
        for _tokens, ids, mask, segments in choices_features:
            per_choice.append({
                'input_ids': ids,
                'input_mask': mask,
                'segment_ids': segments,
            })
        self.choices_features = per_choice

        self.label = label
def read_swag_examples(input_file, is_training):
    """Read a SWAG CSV file and build one `SwagExample` per row.

    Args:
        input_file: Path (or anything `pandas.read_csv` accepts) of the CSV.
        is_training: When true, a ``label`` column is required and its value
            is attached to each example; otherwise labels are set to ``None``.

    Returns:
        A list of `SwagExample` objects in file order.

    Raises:
        ValueError: If ``is_training`` is true and there is no ``label``
            column.
    """
    frame = pd.read_csv(input_file)
    label_missing = 'label' not in frame.columns
    if is_training and label_missing:
        raise ValueError(
            "For training, the input file must contain a label column.")

    examples = []
    for _, row in frame.iterrows():
        examples.append(SwagExample(
            swag_id=row['fold-ind'],
            context_sentence=row['sent1'],
            # In the swag dataset, the common beginning of each choice is
            # stored in "sent2".
            start_ending=row['sent2'],
            ending_0=row['ending0'],
            ending_1=row['ending1'],
            ending_2=row['ending2'],
            ending_3=row['ending3'],
            label=row['label'] if is_training else None,
        ))
    return examples
def convert_examples_to_features(examples, tokenizer, max_seq_length,
                                 is_training):
    """Convert SWAG examples into padded, fixed-length `InputFeatures`.

    Swag is a multiple choice task. To perform this task using Bert,
    we use the formatting proposed in "Improving Language
    Understanding by Generative Pre-Training" and suggested by
    @jacobdevlin-google in this issue
    https://github.com/google-research/bert/issues/38.

    Each choice corresponds to a sample on which we run the
    inference. For a given Swag example, we create the 4
    following inputs (each ending is the shared `start_ending`
    followed by the choice text):
    - [CLS] context [SEP] start_ending choice_1 [SEP]
    - [CLS] context [SEP] start_ending choice_2 [SEP]
    - [CLS] context [SEP] start_ending choice_3 [SEP]
    - [CLS] context [SEP] start_ending choice_4 [SEP]
    The model will output a single value for each input. To get the
    final decision of the model, we will run a softmax over these 4
    outputs.

    Args:
        examples: Iterable of objects exposing `swag_id`,
            `context_sentence`, `start_ending`, `endings` and `label`.
        tokenizer: Object providing `tokenize(text)` and
            `convert_tokens_to_ids(tokens)`.
        max_seq_length: Exact length of every produced id/mask/segment
            list, including the three special tokens.
        is_training: Only controls whether the label of the first few
            examples is logged; the label is stored on every feature
            regardless.

    Returns:
        A list with one `InputFeatures` per example.

    Raises:
        ValueError: If `max_seq_length` cannot hold the three special
            tokens.
    """
    # [CLS], [SEP], [SEP] are always emitted, so they bound the minimum length.
    num_special_tokens = 3
    if max_seq_length < num_special_tokens:
        raise ValueError(
            f"max_seq_length must be at least {num_special_tokens} to fit "
            f"[CLS], [SEP], [SEP]; got {max_seq_length}")

    features = []
    for example_index, example in enumerate(examples):
        context_tokens = tokenizer.tokenize(example.context_sentence)
        start_ending_tokens = tokenizer.tokenize(example.start_ending)

        choices_features = []
        for ending in example.endings:
            # We create a copy of the context tokens in order to be
            # able to shrink it according to ending_tokens.
            context_tokens_choice = context_tokens[:]
            # Concatenation builds a new list, so truncation below never
            # touches `start_ending_tokens`.
            ending_tokens = start_ending_tokens + tokenizer.tokenize(ending)
            # Modifies `context_tokens_choice` and `ending_tokens` in
            # place so that the total length is at most the specified
            # length minus the special tokens.
            _truncate_seq_pair(context_tokens_choice, ending_tokens,
                               max_seq_length - num_special_tokens)

            tokens = (["[CLS]"] + context_tokens_choice + ["[SEP]"]
                      + ending_tokens + ["[SEP]"])
            # Segment 0 covers [CLS] + context + first [SEP]; segment 1
            # covers the ending + final [SEP].
            segment_ids = ([0] * (len(context_tokens_choice) + 2)
                           + [1] * (len(ending_tokens) + 1))
            input_ids = tokenizer.convert_tokens_to_ids(tokens)
            input_mask = [1] * len(input_ids)

            # Zero-pad up to the sequence length.
            padding = [0] * (max_seq_length - len(input_ids))
            input_ids += padding
            input_mask += padding
            segment_ids += padding

            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length
            assert len(segment_ids) == max_seq_length

            choices_features.append(
                (tokens, input_ids, input_mask, segment_ids))

        label = example.label
        # Log only the first few examples to keep the output readable.
        if example_index < 5:
            logger.info("*** Example ***")
            logger.info("swag_id: %s", example.swag_id)
            for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features):
                logger.info("choice: %s", choice_idx)
                logger.info("tokens: %s", ' '.join(tokens))
                logger.info("input_ids: %s", ' '.join(map(str, input_ids)))
                logger.info("input_mask: %s", ' '.join(map(str, input_mask)))
                logger.info("segment_ids: %s", ' '.join(map(str, segment_ids)))
            if is_training:
                logger.info("label: %s", label)

        features.append(
            InputFeatures(
                example_id=example.swag_id,
                choices_features=choices_features,
                label=label,
            )
        )
    return features
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
if __name__ == "__main__":
    # Manual smoke test: read the training CSV, show a few examples, then
    # convert the first 500 to features and print the first ones.
    is_training = True
    max_seq_length = 80

    examples = read_swag_examples('data/train.csv', is_training)
    print(len(examples))
    for example in examples[:5]:
        print("###########################")
        print(example)

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    features = convert_examples_to_features(examples[:500], tokenizer, max_seq_length, is_training)

    # Slice rather than index over range(10) so that an input with fewer
    # than ten examples does not raise IndexError.
    for feature in features[:10]:
        for choice_idx, choice_feature in enumerate(feature.choices_features):
            print(f'choice_idx: {choice_idx}')
            print(f'input_ids: {" ".join(map(str, choice_feature["input_ids"]))}')
            print(f'input_mask: {" ".join(map(str, choice_feature["input_mask"]))}')
            print(f'segment_ids: {" ".join(map(str, choice_feature["segment_ids"]))}')