Add POS tagging and Phrase chunking token classification examples (#6457)

* Add more token classification examples

* POS tagging example

* Phrase chunking example

* PR review fixes

* Add conllu to third party list (used in token classification examples)
vblagoje 2020-08-13 12:09:51 -04:00 committed by GitHub
parent f51161e230
commit eda07efaa5
10 changed files with 473 additions and 204 deletions

@ -15,3 +15,4 @@ pandas
nlp
fire
pytest
conllu

examples/token-classification/run.sh Normal file → Executable file
@ -18,6 +18,7 @@ export SAVE_STEPS=750
export SEED=1
python3 run_ner.py \
--task_type NER \
--data_dir . \
--labels ./labels.txt \
--model_name_or_path $BERT_MODEL \

@ -0,0 +1,37 @@
if ! [ -f ./dev.txt ]; then
echo "Downloading CONLL2003 dev dataset...."
curl -L -o ./dev.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/valid.txt'
fi
if ! [ -f ./test.txt ]; then
echo "Downloading CONLL2003 test dataset...."
curl -L -o ./test.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/test.txt'
fi
if ! [ -f ./train.txt ]; then
echo "Downloading CONLL2003 train dataset...."
curl -L -o ./train.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/train.txt'
fi
export MAX_LENGTH=200
export BERT_MODEL=bert-base-uncased
export OUTPUT_DIR=chunker-model
export BATCH_SIZE=32
export NUM_EPOCHS=3
export SAVE_STEPS=750
export SEED=1
python3 run_ner.py \
--task_type Chunk \
--data_dir . \
--model_name_or_path $BERT_MODEL \
--output_dir $OUTPUT_DIR \
--max_seq_length $MAX_LENGTH \
--num_train_epochs $NUM_EPOCHS \
--per_gpu_train_batch_size $BATCH_SIZE \
--save_steps $SAVE_STEPS \
--seed $SEED \
--do_train \
--do_eval \
--do_predict
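
For orientation (not part of the diff), the CoNLL-2003 files downloaded above use four whitespace-separated columns per token (word, POS tag, chunk tag, NER tag), and the Chunk task reads the chunk label from the second-to-last column. A minimal sketch with illustrative lines:

# Minimal sketch; the example lines illustrate the CoNLL-2003 layout and are not quoted from the dataset files.
sample_lines = [
    "U.N. NNP I-NP I-ORG",
    "official NN I-NP O",
    "Ekeus NNP I-NP I-PER",
    "heads VBZ I-VP O",
]
for line in sample_lines:
    splits = line.split(" ")
    word, chunk_label = splits[0], splits[-2]  # same column the Chunk task uses (label_idx=-2)
    print(word, chunk_label)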

@ -14,16 +14,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
""" Fine-tuning the library models for named entity recognition on CoNLL-2003. """
import logging
import os
import sys
from dataclasses import dataclass, field
from importlib import import_module
from typing import Dict, List, Optional, Tuple
import numpy as np
from seqeval.metrics import f1_score, precision_score, recall_score
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
from torch import nn
from transformers import (
@ -36,7 +35,7 @@ from transformers import (
TrainingArguments,
set_seed,
)
from utils_ner import NerDataset, Split, get_labels
from utils_ner import Split, TokenClassificationDataset, TokenClassificationTask
logger = logging.getLogger(__name__)
@ -54,6 +53,9 @@ class ModelArguments:
config_name: Optional[str] = field(
default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
)
task_type: Optional[str] = field(
default="NER", metadata={"help": "Task type to fine tune in training (e.g. NER, POS, etc)"}
)
tokenizer_name: Optional[str] = field(
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
)
@ -113,6 +115,16 @@ def main():
f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
)
module = import_module("tasks")
try:
token_classification_task_clazz = getattr(module, model_args.task_type)
token_classification_task: TokenClassificationTask = token_classification_task_clazz()
except AttributeError:
raise ValueError(
f"Task {model_args.task_type} needs to be defined as a TokenClassificationTask subclass in {module}. "
f"Available tasks classes are: {TokenClassificationTask.__subclasses__()}"
)
# Setup logging
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@ -133,7 +145,7 @@ def main():
set_seed(training_args.seed)
# Prepare CONLL-2003 task
labels = get_labels(data_args.labels)
labels = token_classification_task.get_labels(data_args.labels)
label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)}
num_labels = len(labels)
@ -164,7 +176,8 @@ def main():
# Get datasets
train_dataset = (
NerDataset(
TokenClassificationDataset(
token_classification_task=token_classification_task,
data_dir=data_args.data_dir,
tokenizer=tokenizer,
labels=labels,
@ -177,7 +190,8 @@ def main():
else None
)
eval_dataset = (
NerDataset(
TokenClassificationDataset(
token_classification_task=token_classification_task,
data_dir=data_args.data_dir,
tokenizer=tokenizer,
labels=labels,
@ -209,6 +223,7 @@ def main():
def compute_metrics(p: EvalPrediction) -> Dict:
preds_list, out_label_list = align_predictions(p.predictions, p.label_ids)
return {
"accuracy_score": accuracy_score(out_label_list, preds_list),
"precision": precision_score(out_label_list, preds_list),
"recall": recall_score(out_label_list, preds_list),
"f1": f1_score(out_label_list, preds_list),
@ -253,7 +268,8 @@ def main():
# Predict
if training_args.do_predict:
test_dataset = NerDataset(
test_dataset = TokenClassificationDataset(
token_classification_task=token_classification_task,
data_dir=data_args.data_dir,
tokenizer=tokenizer,
labels=labels,
@ -278,19 +294,7 @@ def main():
if trainer.is_world_master():
with open(output_test_predictions_file, "w") as writer:
with open(os.path.join(data_args.data_dir, "test.txt"), "r") as f:
example_id = 0
for line in f:
if line.startswith("-DOCSTART-") or line == "" or line == "\n":
writer.write(line)
if not preds_list[example_id]:
example_id += 1
elif preds_list[example_id]:
output_line = line.split()[0] + " " + preds_list[example_id].pop(0) + "\n"
writer.write(output_line)
else:
logger.warning(
"Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0]
)
token_classification_task.write_predictions_to_file(writer, f, preds_list)
return results
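
As an aside (not in the diff), the seqeval functions wired into compute_metrics above take lists of tag sequences; a toy illustration with made-up gold and predicted BIO sequences:

from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score

# Two made-up "sentences" of gold and predicted tags.
out_label_list = [["B-PER", "I-PER", "O"], ["B-LOC", "O"]]
preds_list = [["B-PER", "I-PER", "O"], ["O", "O"]]

print(accuracy_score(out_label_list, preds_list))   # token-level accuracy (0.8 here)
print(precision_score(out_label_list, preds_list))  # entity-level precision (1.0 here)
print(recall_score(out_label_list, preds_list))     # entity-level recall (0.5 here)
print(f1_score(out_label_list, preds_list))         # entity-level F1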

@ -2,15 +2,17 @@ import argparse
import glob
import logging
import os
from argparse import Namespace
from importlib import import_module
import numpy as np
import torch
from seqeval.metrics import f1_score, precision_score, recall_score
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, TensorDataset
from lightning_base import BaseTransformer, add_generic_args, generic_train
from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file
from utils_ner import TokenClassificationTask
logger = logging.getLogger(__name__)
@ -24,10 +26,20 @@ class NERTransformer(BaseTransformer):
mode = "token-classification"
def __init__(self, hparams):
self.labels = get_labels(hparams.labels)
num_labels = len(self.labels)
if type(hparams) == dict:
hparams = Namespace(**hparams)
module = import_module("tasks")
try:
token_classification_task_clazz = getattr(module, hparams.task_type)
self.token_classification_task: TokenClassificationTask = token_classification_task_clazz()
except AttributeError:
raise ValueError(
f"Task {hparams.task_type} needs to be defined as a TokenClassificationTask subclass in {module}. "
f"Available tasks classes are: {TokenClassificationTask.__subclasses__()}"
)
self.labels = self.token_classification_task.get_labels(hparams.labels)
self.pad_token_label_id = CrossEntropyLoss().ignore_index
super().__init__(hparams, num_labels, self.mode)
super().__init__(hparams, len(self.labels), self.mode)
def forward(self, **inputs):
return self.model(**inputs)
@ -42,8 +54,8 @@ class NERTransformer(BaseTransformer):
outputs = self(**inputs)
loss = outputs[0]
tensorboard_logs = {"loss": loss, "rate": self.lr_scheduler.get_last_lr()[-1]}
return {"loss": loss, "log": tensorboard_logs}
# tensorboard_logs = {"loss": loss, "rate": self.lr_scheduler.get_last_lr()[-1]}
return {"loss": loss}
def prepare_data(self):
"Called to initialize data. Use the call to construct features"
@ -55,8 +67,8 @@ class NERTransformer(BaseTransformer):
features = torch.load(cached_features_file)
else:
logger.info("Creating features from dataset file at %s", args.data_dir)
examples = read_examples_from_file(args.data_dir, mode)
features = convert_examples_to_features(
examples = self.token_classification_task.read_examples_from_file(args.data_dir, mode)
features = self.token_classification_task.convert_examples_to_features(
examples,
self.labels,
args.max_seq_length,
@ -74,7 +86,7 @@ class NERTransformer(BaseTransformer):
logger.info("Saving features into cached file %s", cached_features_file)
torch.save(features, cached_features_file)
def load_dataset(self, mode, batch_size):
def get_dataloader(self, mode: int, batch_size: int) -> DataLoader:
"Load datasets. Called after prepare data."
cached_features_file = self._feature_file(mode)
logger.info("Loading features from cached file %s", cached_features_file)
@ -124,6 +136,7 @@ class NERTransformer(BaseTransformer):
results = {
"val_loss": val_loss_mean,
"accuracy_score": accuracy_score(out_label_list, preds_list),
"precision": precision_score(out_label_list, preds_list),
"recall": recall_score(out_label_list, preds_list),
"f1": f1_score(out_label_list, preds_list),
@ -154,6 +167,9 @@ class NERTransformer(BaseTransformer):
def add_model_specific_args(parser, root_dir):
# Add NER specific options
BaseTransformer.add_model_specific_args(parser, root_dir)
parser.add_argument(
"--task_type", default="NER", type=str, help="Task type to fine tune in training (e.g. NER, POS, etc)"
)
parser.add_argument(
"--max_seq_length",
default=128,

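One detail worth flagging: the new __init__ above converts hparams to a Namespace when it arrives as a plain dict, as can happen when Lightning restores hyperparameters from a checkpoint, so attribute access keeps working downstream. A tiny sketch with placeholder values:

from argparse import Namespace

# Placeholder hparams dict, e.g. as restored from a checkpoint.
hparams = {"task_type": "POS", "labels": "", "max_seq_length": 128}
if type(hparams) == dict:
    hparams = Namespace(**hparams)
print(hparams.task_type)  # attribute access, just as with argparse-produced hparams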
@ -0,0 +1,37 @@
if ! [ -f ./dev.txt ]; then
echo "Download dev dataset...."
curl -L -o ./dev.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-dev.conllu'
fi
if ! [ -f ./test.txt ]; then
echo "Download test dataset...."
curl -L -o ./test.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-test.conllu'
fi
if ! [ -f ./train.txt ]; then
echo "Download train dataset...."
curl -L -o ./train.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-train.conllu'
fi
export MAX_LENGTH=200
export BERT_MODEL=bert-base-uncased
export OUTPUT_DIR=postagger-model
export BATCH_SIZE=32
export NUM_EPOCHS=3
export SAVE_STEPS=750
export SEED=1
python3 run_ner.py \
--task_type POS \
--data_dir . \
--model_name_or_path $BERT_MODEL \
--output_dir $OUTPUT_DIR \
--max_seq_length $MAX_LENGTH \
--num_train_epochs $NUM_EPOCHS \
--per_gpu_train_batch_size $BATCH_SIZE \
--save_steps $SAVE_STEPS \
--seed $SEED \
--do_train \
--do_eval \
--do_predict
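
Note that the files downloaded above are CoNLL-U data saved under .txt names; the POS task reads them with the conllu package. A minimal sketch (the sentence is made up, not taken from UD_English-EWT; field keys follow the ones the POS task uses) of how parse_incr yields the "form" and "upos" values the task consumes:

from io import StringIO

from conllu import parse_incr

# Three tokens in the 10-column CoNLL-U layout: ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC.
rows = [
    ["1", "I", "I", "PRON", "PRP", "_", "2", "nsubj", "_", "_"],
    ["2", "love", "love", "VERB", "VBP", "_", "0", "root", "_", "_"],
    ["3", "dogs", "dog", "NOUN", "NNS", "_", "2", "obj", "_", "_"],
]
data = "\n".join("\t".join(row) for row in rows) + "\n\n"

for sentence in parse_incr(StringIO(data)):
    words = [token["form"] for token in sentence]
    labels = [token["upos"] for token in sentence]
    print(words, labels)  # ['I', 'love', 'dogs'] ['PRON', 'VERB', 'NOUN']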

@ -0,0 +1,39 @@
#!/usr/bin/env bash
if ! [ -f ./dev.txt ]; then
echo "Download dev dataset...."
curl -L -o ./dev.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-dev.conllu'
fi
if ! [ -f ./test.txt ]; then
echo "Download test dataset...."
curl -L -o ./test.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-test.conllu'
fi
if ! [ -f ./train.txt ]; then
echo "Download train dataset...."
curl -L -o ./train.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-train.conllu'
fi
export MAX_LENGTH=200
export BERT_MODEL=bert-base-uncased
export OUTPUT_DIR=postagger-model
export BATCH_SIZE=32
export NUM_EPOCHS=3
export SAVE_STEPS=750
export SEED=1
# Add parent directory to python path to access lightning_base.py
export PYTHONPATH="../":"${PYTHONPATH}"
python3 run_pl_ner.py --data_dir ./ \
--task_type POS \
--model_name_or_path $BERT_MODEL \
--output_dir $OUTPUT_DIR \
--max_seq_length $MAX_LENGTH \
--num_train_epochs $NUM_EPOCHS \
--train_batch_size $BATCH_SIZE \
--seed $SEED \
--gpus 1 \
--do_train \
--do_predict

@ -0,0 +1,163 @@
import logging
import os
from typing import List, TextIO, Union
from conllu import parse_incr
from utils_ner import InputExample, Split, TokenClassificationTask
logger = logging.getLogger(__name__)
class NER(TokenClassificationTask):
def __init__(self, label_idx=-1):
# in NER datasets, the last column is usually reserved for NER label
self.label_idx = label_idx
def read_examples_from_file(self, data_dir, mode: Union[Split, str]) -> List[InputExample]:
if isinstance(mode, Split):
mode = mode.value
file_path = os.path.join(data_dir, f"{mode}.txt")
guid_index = 1
examples = []
with open(file_path, encoding="utf-8") as f:
words = []
labels = []
for line in f:
if line.startswith("-DOCSTART-") or line == "" or line == "\n":
if words:
examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels))
guid_index += 1
words = []
labels = []
else:
splits = line.split(" ")
words.append(splits[0])
if len(splits) > 1:
labels.append(splits[self.label_idx].replace("\n", ""))
else:
# Examples could have no label for mode = "test"
labels.append("O")
if words:
examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels))
return examples
def write_predictions_to_file(self, writer: TextIO, test_input_reader: TextIO, preds_list: List):
example_id = 0
for line in test_input_reader:
if line.startswith("-DOCSTART-") or line == "" or line == "\n":
writer.write(line)
if not preds_list[example_id]:
example_id += 1
elif preds_list[example_id]:
output_line = line.split()[0] + " " + preds_list[example_id].pop(0) + "\n"
writer.write(output_line)
else:
logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
def get_labels(self, path: str) -> List[str]:
if path:
with open(path, "r") as f:
labels = f.read().splitlines()
if "O" not in labels:
labels = ["O"] + labels
return labels
else:
return ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
class Chunk(NER):
def __init__(self):
# in CONLL2003 dataset chunk column is second-to-last
super().__init__(label_idx=-2)
def get_labels(self, path: str) -> List[str]:
if path:
with open(path, "r") as f:
labels = f.read().splitlines()
if "O" not in labels:
labels = ["O"] + labels
return labels
else:
return [
"O",
"B-ADVP",
"B-INTJ",
"B-LST",
"B-PRT",
"B-NP",
"B-SBAR",
"B-VP",
"B-ADJP",
"B-CONJP",
"B-PP",
"I-ADVP",
"I-INTJ",
"I-LST",
"I-PRT",
"I-NP",
"I-SBAR",
"I-VP",
"I-ADJP",
"I-CONJP",
"I-PP",
]
class POS(TokenClassificationTask):
def read_examples_from_file(self, data_dir, mode: Union[Split, str]) -> List[InputExample]:
if isinstance(mode, Split):
mode = mode.value
file_path = os.path.join(data_dir, f"{mode}.txt")
guid_index = 1
examples = []
with open(file_path, encoding="utf-8") as f:
for sentence in parse_incr(f):
words = []
labels = []
for token in sentence:
words.append(token["form"])
labels.append(token["upos"])
assert len(words) == len(labels)
if words:
examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels))
guid_index += 1
return examples
def write_predictions_to_file(self, writer: TextIO, test_input_reader: TextIO, preds_list: List):
example_id = 0
for sentence in parse_incr(test_input_reader):
s_p = preds_list[example_id]
out = ""
for token in sentence:
out += f'{token["form"]} ({token["upos"]}|{s_p.pop(0)}) '
out += "\n"
writer.write(out)
example_id += 1
def get_labels(self, path: str) -> List[str]:
if path:
with open(path, "r") as f:
return f.read().splitlines()
else:
return [
"ADJ",
"ADP",
"ADV",
"AUX",
"CCONJ",
"DET",
"INTJ",
"NOUN",
"NUM",
"PART",
"PRON",
"PROPN",
"PUNCT",
"SCONJ",
"SYM",
"VERB",
"X",
]

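Since run_ner.py and run_pl_ner.py resolve --task_type by name against this module, adding another task only requires one more TokenClassificationTask subclass here. A hypothetical sketch: the class name TsvNER and its word<TAB>label file format are invented for illustration and are not part of the PR.

# Hypothetical addition to tasks.py; reuses the imports already at the top of this file
# (os, List, Union, InputExample, Split, TokenClassificationTask).
class TsvNER(NER):
    """Reads word<TAB>label lines with a blank line between sentences; labels reuse NER.get_labels()."""

    def read_examples_from_file(self, data_dir, mode: Union[Split, str]) -> List[InputExample]:
        if isinstance(mode, Split):
            mode = mode.value
        examples, words, labels = [], [], []
        with open(os.path.join(data_dir, f"{mode}.tsv"), encoding="utf-8") as f:
            for line in f:
                line = line.rstrip("\n")
                if not line:  # blank line closes the current sentence
                    if words:
                        examples.append(InputExample(guid=f"{mode}-{len(examples) + 1}", words=words, labels=labels))
                        words, labels = [], []
                else:
                    word, label = line.split("\t")
                    words.append(word)
                    labels.append(label)
        if words:
            examples.append(InputExample(guid=f"{mode}-{len(examples) + 1}", words=words, labels=labels))
        return examples

It would then be selectable with --task_type TsvNER, with no changes needed in run_ner.py or run_pl_ner.py.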
@ -66,202 +66,15 @@ class Split(Enum):
test = "test"
if is_torch_available():
import torch
from torch import nn
from torch.utils.data.dataset import Dataset
class TokenClassificationTask:
def read_examples_from_file(self, data_dir, mode: Union[Split, str]) -> List[InputExample]:
raise NotImplementedError
class NerDataset(Dataset):
"""
This will be superseded by a framework-agnostic approach
soon.
"""
def get_labels(self, path: str) -> List[str]:
raise NotImplementedError
features: List[InputFeatures]
pad_token_label_id: int = nn.CrossEntropyLoss().ignore_index
# Use cross entropy ignore_index as padding label id so that only
# real label ids contribute to the loss later.
def __init__(
def convert_examples_to_features(
self,
data_dir: str,
tokenizer: PreTrainedTokenizer,
labels: List[str],
model_type: str,
max_seq_length: Optional[int] = None,
overwrite_cache=False,
mode: Split = Split.train,
):
# Load data features from cache or dataset file
cached_features_file = os.path.join(
data_dir, "cached_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length)),
)
# Make sure only the first process in distributed training processes the dataset,
# and the others will use the cache.
lock_path = cached_features_file + ".lock"
with FileLock(lock_path):
if os.path.exists(cached_features_file) and not overwrite_cache:
logger.info(f"Loading features from cached file {cached_features_file}")
self.features = torch.load(cached_features_file)
else:
logger.info(f"Creating features from dataset file at {data_dir}")
examples = read_examples_from_file(data_dir, mode)
# TODO clean up all this to leverage built-in features of tokenizers
self.features = convert_examples_to_features(
examples,
labels,
max_seq_length,
tokenizer,
cls_token_at_end=bool(model_type in ["xlnet"]),
# xlnet has a cls token at the end
cls_token=tokenizer.cls_token,
cls_token_segment_id=2 if model_type in ["xlnet"] else 0,
sep_token=tokenizer.sep_token,
sep_token_extra=False,
# roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
pad_on_left=bool(tokenizer.padding_side == "left"),
pad_token=tokenizer.pad_token_id,
pad_token_segment_id=tokenizer.pad_token_type_id,
pad_token_label_id=self.pad_token_label_id,
)
logger.info(f"Saving features into cached file {cached_features_file}")
torch.save(self.features, cached_features_file)
def __len__(self):
return len(self.features)
def __getitem__(self, i) -> InputFeatures:
return self.features[i]
if is_tf_available():
import tensorflow as tf
class TFNerDataset:
"""
This will be superseded by a framework-agnostic approach
soon.
"""
features: List[InputFeatures]
pad_token_label_id: int = -100
# Use cross entropy ignore_index as padding label id so that only
# real label ids contribute to the loss later.
def __init__(
self,
data_dir: str,
tokenizer: PreTrainedTokenizer,
labels: List[str],
model_type: str,
max_seq_length: Optional[int] = None,
overwrite_cache=False,
mode: Split = Split.train,
):
examples = read_examples_from_file(data_dir, mode)
# TODO clean up all this to leverage built-in features of tokenizers
self.features = convert_examples_to_features(
examples,
labels,
max_seq_length,
tokenizer,
cls_token_at_end=bool(model_type in ["xlnet"]),
# xlnet has a cls token at the end
cls_token=tokenizer.cls_token,
cls_token_segment_id=2 if model_type in ["xlnet"] else 0,
sep_token=tokenizer.sep_token,
sep_token_extra=False,
# roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
pad_on_left=bool(tokenizer.padding_side == "left"),
pad_token=tokenizer.pad_token_id,
pad_token_segment_id=tokenizer.pad_token_type_id,
pad_token_label_id=self.pad_token_label_id,
)
def gen():
for ex in self.features:
if ex.token_type_ids is None:
yield (
{"input_ids": ex.input_ids, "attention_mask": ex.attention_mask},
ex.label_ids,
)
else:
yield (
{
"input_ids": ex.input_ids,
"attention_mask": ex.attention_mask,
"token_type_ids": ex.token_type_ids,
},
ex.label_ids,
)
if "token_type_ids" not in tokenizer.model_input_names:
self.dataset = tf.data.Dataset.from_generator(
gen,
({"input_ids": tf.int32, "attention_mask": tf.int32}, tf.int64),
(
{"input_ids": tf.TensorShape([None]), "attention_mask": tf.TensorShape([None])},
tf.TensorShape([None]),
),
)
else:
self.dataset = tf.data.Dataset.from_generator(
gen,
({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
(
{
"input_ids": tf.TensorShape([None]),
"attention_mask": tf.TensorShape([None]),
"token_type_ids": tf.TensorShape([None]),
},
tf.TensorShape([None]),
),
)
def get_dataset(self):
self.dataset = self.dataset.apply(tf.data.experimental.assert_cardinality(len(self.features)))
return self.dataset
def __len__(self):
return len(self.features)
def __getitem__(self, i) -> InputFeatures:
return self.features[i]
def read_examples_from_file(data_dir, mode: Union[Split, str]) -> List[InputExample]:
if isinstance(mode, Split):
mode = mode.value
file_path = os.path.join(data_dir, f"{mode}.txt")
guid_index = 1
examples = []
with open(file_path, encoding="utf-8") as f:
words = []
labels = []
for line in f:
if line.startswith("-DOCSTART-") or line == "" or line == "\n":
if words:
examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels))
guid_index += 1
words = []
labels = []
else:
splits = line.split(" ")
words.append(splits[0])
if len(splits) > 1:
labels.append(splits[-1].replace("\n", ""))
else:
# Examples could have no label for mode = "test"
labels.append("O")
if words:
examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels))
return examples
def convert_examples_to_features(
examples: List[InputExample],
label_list: List[str],
max_seq_length: int,
@ -277,7 +90,7 @@ def convert_examples_to_features(
pad_token_label_id=-100,
sequence_a_segment_id=0,
mask_padding_with_zero=True,
) -> List[InputFeatures]:
) -> List[InputFeatures]:
""" Loads a data file into a list of `InputFeatures`
`cls_token_at_end` define the location of the CLS token:
- False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
@ -389,12 +202,169 @@ def convert_examples_to_features(
return features
def get_labels(path: str) -> List[str]:
if path:
with open(path, "r") as f:
labels = f.read().splitlines()
if "O" not in labels:
labels = ["O"] + labels
return labels
else:
return ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
if is_torch_available():
import torch
from torch import nn
from torch.utils.data.dataset import Dataset
class TokenClassificationDataset(Dataset):
"""
This will be superseded by a framework-agnostic approach
soon.
"""
features: List[InputFeatures]
pad_token_label_id: int = nn.CrossEntropyLoss().ignore_index
# Use cross entropy ignore_index as padding label id so that only
# real label ids contribute to the loss later.
def __init__(
self,
token_classification_task: TokenClassificationTask,
data_dir: str,
tokenizer: PreTrainedTokenizer,
labels: List[str],
model_type: str,
max_seq_length: Optional[int] = None,
overwrite_cache=False,
mode: Split = Split.train,
):
# Load data features from cache or dataset file
cached_features_file = os.path.join(
data_dir, "cached_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length)),
)
# Make sure only the first process in distributed training processes the dataset,
# and the others will use the cache.
lock_path = cached_features_file + ".lock"
with FileLock(lock_path):
if os.path.exists(cached_features_file) and not overwrite_cache:
logger.info(f"Loading features from cached file {cached_features_file}")
self.features = torch.load(cached_features_file)
else:
logger.info(f"Creating features from dataset file at {data_dir}")
examples = token_classification_task.read_examples_from_file(data_dir, mode)
# TODO clean up all this to leverage built-in features of tokenizers
self.features = token_classification_task.convert_examples_to_features(
examples,
labels,
max_seq_length,
tokenizer,
cls_token_at_end=bool(model_type in ["xlnet"]),
# xlnet has a cls token at the end
cls_token=tokenizer.cls_token,
cls_token_segment_id=2 if model_type in ["xlnet"] else 0,
sep_token=tokenizer.sep_token,
sep_token_extra=False,
# roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
pad_on_left=bool(tokenizer.padding_side == "left"),
pad_token=tokenizer.pad_token_id,
pad_token_segment_id=tokenizer.pad_token_type_id,
pad_token_label_id=self.pad_token_label_id,
)
logger.info(f"Saving features into cached file {cached_features_file}")
torch.save(self.features, cached_features_file)
def __len__(self):
return len(self.features)
def __getitem__(self, i) -> InputFeatures:
return self.features[i]
if is_tf_available():
import tensorflow as tf
class TFNerDataset:
"""
This will be superseded by a framework-agnostic approach
soon.
"""
features: List[InputFeatures]
pad_token_label_id: int = -100
# Use cross entropy ignore_index as padding label id so that only
# real label ids contribute to the loss later.
def __init__(
self,
token_classification_task: TokenClassificationTask,
data_dir: str,
tokenizer: PreTrainedTokenizer,
labels: List[str],
model_type: str,
max_seq_length: Optional[int] = None,
overwrite_cache=False,
mode: Split = Split.train,
):
examples = token_classification_task.read_examples_from_file(data_dir, mode)
# TODO clean up all this to leverage built-in features of tokenizers
self.features = token_classification_task.convert_examples_to_features(
examples,
labels,
max_seq_length,
tokenizer,
cls_token_at_end=bool(model_type in ["xlnet"]),
# xlnet has a cls token at the end
cls_token=tokenizer.cls_token,
cls_token_segment_id=2 if model_type in ["xlnet"] else 0,
sep_token=tokenizer.sep_token,
sep_token_extra=False,
# roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
pad_on_left=bool(tokenizer.padding_side == "left"),
pad_token=tokenizer.pad_token_id,
pad_token_segment_id=tokenizer.pad_token_type_id,
pad_token_label_id=self.pad_token_label_id,
)
def gen():
for ex in self.features:
if ex.token_type_ids is None:
yield (
{"input_ids": ex.input_ids, "attention_mask": ex.attention_mask},
ex.label_ids,
)
else:
yield (
{
"input_ids": ex.input_ids,
"attention_mask": ex.attention_mask,
"token_type_ids": ex.token_type_ids,
},
ex.label_ids,
)
if "token_type_ids" not in tokenizer.model_input_names:
self.dataset = tf.data.Dataset.from_generator(
gen,
({"input_ids": tf.int32, "attention_mask": tf.int32}, tf.int64),
(
{"input_ids": tf.TensorShape([None]), "attention_mask": tf.TensorShape([None])},
tf.TensorShape([None]),
),
)
else:
self.dataset = tf.data.Dataset.from_generator(
gen,
({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
(
{
"input_ids": tf.TensorShape([None]),
"attention_mask": tf.TensorShape([None]),
"token_type_ids": tf.TensorShape([None]),
},
tf.TensorShape([None]),
),
)
def get_dataset(self):
self.dataset = self.dataset.apply(tf.data.experimental.assert_cardinality(len(self.features)))
return self.dataset
def __len__(self):
return len(self.features)
def __getitem__(self, i) -> InputFeatures:
return self.features[i]
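
For reference (not shown in the PR), constructing the refactored dataset now means passing a task instance explicitly. A hedged usage sketch, with bert-base-uncased and the current directory as placeholder choices, assuming the CoNLL-2003 files from run.sh are in the working directory:

from transformers import AutoConfig, AutoTokenizer

from tasks import NER
from utils_ner import Split, TokenClassificationDataset

# Placeholder model and data location; any task class defined in tasks.py would work here.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
config = AutoConfig.from_pretrained("bert-base-uncased")
task = NER()
labels = task.get_labels("")  # empty path falls back to the default CoNLL-2003 label set

dataset = TokenClassificationDataset(
    token_classification_task=task,
    data_dir=".",
    tokenizer=tokenizer,
    labels=labels,
    model_type=config.model_type,
    max_seq_length=128,
    mode=Split.dev,
)
print(len(dataset), dataset[0].label_ids[:5])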

@ -5,6 +5,7 @@ include_trailing_comma = True
known_first_party = transformers
known_third_party =
absl
conllu
elasticsearch
fairseq
faiss