Add POS tagging and Phrase chunking token classification examples (#6457)

* Add more token classification examples * POS tagging example * Phrase chunking example * PR review fixes * Add conllu to third party list (used in token classification examples)
2025-07-30 17:52:35 +06:00 · 2020-08-13 12:09:51 -04:00 · 2020-08-13 12:09:51 -04:00 · eda07efaa5
commit eda07efaa5
parent f51161e230
10 changed files with 473 additions and 204 deletions
--- a/examples/requirements.txt
+++ b/examples/requirements.txt
@ -15,3 +15,4 @@ pandas
 nlp
 fire
 pytest
+conllu
--- a/examples/token-classification/run.sh
+++ b/examples/token-classification/run.sh
@ -18,6 +18,7 @@ export SAVE_STEPS=750
 export SEED=1

 python3 run_ner.py \
+--task_type NER \
 --data_dir . \
 --labels ./labels.txt \
 --model_name_or_path $BERT_MODEL \
--- a/examples/token-classification/run_chunk.sh
+++ b/examples/token-classification/run_chunk.sh
@ -0,0 +1,37 @@
+if ! [ -f ./dev.txt ]; then
+  echo "Downloading CONLL2003 dev dataset...."
+  curl -L -o ./dev.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/valid.txt'
+fi
+
+if ! [ -f ./test.txt ]; then
+  echo "Downloading CONLL2003 test dataset...."
+  curl -L -o ./test.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/test.txt'
+fi
+
+if ! [ -f ./train.txt ]; then
+  echo "Downloading CONLL2003 train dataset...."
+  curl -L -o ./train.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/train.txt'
+fi
+
+export MAX_LENGTH=200
+export BERT_MODEL=bert-base-uncased
+export OUTPUT_DIR=chunker-model
+export BATCH_SIZE=32
+export NUM_EPOCHS=3
+export SAVE_STEPS=750
+export SEED=1
+
+python3 run_ner.py \
+--task_type Chunk \
+--data_dir . \
+--model_name_or_path $BERT_MODEL \
+--output_dir $OUTPUT_DIR \
+--max_seq_length  $MAX_LENGTH \
+--num_train_epochs $NUM_EPOCHS \
+--per_gpu_train_batch_size $BATCH_SIZE \
+--save_steps $SAVE_STEPS \
+--seed $SEED \
+--do_train \
+--do_eval \
+--do_predict
+
--- a/examples/token-classification/run_ner.py
+++ b/examples/token-classification/run_ner.py
@ -14,16 +14,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ Fine-tuning the library models for named entity recognition on CoNLL-2003. """
-
-
 import logging
 import os
 import sys
 from dataclasses import dataclass, field
+from importlib import import_module
 from typing import Dict, List, Optional, Tuple

 import numpy as np
-from seqeval.metrics import f1_score, precision_score, recall_score
+from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
 from torch import nn

 from transformers import (
@ -36,7 +35,7 @@ from transformers import (
    TrainingArguments,
    set_seed,
 )
-from utils_ner import NerDataset, Split, get_labels
+from utils_ner import Split, TokenClassificationDataset, TokenClassificationTask


 logger = logging.getLogger(__name__)
@ -54,6 +53,9 @@ class ModelArguments:
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
+    task_type: Optional[str] = field(
+        default="NER", metadata={"help": "Task type to fine tune in training (e.g. NER, POS, etc)"}
+    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
@ -113,6 +115,16 @@ def main():
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

+    module = import_module("tasks")
+    try:
+        token_classification_task_clazz = getattr(module, model_args.task_type)
+        token_classification_task: TokenClassificationTask = token_classification_task_clazz()
+    except AttributeError:
+        raise ValueError(
+            f"Task {model_args.task_type} needs to be defined as a TokenClassificationTask subclass in {module}. "
+            f"Available tasks classes are: {TokenClassificationTask.__subclasses__()}"
+        )
+
    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
@ -133,7 +145,7 @@ def main():
    set_seed(training_args.seed)

    # Prepare CONLL-2003 task
-    labels = get_labels(data_args.labels)
+    labels = token_classification_task.get_labels(data_args.labels)
    label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)}
    num_labels = len(labels)

@ -164,7 +176,8 @@ def main():

    # Get datasets
    train_dataset = (
-        NerDataset(
+        TokenClassificationDataset(
+            token_classification_task=token_classification_task,
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
@ -177,7 +190,8 @@ def main():
        else None
    )
    eval_dataset = (
-        NerDataset(
+        TokenClassificationDataset(
+            token_classification_task=token_classification_task,
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
@ -209,6 +223,7 @@ def main():
    def compute_metrics(p: EvalPrediction) -> Dict:
        preds_list, out_label_list = align_predictions(p.predictions, p.label_ids)
        return {
+            "accuracy_score": accuracy_score(out_label_list, preds_list),
            "precision": precision_score(out_label_list, preds_list),
            "recall": recall_score(out_label_list, preds_list),
            "f1": f1_score(out_label_list, preds_list),
@ -253,7 +268,8 @@ def main():

    # Predict
    if training_args.do_predict:
-        test_dataset = NerDataset(
+        test_dataset = TokenClassificationDataset(
+            token_classification_task=token_classification_task,
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
@ -278,19 +294,7 @@ def main():
        if trainer.is_world_master():
            with open(output_test_predictions_file, "w") as writer:
                with open(os.path.join(data_args.data_dir, "test.txt"), "r") as f:
-                    example_id = 0
-                    for line in f:
-                        if line.startswith("-DOCSTART-") or line == "" or line == "\n":
-                            writer.write(line)
-                            if not preds_list[example_id]:
-                                example_id += 1
-                        elif preds_list[example_id]:
-                            output_line = line.split()[0] + " " + preds_list[example_id].pop(0) + "\n"
-                            writer.write(output_line)
-                        else:
-                            logger.warning(
-                                "Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0]
-                            )
+                    token_classification_task.write_predictions_to_file(writer, f, preds_list)

    return results

--- a/examples/token-classification/run_pl_ner.py
+++ b/examples/token-classification/run_pl_ner.py
@ -2,15 +2,17 @@ import argparse
 import glob
 import logging
 import os
+from argparse import Namespace
+from importlib import import_module

 import numpy as np
 import torch
-from seqeval.metrics import f1_score, precision_score, recall_score
+from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
 from torch.nn import CrossEntropyLoss
 from torch.utils.data import DataLoader, TensorDataset

 from lightning_base import BaseTransformer, add_generic_args, generic_train
-from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file
+from utils_ner import TokenClassificationTask


 logger = logging.getLogger(__name__)
@ -24,10 +26,20 @@ class NERTransformer(BaseTransformer):
    mode = "token-classification"

    def __init__(self, hparams):
-        self.labels = get_labels(hparams.labels)
-        num_labels = len(self.labels)
+        if type(hparams) == dict:
+            hparams = Namespace(**hparams)
+        module = import_module("tasks")
+        try:
+            token_classification_task_clazz = getattr(module, hparams.task_type)
+            self.token_classification_task: TokenClassificationTask = token_classification_task_clazz()
+        except AttributeError:
+            raise ValueError(
+                f"Task {hparams.task_type} needs to be defined as a TokenClassificationTask subclass in {module}. "
+                f"Available tasks classes are: {TokenClassificationTask.__subclasses__()}"
+            )
+        self.labels = self.token_classification_task.get_labels(hparams.labels)
        self.pad_token_label_id = CrossEntropyLoss().ignore_index
-        super().__init__(hparams, num_labels, self.mode)
+        super().__init__(hparams, len(self.labels), self.mode)

    def forward(self, **inputs):
        return self.model(**inputs)
@ -42,8 +54,8 @@ class NERTransformer(BaseTransformer):

        outputs = self(**inputs)
        loss = outputs[0]
-        tensorboard_logs = {"loss": loss, "rate": self.lr_scheduler.get_last_lr()[-1]}
-        return {"loss": loss, "log": tensorboard_logs}
+        # tensorboard_logs = {"loss": loss, "rate": self.lr_scheduler.get_last_lr()[-1]}
+        return {"loss": loss}

    def prepare_data(self):
        "Called to initialize data. Use the call to construct features"
@ -55,8 +67,8 @@ class NERTransformer(BaseTransformer):
                features = torch.load(cached_features_file)
            else:
                logger.info("Creating features from dataset file at %s", args.data_dir)
-                examples = read_examples_from_file(args.data_dir, mode)
-                features = convert_examples_to_features(
+                examples = self.token_classification_task.read_examples_from_file(args.data_dir, mode)
+                features = self.token_classification_task.convert_examples_to_features(
                    examples,
                    self.labels,
                    args.max_seq_length,
@ -74,7 +86,7 @@ class NERTransformer(BaseTransformer):
                logger.info("Saving features into cached file %s", cached_features_file)
                torch.save(features, cached_features_file)

-    def load_dataset(self, mode, batch_size):
+    def get_dataloader(self, mode: int, batch_size: int) -> DataLoader:
        "Load datasets. Called after prepare data."
        cached_features_file = self._feature_file(mode)
        logger.info("Loading features from cached file %s", cached_features_file)
@ -124,6 +136,7 @@ class NERTransformer(BaseTransformer):

        results = {
            "val_loss": val_loss_mean,
+            "accuracy_score": accuracy_score(out_label_list, preds_list),
            "precision": precision_score(out_label_list, preds_list),
            "recall": recall_score(out_label_list, preds_list),
            "f1": f1_score(out_label_list, preds_list),
@ -154,6 +167,9 @@ class NERTransformer(BaseTransformer):
    def add_model_specific_args(parser, root_dir):
        # Add NER specific options
        BaseTransformer.add_model_specific_args(parser, root_dir)
+        parser.add_argument(
+            "--task_type", default="NER", type=str, help="Task type to fine tune in training (e.g. NER, POS, etc)"
+        )
        parser.add_argument(
            "--max_seq_length",
            default=128,
--- a/examples/token-classification/run_pos.sh
+++ b/examples/token-classification/run_pos.sh
@ -0,0 +1,37 @@
+if ! [ -f ./dev.txt ]; then
+  echo "Download dev dataset...."
+  curl -L -o ./dev.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-dev.conllu'
+fi
+
+if ! [ -f ./test.txt ]; then
+  echo "Download test dataset...."
+  curl -L -o ./test.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-test.conllu'
+fi
+
+if ! [ -f ./train.txt ]; then
+  echo "Download train dataset...."
+  curl -L -o ./train.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-train.conllu'
+fi
+
+export MAX_LENGTH=200
+export BERT_MODEL=bert-base-uncased
+export OUTPUT_DIR=postagger-model
+export BATCH_SIZE=32
+export NUM_EPOCHS=3
+export SAVE_STEPS=750
+export SEED=1
+
+python3 run_ner.py \
+--task_type POS \
+--data_dir . \
+--model_name_or_path $BERT_MODEL \
+--output_dir $OUTPUT_DIR \
+--max_seq_length  $MAX_LENGTH \
+--num_train_epochs $NUM_EPOCHS \
+--per_gpu_train_batch_size $BATCH_SIZE \
+--save_steps $SAVE_STEPS \
+--seed $SEED \
+--do_train \
+--do_eval \
+--do_predict
+
--- a/examples/token-classification/run_pos_pl.sh
+++ b/examples/token-classification/run_pos_pl.sh
@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+if ! [ -f ./dev.txt ]; then
+  echo "Download dev dataset...."
+  curl -L -o ./dev.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-dev.conllu'
+fi
+
+if ! [ -f ./test.txt ]; then
+  echo "Download test dataset...."
+  curl -L -o ./test.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-test.conllu'
+fi
+
+if ! [ -f ./train.txt ]; then
+  echo "Download train dataset...."
+  curl -L -o ./train.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-train.conllu'
+fi
+
+export MAX_LENGTH=200
+export BERT_MODEL=bert-base-uncased
+export OUTPUT_DIR=postagger-model
+export BATCH_SIZE=32
+export NUM_EPOCHS=3
+export SAVE_STEPS=750
+export SEED=1
+
+
+# Add parent directory to python path to access lightning_base.py
+export PYTHONPATH="../":"${PYTHONPATH}"
+
+python3 run_pl_ner.py --data_dir ./ \
+--task_type POS \
+--model_name_or_path $BERT_MODEL \
+--output_dir $OUTPUT_DIR \
+--max_seq_length  $MAX_LENGTH \
+--num_train_epochs $NUM_EPOCHS \
+--train_batch_size $BATCH_SIZE \
+--seed $SEED \
+--gpus 1 \
+--do_train \
+--do_predict
--- a/examples/token-classification/tasks.py
+++ b/examples/token-classification/tasks.py
@ -0,0 +1,163 @@
+import logging
+import os
+from typing import List, TextIO, Union
+
+from conllu import parse_incr
+
+from utils_ner import InputExample, Split, TokenClassificationTask
+
+
+logger = logging.getLogger(__name__)
+
+
+class NER(TokenClassificationTask):
+    def __init__(self, label_idx=-1):
+        # in NER datasets, the last column is usually reserved for NER label
+        self.label_idx = label_idx
+
+    def read_examples_from_file(self, data_dir, mode: Union[Split, str]) -> List[InputExample]:
+        if isinstance(mode, Split):
+            mode = mode.value
+        file_path = os.path.join(data_dir, f"{mode}.txt")
+        guid_index = 1
+        examples = []
+        with open(file_path, encoding="utf-8") as f:
+            words = []
+            labels = []
+            for line in f:
+                if line.startswith("-DOCSTART-") or line == "" or line == "\n":
+                    if words:
+                        examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels))
+                        guid_index += 1
+                        words = []
+                        labels = []
+                else:
+                    splits = line.split(" ")
+                    words.append(splits[0])
+                    if len(splits) > 1:
+                        labels.append(splits[self.label_idx].replace("\n", ""))
+                    else:
+                        # Examples could have no label for mode = "test"
+                        labels.append("O")
+            if words:
+                examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels))
+        return examples
+
+    def write_predictions_to_file(self, writer: TextIO, test_input_reader: TextIO, preds_list: List):
+        example_id = 0
+        for line in test_input_reader:
+            if line.startswith("-DOCSTART-") or line == "" or line == "\n":
+                writer.write(line)
+                if not preds_list[example_id]:
+                    example_id += 1
+            elif preds_list[example_id]:
+                output_line = line.split()[0] + " " + preds_list[example_id].pop(0) + "\n"
+                writer.write(output_line)
+            else:
+                logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
+
+    def get_labels(self, path: str) -> List[str]:
+        if path:
+            with open(path, "r") as f:
+                labels = f.read().splitlines()
+            if "O" not in labels:
+                labels = ["O"] + labels
+            return labels
+        else:
+            return ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
+
+
+class Chunk(NER):
+    def __init__(self):
+        # in CONLL2003 dataset chunk column is second-to-last
+        super().__init__(label_idx=-2)
+
+    def get_labels(self, path: str) -> List[str]:
+        if path:
+            with open(path, "r") as f:
+                labels = f.read().splitlines()
+            if "O" not in labels:
+                labels = ["O"] + labels
+            return labels
+        else:
+            return [
+                "O",
+                "B-ADVP",
+                "B-INTJ",
+                "B-LST",
+                "B-PRT",
+                "B-NP",
+                "B-SBAR",
+                "B-VP",
+                "B-ADJP",
+                "B-CONJP",
+                "B-PP",
+                "I-ADVP",
+                "I-INTJ",
+                "I-LST",
+                "I-PRT",
+                "I-NP",
+                "I-SBAR",
+                "I-VP",
+                "I-ADJP",
+                "I-CONJP",
+                "I-PP",
+            ]
+
+
+class POS(TokenClassificationTask):
+    def read_examples_from_file(self, data_dir, mode: Union[Split, str]) -> List[InputExample]:
+        if isinstance(mode, Split):
+            mode = mode.value
+        file_path = os.path.join(data_dir, f"{mode}.txt")
+        guid_index = 1
+        examples = []
+
+        with open(file_path, encoding="utf-8") as f:
+            for sentence in parse_incr(f):
+                words = []
+                labels = []
+                for token in sentence:
+                    words.append(token["form"])
+                    labels.append(token["upos"])
+                assert len(words) == len(labels)
+                if words:
+                    examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels))
+                    guid_index += 1
+        return examples
+
+    def write_predictions_to_file(self, writer: TextIO, test_input_reader: TextIO, preds_list: List):
+        example_id = 0
+        for sentence in parse_incr(test_input_reader):
+            s_p = preds_list[example_id]
+            out = ""
+            for token in sentence:
+                out += f'{token["form"]} ({token["upos"]}|{s_p.pop(0)}) '
+            out += "\n"
+            writer.write(out)
+            example_id += 1
+
+    def get_labels(self, path: str) -> List[str]:
+        if path:
+            with open(path, "r") as f:
+                return f.read().splitlines()
+        else:
+            return [
+                "ADJ",
+                "ADP",
+                "ADV",
+                "AUX",
+                "CCONJ",
+                "DET",
+                "INTJ",
+                "NOUN",
+                "NUM",
+                "PART",
+                "PRON",
+                "PROPN",
+                "PUNCT",
+                "SCONJ",
+                "SYM",
+                "VERB",
+                "X",
+            ]
--- a/examples/token-classification/utils_ner.py
+++ b/examples/token-classification/utils_ner.py
@ -66,12 +66,148 @@ class Split(Enum):
    test = "test"


+class TokenClassificationTask:
+    def read_examples_from_file(self, data_dir, mode: Union[Split, str]) -> List[InputExample]:
+        raise NotImplementedError
+
+    def get_labels(self, path: str) -> List[str]:
+        raise NotImplementedError
+
+    def convert_examples_to_features(
+        self,
+        examples: List[InputExample],
+        label_list: List[str],
+        max_seq_length: int,
+        tokenizer: PreTrainedTokenizer,
+        cls_token_at_end=False,
+        cls_token="[CLS]",
+        cls_token_segment_id=1,
+        sep_token="[SEP]",
+        sep_token_extra=False,
+        pad_on_left=False,
+        pad_token=0,
+        pad_token_segment_id=0,
+        pad_token_label_id=-100,
+        sequence_a_segment_id=0,
+        mask_padding_with_zero=True,
+    ) -> List[InputFeatures]:
+        """ Loads a data file into a list of `InputFeatures`
+            `cls_token_at_end` define the location of the CLS token:
+                - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
+                - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
+            `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
+        """
+        # TODO clean up all this to leverage built-in features of tokenizers
+
+        label_map = {label: i for i, label in enumerate(label_list)}
+
+        features = []
+        for (ex_index, example) in enumerate(examples):
+            if ex_index % 10_000 == 0:
+                logger.info("Writing example %d of %d", ex_index, len(examples))
+
+            tokens = []
+            label_ids = []
+            for word, label in zip(example.words, example.labels):
+                word_tokens = tokenizer.tokenize(word)
+
+                # bert-base-multilingual-cased sometimes output "nothing ([]) when calling tokenize with just a space.
+                if len(word_tokens) > 0:
+                    tokens.extend(word_tokens)
+                    # Use the real label id for the first token of the word, and padding ids for the remaining tokens
+                    label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1))
+
+            # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
+            special_tokens_count = tokenizer.num_special_tokens_to_add()
+            if len(tokens) > max_seq_length - special_tokens_count:
+                tokens = tokens[: (max_seq_length - special_tokens_count)]
+                label_ids = label_ids[: (max_seq_length - special_tokens_count)]
+
+            # The convention in BERT is:
+            # (a) For sequence pairs:
+            #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
+            #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
+            # (b) For single sequences:
+            #  tokens:   [CLS] the dog is hairy . [SEP]
+            #  type_ids:   0   0   0   0  0     0   0
+            #
+            # Where "type_ids" are used to indicate whether this is the first
+            # sequence or the second sequence. The embedding vectors for `type=0` and
+            # `type=1` were learned during pre-training and are added to the wordpiece
+            # embedding vector (and position vector). This is not *strictly* necessary
+            # since the [SEP] token unambiguously separates the sequences, but it makes
+            # it easier for the model to learn the concept of sequences.
+            #
+            # For classification tasks, the first vector (corresponding to [CLS]) is
+            # used as as the "sentence vector". Note that this only makes sense because
+            # the entire model is fine-tuned.
+            tokens += [sep_token]
+            label_ids += [pad_token_label_id]
+            if sep_token_extra:
+                # roberta uses an extra separator b/w pairs of sentences
+                tokens += [sep_token]
+                label_ids += [pad_token_label_id]
+            segment_ids = [sequence_a_segment_id] * len(tokens)
+
+            if cls_token_at_end:
+                tokens += [cls_token]
+                label_ids += [pad_token_label_id]
+                segment_ids += [cls_token_segment_id]
+            else:
+                tokens = [cls_token] + tokens
+                label_ids = [pad_token_label_id] + label_ids
+                segment_ids = [cls_token_segment_id] + segment_ids
+
+            input_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+            # The mask has 1 for real tokens and 0 for padding tokens. Only real
+            # tokens are attended to.
+            input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
+
+            # Zero-pad up to the sequence length.
+            padding_length = max_seq_length - len(input_ids)
+            if pad_on_left:
+                input_ids = ([pad_token] * padding_length) + input_ids
+                input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
+                segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
+                label_ids = ([pad_token_label_id] * padding_length) + label_ids
+            else:
+                input_ids += [pad_token] * padding_length
+                input_mask += [0 if mask_padding_with_zero else 1] * padding_length
+                segment_ids += [pad_token_segment_id] * padding_length
+                label_ids += [pad_token_label_id] * padding_length
+
+            assert len(input_ids) == max_seq_length
+            assert len(input_mask) == max_seq_length
+            assert len(segment_ids) == max_seq_length
+            assert len(label_ids) == max_seq_length
+
+            if ex_index < 5:
+                logger.info("*** Example ***")
+                logger.info("guid: %s", example.guid)
+                logger.info("tokens: %s", " ".join([str(x) for x in tokens]))
+                logger.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
+                logger.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
+                logger.info("segment_ids: %s", " ".join([str(x) for x in segment_ids]))
+                logger.info("label_ids: %s", " ".join([str(x) for x in label_ids]))
+
+            if "token_type_ids" not in tokenizer.model_input_names:
+                segment_ids = None
+
+            features.append(
+                InputFeatures(
+                    input_ids=input_ids, attention_mask=input_mask, token_type_ids=segment_ids, label_ids=label_ids
+                )
+            )
+        return features
+
+
 if is_torch_available():
    import torch
    from torch import nn
    from torch.utils.data.dataset import Dataset

-    class NerDataset(Dataset):
+    class TokenClassificationDataset(Dataset):
        """
        This will be superseded by a framework-agnostic approach
        soon.
@ -84,6 +220,7 @@ if is_torch_available():

        def __init__(
            self,
+            token_classification_task: TokenClassificationTask,
            data_dir: str,
            tokenizer: PreTrainedTokenizer,
            labels: List[str],
@ -107,9 +244,9 @@ if is_torch_available():
                    self.features = torch.load(cached_features_file)
                else:
                    logger.info(f"Creating features from dataset file at {data_dir}")
-                    examples = read_examples_from_file(data_dir, mode)
+                    examples = token_classification_task.read_examples_from_file(data_dir, mode)
                    # TODO clean up all this to leverage built-in features of tokenizers
-                    self.features = convert_examples_to_features(
+                    self.features = token_classification_task.convert_examples_to_features(
                        examples,
                        labels,
                        max_seq_length,
@ -152,6 +289,7 @@ if is_tf_available():

        def __init__(
            self,
+            token_classification_task: TokenClassificationTask,
            data_dir: str,
            tokenizer: PreTrainedTokenizer,
            labels: List[str],
@ -160,9 +298,9 @@ if is_tf_available():
            overwrite_cache=False,
            mode: Split = Split.train,
        ):
-            examples = read_examples_from_file(data_dir, mode)
+            examples = token_classification_task.read_examples_from_file(data_dir, mode)
            # TODO clean up all this to leverage built-in features of tokenizers
-            self.features = convert_examples_to_features(
+            self.features = token_classification_task.convert_examples_to_features(
                examples,
                labels,
                max_seq_length,
@ -230,171 +368,3 @@ if is_tf_available():

        def __getitem__(self, i) -> InputFeatures:
            return self.features[i]
-
-
-def read_examples_from_file(data_dir, mode: Union[Split, str]) -> List[InputExample]:
-    if isinstance(mode, Split):
-        mode = mode.value
-    file_path = os.path.join(data_dir, f"{mode}.txt")
-    guid_index = 1
-    examples = []
-    with open(file_path, encoding="utf-8") as f:
-        words = []
-        labels = []
-        for line in f:
-            if line.startswith("-DOCSTART-") or line == "" or line == "\n":
-                if words:
-                    examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels))
-                    guid_index += 1
-                    words = []
-                    labels = []
-            else:
-                splits = line.split(" ")
-                words.append(splits[0])
-                if len(splits) > 1:
-                    labels.append(splits[-1].replace("\n", ""))
-                else:
-                    # Examples could have no label for mode = "test"
-                    labels.append("O")
-        if words:
-            examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels))
-    return examples
-
-
-def convert_examples_to_features(
-    examples: List[InputExample],
-    label_list: List[str],
-    max_seq_length: int,
-    tokenizer: PreTrainedTokenizer,
-    cls_token_at_end=False,
-    cls_token="[CLS]",
-    cls_token_segment_id=1,
-    sep_token="[SEP]",
-    sep_token_extra=False,
-    pad_on_left=False,
-    pad_token=0,
-    pad_token_segment_id=0,
-    pad_token_label_id=-100,
-    sequence_a_segment_id=0,
-    mask_padding_with_zero=True,
-) -> List[InputFeatures]:
-    """ Loads a data file into a list of `InputFeatures`
-        `cls_token_at_end` define the location of the CLS token:
-            - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
-            - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
-        `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
-    """
-    # TODO clean up all this to leverage built-in features of tokenizers
-
-    label_map = {label: i for i, label in enumerate(label_list)}
-
-    features = []
-    for (ex_index, example) in enumerate(examples):
-        if ex_index % 10_000 == 0:
-            logger.info("Writing example %d of %d", ex_index, len(examples))
-
-        tokens = []
-        label_ids = []
-        for word, label in zip(example.words, example.labels):
-            word_tokens = tokenizer.tokenize(word)
-
-            # bert-base-multilingual-cased sometimes output "nothing ([]) when calling tokenize with just a space.
-            if len(word_tokens) > 0:
-                tokens.extend(word_tokens)
-                # Use the real label id for the first token of the word, and padding ids for the remaining tokens
-                label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1))
-
-        # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
-        special_tokens_count = tokenizer.num_special_tokens_to_add()
-        if len(tokens) > max_seq_length - special_tokens_count:
-            tokens = tokens[: (max_seq_length - special_tokens_count)]
-            label_ids = label_ids[: (max_seq_length - special_tokens_count)]
-
-        # The convention in BERT is:
-        # (a) For sequence pairs:
-        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
-        #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
-        # (b) For single sequences:
-        #  tokens:   [CLS] the dog is hairy . [SEP]
-        #  type_ids:   0   0   0   0  0     0   0
-        #
-        # Where "type_ids" are used to indicate whether this is the first
-        # sequence or the second sequence. The embedding vectors for `type=0` and
-        # `type=1` were learned during pre-training and are added to the wordpiece
-        # embedding vector (and position vector). This is not *strictly* necessary
-        # since the [SEP] token unambiguously separates the sequences, but it makes
-        # it easier for the model to learn the concept of sequences.
-        #
-        # For classification tasks, the first vector (corresponding to [CLS]) is
-        # used as as the "sentence vector". Note that this only makes sense because
-        # the entire model is fine-tuned.
-        tokens += [sep_token]
-        label_ids += [pad_token_label_id]
-        if sep_token_extra:
-            # roberta uses an extra separator b/w pairs of sentences
-            tokens += [sep_token]
-            label_ids += [pad_token_label_id]
-        segment_ids = [sequence_a_segment_id] * len(tokens)
-
-        if cls_token_at_end:
-            tokens += [cls_token]
-            label_ids += [pad_token_label_id]
-            segment_ids += [cls_token_segment_id]
-        else:
-            tokens = [cls_token] + tokens
-            label_ids = [pad_token_label_id] + label_ids
-            segment_ids = [cls_token_segment_id] + segment_ids
-
-        input_ids = tokenizer.convert_tokens_to_ids(tokens)
-
-        # The mask has 1 for real tokens and 0 for padding tokens. Only real
-        # tokens are attended to.
-        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
-
-        # Zero-pad up to the sequence length.
-        padding_length = max_seq_length - len(input_ids)
-        if pad_on_left:
-            input_ids = ([pad_token] * padding_length) + input_ids
-            input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
-            segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
-            label_ids = ([pad_token_label_id] * padding_length) + label_ids
-        else:
-            input_ids += [pad_token] * padding_length
-            input_mask += [0 if mask_padding_with_zero else 1] * padding_length
-            segment_ids += [pad_token_segment_id] * padding_length
-            label_ids += [pad_token_label_id] * padding_length
-
-        assert len(input_ids) == max_seq_length
-        assert len(input_mask) == max_seq_length
-        assert len(segment_ids) == max_seq_length
-        assert len(label_ids) == max_seq_length
-
-        if ex_index < 5:
-            logger.info("*** Example ***")
-            logger.info("guid: %s", example.guid)
-            logger.info("tokens: %s", " ".join([str(x) for x in tokens]))
-            logger.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
-            logger.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
-            logger.info("segment_ids: %s", " ".join([str(x) for x in segment_ids]))
-            logger.info("label_ids: %s", " ".join([str(x) for x in label_ids]))
-
-        if "token_type_ids" not in tokenizer.model_input_names:
-            segment_ids = None
-
-        features.append(
-            InputFeatures(
-                input_ids=input_ids, attention_mask=input_mask, token_type_ids=segment_ids, label_ids=label_ids
-            )
-        )
-    return features
-
-
-def get_labels(path: str) -> List[str]:
-    if path:
-        with open(path, "r") as f:
-            labels = f.read().splitlines()
-        if "O" not in labels:
-            labels = ["O"] + labels
-        return labels
-    else:
-        return ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
--- a/setup.cfg
+++ b/setup.cfg
@ -5,6 +5,7 @@ include_trailing_comma = True
 known_first_party = transformers
 known_third_party =
    absl
+    conllu
    elasticsearch
    fairseq
    faiss