From 46ed56cfd1544296feb73b707022149cf03f8c5e Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Thu, 14 Jan 2021 03:37:07 -0500
Subject: [PATCH] Switch metrics in run_ner to datasets (#9567)

* Switch metrics in run_ner to datasets

* Add flag to return all metrics

* Upstream (and rename) sortish_sampler

* Revert "Upstream (and rename) sortish_sampler"

This reverts commit e07d0dcf650c2bae36da011dd76c77a8bb4feb0d.
---
 examples/test_examples.py                |  2 +-
 examples/token-classification/run_ner.py | 33 ++++++++++++++++++------
 2 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/examples/test_examples.py b/examples/test_examples.py
index eb0809d0bc1..28afb3028d0 100644
--- a/examples/test_examples.py
+++ b/examples/test_examples.py
@@ -184,7 +184,7 @@ class ExamplesTests(TestCasePlus):
         with patch.object(sys, "argv", testargs):
             result = run_ner.main()
-            self.assertGreaterEqual(result["eval_accuracy_score"], 0.75)
+            self.assertGreaterEqual(result["eval_accuracy"], 0.75)
             self.assertGreaterEqual(result["eval_precision"], 0.75)
             self.assertLess(result["eval_loss"], 0.5)
 
diff --git a/examples/token-classification/run_ner.py b/examples/token-classification/run_ner.py
index 2b61e100b1e..807d2ee7c4e 100644
--- a/examples/token-classification/run_ner.py
+++ b/examples/token-classification/run_ner.py
@@ -25,8 +25,7 @@ from dataclasses import dataclass, field
 from typing import Optional
 
 import numpy as np
-from datasets import ClassLabel, load_dataset
-from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
+from datasets import ClassLabel, load_dataset, load_metric
 
 import transformers
 from transformers import (
@@ -124,6 +123,10 @@ class DataTrainingArguments:
             "one (in which case the other tokens will have a padding index)."
         },
     )
+    return_entity_level_metrics: bool = field(
+        default=False,
+        metadata={"help": "Whether to return all the entity levels during evaluation or just the overall ones."},
+    )
 
     def __post_init__(self):
         if self.dataset_name is None and self.train_file is None and self.validation_file is None:
@@ -323,6 +326,8 @@ def main():
     data_collator = DataCollatorForTokenClassification(tokenizer)
 
     # Metrics
+    metric = load_metric("seqeval")
+
    def compute_metrics(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)
@@ -337,12 +342,24 @@ def main():
             for prediction, label in zip(predictions, labels)
         ]
 
-        return {
-            "accuracy_score": accuracy_score(true_labels, true_predictions),
-            "precision": precision_score(true_labels, true_predictions),
-            "recall": recall_score(true_labels, true_predictions),
-            "f1": f1_score(true_labels, true_predictions),
-        }
+        results = metric.compute(predictions=true_predictions, references=true_labels)
+        if data_args.return_entity_level_metrics:
+            # Unpack nested dictionaries
+            final_results = {}
+            for key, value in results.items():
+                if isinstance(value, dict):
+                    for n, v in value.items():
+                        final_results[f"{key}_{n}"] = v
+                else:
+                    final_results[key] = value
+            return final_results
+        else:
+            return {
+                "precision": results["overall_precision"],
+                "recall": results["overall_recall"],
+                "f1": results["overall_f1"],
+                "accuracy": results["overall_accuracy"],
+            }
 
     # Initialize our Trainer
     trainer = Trainer(
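
Note (not part of the patch): a minimal standalone sketch of what the switch buys. The datasets "seqeval" metric returns a dict that mixes overall scalar scores with per-entity-type sub-dicts, and the new return_entity_level_metrics flag (exposed on the command line by HfArgumentParser as --return_entity_level_metrics) flattens those sub-dicts so the Trainer can log them as scalars. The label sequences below are made-up inputs for illustration; the flattening loop mirrors the one added in compute_metrics above.

    # Illustrative sketch, assuming `datasets` and `seqeval` are installed.
    from datasets import load_metric

    metric = load_metric("seqeval")
    results = metric.compute(
        predictions=[["B-PER", "I-PER", "O", "B-LOC"]],
        references=[["B-PER", "I-PER", "O", "B-LOC"]],
    )
    # `results` looks like:
    # {"PER": {"precision": 1.0, "recall": 1.0, "f1": 1.0, "number": 1},
    #  "LOC": {"precision": 1.0, "recall": 1.0, "f1": 1.0, "number": 1},
    #  "overall_precision": 1.0, "overall_recall": 1.0,
    #  "overall_f1": 1.0, "overall_accuracy": 1.0}

    # Same flattening as the new compute_metrics branch: per-entity sub-dicts
    # become flat keys such as "PER_f1" alongside the overall_* scalars.
    final_results = {}
    for key, value in results.items():
        if isinstance(value, dict):
            for n, v in value.items():
                final_results[f"{key}_{n}"] = v
        else:
            final_results[key] = value
    print(final_results)  # e.g. {"PER_precision": 1.0, ..., "overall_accuracy": 1.0}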