From 46ed56cfd1544296feb73b707022149cf03f8c5e Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Thu, 14 Jan 2021 03:37:07 -0500
Subject: [PATCH] Switch metrics in run_ner to datasets (#9567)

* Switch metrics in run_ner to datasets

* Add flag to return all metrics

* Upstream (and rename) sortish_sampler

* Revert "Upstream (and rename) sortish_sampler"

This reverts commit e07d0dcf650c2bae36da011dd76c77a8bb4feb0d.
---
 examples/test_examples.py                |  2 +-
 examples/token-classification/run_ner.py | 33 ++++++++++++++++++------
 2 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/examples/test_examples.py b/examples/test_examples.py
index eb0809d0bc1..28afb3028d0 100644
--- a/examples/test_examples.py
+++ b/examples/test_examples.py
@@ -184,7 +184,7 @@ class ExamplesTests(TestCasePlus):
         with patch.object(sys, "argv", testargs):
             result = run_ner.main()
-            self.assertGreaterEqual(result["eval_accuracy_score"], 0.75)
+            self.assertGreaterEqual(result["eval_accuracy"], 0.75)
             self.assertGreaterEqual(result["eval_precision"], 0.75)
             self.assertLess(result["eval_loss"], 0.5)
 
diff --git a/examples/token-classification/run_ner.py b/examples/token-classification/run_ner.py
index 2b61e100b1e..807d2ee7c4e 100644
--- a/examples/token-classification/run_ner.py
+++ b/examples/token-classification/run_ner.py
@@ -25,8 +25,7 @@ from dataclasses import dataclass, field
 from typing import Optional
 
 import numpy as np
-from datasets import ClassLabel, load_dataset
-from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
+from datasets import ClassLabel, load_dataset, load_metric
 
 import transformers
 from transformers import (
@@ -124,6 +123,10 @@ class DataTrainingArguments:
             "one (in which case the other tokens will have a padding index)."
         },
     )
+    return_entity_level_metrics: bool = field(
+        default=False,
+        metadata={"help": "Whether to return all the entity levels during evaluation or just the overall ones."},
+    )
 
     def __post_init__(self):
         if self.dataset_name is None and self.train_file is None and self.validation_file is None:
@@ -323,6 +326,8 @@ def main():
     data_collator = DataCollatorForTokenClassification(tokenizer)
 
     # Metrics
+    metric = load_metric("seqeval")
+
    def compute_metrics(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)
@@ -337,12 +342,24 @@ def main():
             for prediction, label in zip(predictions, labels)
         ]
 
-        return {
-            "accuracy_score": accuracy_score(true_labels, true_predictions),
-            "precision": precision_score(true_labels, true_predictions),
-            "recall": recall_score(true_labels, true_predictions),
-            "f1": f1_score(true_labels, true_predictions),
-        }
+        results = metric.compute(predictions=true_predictions, references=true_labels)
+        if data_args.return_entity_level_metrics:
+            # Unpack nested dictionaries
+            final_results = {}
+            for key, value in results.items():
+                if isinstance(value, dict):
+                    for n, v in value.items():
+                        final_results[f"{key}_{n}"] = v
+                else:
+                    final_results[key] = value
+            return final_results
+        else:
+            return {
+                "precision": results["overall_precision"],
+                "recall": results["overall_recall"],
+                "f1": results["overall_f1"],
+                "accuracy": results["overall_accuracy"],
+            }
 
     # Initialize our Trainer
     trainer = Trainer(
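
Note (not part of the patch): a minimal standalone sketch of what the switch buys. The datasets "seqeval" metric returns a dict that mixes overall scalar scores with per-entity-type sub-dicts, and the new return_entity_level_metrics flag (exposed on the command line by HfArgumentParser as --return_entity_level_metrics) flattens those sub-dicts so the Trainer can log them as scalars. The label sequences below are made-up inputs for illustration; the flattening loop mirrors the one added in compute_metrics above.

    # Illustrative sketch, assuming `datasets` and `seqeval` are installed.
    from datasets import load_metric

    metric = load_metric("seqeval")
    results = metric.compute(
        predictions=[["B-PER", "I-PER", "O", "B-LOC"]],
        references=[["B-PER", "I-PER", "O", "B-LOC"]],
    )
    # `results` looks like:
    # {"PER": {"precision": 1.0, "recall": 1.0, "f1": 1.0, "number": 1},
    #  "LOC": {"precision": 1.0, "recall": 1.0, "f1": 1.0, "number": 1},
    #  "overall_precision": 1.0, "overall_recall": 1.0,
    #  "overall_f1": 1.0, "overall_accuracy": 1.0}

    # Same flattening as the new compute_metrics branch: per-entity sub-dicts
    # become flat keys such as "PER_f1" alongside the overall_* scalars.
    final_results = {}
    for key, value in results.items():
        if isinstance(value, dict):
            for n, v in value.items():
                final_results[f"{key}_{n}"] = v
        else:
            final_results[key] = value
    print(final_results)  # e.g. {"PER_precision": 1.0, ..., "overall_accuracy": 1.0}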