Tidy Pytorch GLUE benchmark example (#23134)

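The migration to the Evaluate library for metrics was not quite complete: for non-GLUE tasks, `compute_metrics` still computed MSE and accuracy by hand instead of using the metric loaded with `evaluate.load`.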
2025-07-31 18:22:34 +06:00 · 2023-05-03 12:50:41 -07:00 · 2023-05-03 12:50:41 -07:00 · b6933d76d2
commit b6933d76d2
parent b0a78091a5
1 changed files with 6 additions and 9 deletions
--- a/examples/pytorch/text-classification/run_glue.py
+++ b/examples/pytorch/text-classification/run_glue.py
@ -486,6 +486,8 @@ def main():
    # Get the metric function
    if data_args.task_name is not None:
        metric = evaluate.load("glue", data_args.task_name)
    elif is_regression:
        metric = evaluate.load("mse")
    else:
        metric = evaluate.load("accuracy")
@ -494,15 +496,10 @@ def main():
    def compute_metrics(p: EvalPrediction):
        preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
        preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)
-        if data_args.task_name is not None:
+        result = metric.compute(predictions=preds, references=p.label_ids)
-            result = metric.compute(predictions=preds, references=p.label_ids)
+        if len(result) > 1:
-            if len(result) > 1:
+            result["combined_score"] = np.mean(list(result.values())).item()
-                result["combined_score"] = np.mean(list(result.values())).item()
+        return result
            return result
        elif is_regression:
            return {"mse": ((preds - p.label_ids) ** 2).mean().item()}
        else:
            return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}
    # Data collator will default to DataCollatorWithPadding when the tokenizer is passed to Trainer, so we change it if
    # we already did the padding.