From ae3733f06e2e4de45584d143ab37dcf38ef15e1c Mon Sep 17 00:00:00 2001
From: Luc Georges <McPatate@users.noreply.github.com>
Date: Wed, 4 Jun 2025 15:40:52 +0200
Subject: [PATCH] feat: add `repository` field to benchmarks table (#38582)

* feat: add `repository` field to benchmarks table

* fix: remove unwanted `,`
---
 .github/workflows/benchmark.yml    |  2 +-
 benchmark/benchmarks_entrypoint.py | 27 ++++++++++++++++++---------
 benchmark/init_db.sql              |  1 +
 benchmark/llama.py                 |  8 ++++++--
 4 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 6b5555097c0..42fa1b43877 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -64,7 +64,7 @@ jobs:
             commit_id=$GITHUB_SHA
           fi
           commit_msg=$(git show -s --format=%s | cut -c1-70)
-          python3 benchmark/benchmarks_entrypoint.py "$BRANCH_NAME" "$commit_id" "$commit_msg"
+          python3 benchmark/benchmarks_entrypoint.py "huggingface/transformers" "$BRANCH_NAME" "$commit_id" "$commit_msg"
         env:
           HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
           # Enable this to see debug logs
diff --git a/benchmark/benchmarks_entrypoint.py b/benchmark/benchmarks_entrypoint.py
index 6c036fdd693..71b83254b40 100644
--- a/benchmark/benchmarks_entrypoint.py
+++ b/benchmark/benchmarks_entrypoint.py
@@ -2,11 +2,11 @@ import argparse
 import importlib.util
 import logging
 import os
-from typing import Dict
 import sys
+from typing import Dict, Tuple
 
-from psycopg2.extras import Json
 from psycopg2.extensions import register_adapter
+from psycopg2.extras import Json
 
 
 register_adapter(dict, Json)
@@ -17,10 +17,13 @@ class ImportModuleException(Exception):
 
 
 class MetricsRecorder:
-    def __init__(self, connection, logger: logging.Logger, branch: str, commit_id: str, commit_msg: str):
+    def __init__(
+        self, connection, logger: logging.Logger, repository: str, branch: str, commit_id: str, commit_msg: str
+    ):
         self.conn = connection
         self.conn.autocommit = True
         self.logger = logger
+        self.repository = repository
         self.branch = branch
         self.commit_id = commit_id
         self.commit_msg = commit_msg
@@ -32,8 +35,8 @@ class MetricsRecorder:
         # gpu_name: str, model_id: str
         with self.conn.cursor() as cur:
             cur.execute(
-                "INSERT INTO benchmarks (branch, commit_id, commit_message, metadata) VALUES (%s, %s, %s, %s) RETURNING benchmark_id",
-                (self.branch, self.commit_id, self.commit_msg, metadata),
+                "INSERT INTO benchmarks (repository, branch, commit_id, commit_message, metadata) VALUES (%s, %s, %s, %s, %s) RETURNING benchmark_id",
+                (self.repository, self.branch, self.commit_id, self.commit_msg, metadata),
             )
             benchmark_id = cur.fetchone()[0]
             logger.debug(f"initialised benchmark #{benchmark_id}")
@@ -82,12 +85,18 @@ handler.setFormatter(formatter)
 logger.addHandler(handler)
 
 
-def parse_arguments():
+def parse_arguments() -> Tuple[str, str, str, str]:
     """
     Parse command line arguments for the benchmarking CLI.
     """
     parser = argparse.ArgumentParser(description="CLI for benchmarking the huggingface/transformers.")
 
+    parser.add_argument(
+        "repository",
+        type=str,
+        help="The repository name on which the benchmarking is performed.",
+    )
+
     parser.add_argument(
         "branch",
         type=str,
@@ -108,7 +117,7 @@ def parse_arguments():
 
     args = parser.parse_args()
 
-    return args.branch, args.commit_id, args.commit_msg
+    return args.repository, args.branch, args.commit_id, args.commit_msg
 
 
 def import_from_path(module_name, file_path):
@@ -125,7 +134,7 @@ def import_from_path(module_name, file_path):
 if __name__ == "__main__":
     benchmarks_folder_path = os.path.dirname(os.path.realpath(__file__))
 
-    branch, commit_id, commit_msg = parse_arguments()
+    repository, branch, commit_id, commit_msg = parse_arguments()
 
     for entry in os.scandir(benchmarks_folder_path):
         try:
@@ -136,7 +145,7 @@ if __name__ == "__main__":
             logger.debug(f"loading: {entry.name}")
             module = import_from_path(entry.name.split(".")[0], entry.path)
             logger.info(f"running benchmarks in: {entry.name}")
-            module.run_benchmark(logger, branch, commit_id, commit_msg)
+            module.run_benchmark(logger, repository, branch, commit_id, commit_msg)
         except ImportModuleException as e:
             logger.error(e)
         except Exception as e:
diff --git a/benchmark/init_db.sql b/benchmark/init_db.sql
index a7864c4af18..9a575177d72 100644
--- a/benchmark/init_db.sql
+++ b/benchmark/init_db.sql
@@ -1,5 +1,6 @@
 CREATE TABLE IF NOT EXISTS benchmarks (
   benchmark_id SERIAL PRIMARY KEY,
+  repository VARCHAR(255),
   branch VARCHAR(255),
   commit_id VARCHAR(72),
   commit_message VARCHAR(70),
diff --git a/benchmark/llama.py b/benchmark/llama.py
index 6a477de7fbf..bc60454e0af 100644
--- a/benchmark/llama.py
+++ b/benchmark/llama.py
@@ -33,11 +33,15 @@ def collect_metrics(benchmark_id, continue_metric_collection, metrics_recorder):
         sleep(0.01)
 
 
-def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100):
+def run_benchmark(
+    logger: Logger, repository: str, branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100
+):
     continue_metric_collection = Event()
     metrics_thread = None
     model_id = "meta-llama/Llama-2-7b-hf"
-    metrics_recorder = MetricsRecorder(psycopg2.connect("dbname=metrics"), logger, branch, commit_id, commit_msg)
+    metrics_recorder = MetricsRecorder(
+        psycopg2.connect("dbname=metrics"), logger, repository, branch, commit_id, commit_msg
+    )
     try:
         gpu_stats = gpustat.GPUStatCollection.new_query()
         gpu_name = gpu_stats[0]["name"]