feat: add benchmarks_entrypoint.py (#34495)

* feat: add `benchmarks_entrypoint.py` Adding `benchmarks_entrypoint.py` file, which will be run from the benchmarks CI. This python script will list all python files from the `benchmark/` folder and run the included `run_benchmark` function, allowing people to add new benchmarks scripts. * feat: add `MetricsRecorder` * feat: update dashboard * fix: add missing arguments to `MetricsRecorder` * feat: update dash & add datasource + `default.yml` * fix: move responsibility to create `MetricsRecorder` in bench script * fix: update incorrect datasource UID * fix: incorrect variable values * debug: benchmark entrypoint script * refactor: update log level * fix: update broken import * feat: add debug log in `MetricsRecorder` * debug: set log level to debug * fix: set connection `autocommit` to `True`
2025-07-03 04:40:06 +06:00 · 2024-12-18 18:59:07 +01:00 · 2024-12-18 18:59:07 +01:00 · 9a94dfe123
commit 9a94dfe123
parent 2c47618c1a
8 changed files with 334 additions and 169 deletions
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@ -63,7 +63,7 @@ jobs:
            commit_id=$GITHUB_SHA
          fi
          commit_msg=$(git show -s --format=%s | cut -c1-70)
-          python3 benchmark/llama.py "${{ github.head_ref || github.ref_name }}" "$commit_id" "$commit_msg"
+          python3 benchmark/benchmarks_entrypoint.py "${{ github.head_ref || github.ref_name }}" "$commit_id" "$commit_msg"
        env:
          HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
          # Enable this to see debug logs
--- a/benchmark/README.md
+++ b/benchmark/README.md
@ -0,0 +1,49 @@
+# Benchmarks
+
+You might want to add new benchmarks.
+
+You will need to define a python function named `run_benchmark` in your python file and the file must be located in this `benchmark/` directory.
+
+The expected function signature is the following:
+
+```py
+def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100):
+```
+
+## Writing metrics to the database
+
+`MetricsRecorder` is thread-safe, in the sense of the python [`Thread`](https://docs.python.org/3/library/threading.html#threading.Thread). This means you can start a background thread that reads the device measurements without blocking the main thread, which runs the model measurements.
+
+See [`llama.py`](./llama.py) for an example of this in practice.
+
+```py
+from benchmarks_entrypoint import MetricsRecorder
+import psycopg2
+
+def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100):
+    metrics_recorder = MetricsRecorder(psycopg2.connect("dbname=metrics"), logger, branch, commit_id, commit_msg)
+    benchmark_id = metrics_recorder.initialise_benchmark({"gpu_name": gpu_name, "model_id": model_id})
+    # To collect device measurements
+    metrics_recorder.collect_device_measurements(
+        benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes
+    )
+    # To collect your model measurements
+    metrics_recorder.collect_model_measurements(
+        benchmark_id,
+        {
+            "model_load_time": model_load_time,
+            "first_eager_forward_pass_time_secs": first_eager_fwd_pass_time,
+            "second_eager_forward_pass_time_secs": second_eager_fwd_pass_time,
+            "first_eager_generate_time_secs": first_eager_generate_time,
+            "second_eager_generate_time_secs": second_eager_generate_time,
+            "time_to_first_token_secs": time_to_first_token,
+            "time_to_second_token_secs": time_to_second_token,
+            "time_to_third_token_secs": time_to_third_token,
+            "time_to_next_token_mean_secs": mean_time_to_next_token,
+            "first_compile_generate_time_secs": first_compile_generate_time,
+            "second_compile_generate_time_secs": second_compile_generate_time,
+            "third_compile_generate_time_secs": third_compile_generate_time,
+            "fourth_compile_generate_time_secs": fourth_compile_generate_time,
+        },
+    )
+```
--- a/benchmark/benchmarks_entrypoint.py
+++ b/benchmark/benchmarks_entrypoint.py
@ -0,0 +1,144 @@
+import argparse
+import importlib.util
+import logging
+import os
+from typing import Dict
+import psycopg2
+import sys
+
+from psycopg2.extras import Json
+from psycopg2.extensions import register_adapter
+
+
+register_adapter(dict, Json)
+
+
+class ImportModuleException(Exception):
+    """Raised when a benchmark file cannot be loaded as a python module."""
+
+
+class MetricsRecorder:
+    """
+    Records a benchmark run and its measurements in the metrics database.
+
+    One recorder is bound to a single (branch, commit_id, commit_msg) triple. Every method opens
+    its own cursor on the shared connection and logs through the logger given to the constructor.
+    """
+
+    def __init__(self, connection, logger: logging.Logger, branch: str, commit_id: str, commit_msg: str):
+        """
+        Args:
+            connection: an open database connection exposing `cursor()`, `close()` and an
+                `autocommit` attribute (callers in this change pass `psycopg2.connect(...)`).
+            logger: logger used for the debug traces emitted by this recorder.
+            branch: branch name stored with the benchmark row.
+            commit_id: commit hash stored with the benchmark row.
+            commit_msg: commit message stored with the benchmark row.
+        """
+        self.conn = connection
+        # With autocommit on, each INSERT below is persisted on its own; no explicit commit() is issued.
+        self.conn.autocommit = True
+        self.logger = logger
+        self.branch = branch
+        self.commit_id = commit_id
+        self.commit_msg = commit_msg
+
+    def initialise_benchmark(self, metadata: Dict[str, str]) -> int:
+        """
+        Creates a new benchmark, returns the benchmark id
+
+        Args:
+            metadata: free-form description of the run, stored in the `metadata` column
+                (e.g. `{"gpu_name": ..., "model_id": ...}`).
+        """
+        with self.conn.cursor() as cur:
+            cur.execute(
+                "INSERT INTO benchmarks (branch, commit_id, commit_message, metadata) VALUES (%s, %s, %s, %s) RETURNING benchmark_id",
+                (self.branch, self.commit_id, self.commit_msg, metadata),
+            )
+            benchmark_id = cur.fetchone()[0]
+            # Use the injected logger (not the module-level one) so this method behaves like the
+            # other methods and the class does not depend on a global being defined.
+            self.logger.debug(f"initialised benchmark #{benchmark_id}")
+            return benchmark_id
+
+    def collect_device_measurements(self, benchmark_id: int, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes):
+        """
+        Collect device metrics, such as CPU & GPU usage. These are "static", as in you cannot pass arbitrary arguments to the function.
+
+        Inserts one row into `device_measurements` for the given benchmark.
+        """
+        with self.conn.cursor() as cur:
+            cur.execute(
+                "INSERT INTO device_measurements (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes) VALUES (%s, %s, %s, %s, %s)",
+                (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes),
+            )
+        self.logger.debug(
+            f"inserted device measurements for benchmark #{benchmark_id} [CPU util: {cpu_util}, mem MBs: {mem_megabytes}, GPU util: {gpu_util}, GPU mem MBs: {gpu_mem_megabytes}]"
+        )
+
+    def collect_model_measurements(self, benchmark_id: int, measurements: Dict[str, float]):
+        """
+        Inserts one row into `model_measurements` holding the whole `measurements` mapping for the given benchmark.
+        """
+        with self.conn.cursor() as cur:
+            cur.execute(
+                """
+                INSERT INTO model_measurements (
+                    benchmark_id,
+                    measurements
+                ) VALUES (%s, %s)
+                """,
+                (
+                    benchmark_id,
+                    measurements,
+                ),
+            )
+        self.logger.debug(f"inserted model measurements for benchmark #{benchmark_id}: {measurements}")
+
+    def close(self):
+        """Closes the underlying database connection."""
+        self.conn.close()
+
+
+# Console output: records are written to stdout as "[LEVEL - timestamp] message".
+formatter = logging.Formatter("[%(levelname)s - %(asctime)s] %(message)s")
+handler = logging.StreamHandler(sys.stdout)
+handler.setFormatter(formatter)
+handler.setLevel(logging.INFO)
+
+# Module logger. Both the logger and its handler are set to INFO, so `logger.debug(...)`
+# calls are dropped unless these levels are lowered.
+logger = logging.getLogger(__name__)
+logger.addHandler(handler)
+logger.setLevel(logging.INFO)
+
+
+def parse_arguments():
+    """
+    Read the three positional arguments of the benchmarking CLI.
+
+    Returns:
+        A `(branch, commit_id, commit_msg)` tuple of strings.
+    """
+    cli = argparse.ArgumentParser(description="CLI for benchmarking the huggingface/transformers.")
+
+    # Positional arguments, registered in the order they must appear on the command line.
+    positionals = (
+        ("branch", "The branch name on which the benchmarking is performed."),
+        ("commit_id", "The commit hash on which the benchmarking is performed."),
+        ("commit_msg", "The commit message associated with the commit, truncated to 70 characters."),
+    )
+    for name, help_text in positionals:
+        cli.add_argument(name, type=str, help=help_text)
+
+    parsed = cli.parse_args()
+    return parsed.branch, parsed.commit_id, parsed.commit_msg
+
+
+def import_from_path(module_name, file_path):
+    """
+    Load the python file at `file_path` as a module registered under `module_name`.
+
+    Args:
+        module_name: name under which the module is stored in `sys.modules`.
+        file_path: path of the python source file to execute.
+
+    Returns:
+        The loaded module object.
+
+    Raises:
+        ImportModuleException: if the module spec cannot be built or executing the file fails.
+            The original error is attached as `__cause__`.
+    """
+    try:
+        spec = importlib.util.spec_from_file_location(module_name, file_path)
+        module = importlib.util.module_from_spec(spec)
+        # Register before executing so the module can be found by name while it runs.
+        sys.modules[module_name] = module
+        try:
+            spec.loader.exec_module(module)
+        except BaseException:
+            # Do not leave a half-initialised module registered when its execution fails.
+            sys.modules.pop(module_name, None)
+            raise
+        return module
+    except Exception as e:
+        # Chain the cause so the original traceback is kept on the wrapping exception.
+        raise ImportModuleException(f"failed to load python module: {e}") from e
+
+
+if __name__ == "__main__":
+    # Resolve this script's real path once: it gives the folder to scan and is what the
+    # self-skip check below compares against. The raw `__file__` is not guaranteed to be an
+    # absolute, symlink-free path, while the scanned entries are built from the resolved folder.
+    entrypoint_path = os.path.realpath(__file__)
+    benchmarks_folder_path = os.path.dirname(entrypoint_path)
+
+    branch, commit_id, commit_msg = parse_arguments()
+
+    # Run `run_benchmark` from every python file that sits next to this script.
+    for entry in os.scandir(benchmarks_folder_path):
+        try:
+            if not entry.name.endswith(".py"):
+                continue
+            # Never load the entrypoint itself as a benchmark.
+            if os.path.realpath(entry.path) == entrypoint_path:
+                continue
+            logger.debug(f"loading: {entry.name}")
+            module = import_from_path(entry.name.split(".")[0], entry.path)
+            logger.info(f"running benchmarks in: {entry.name}")
+            module.run_benchmark(logger, branch, commit_id, commit_msg)
+        except ImportModuleException as e:
+            # A file that fails to import is reported and skipped; the remaining files still run.
+            logger.error(e)
+        except Exception as e:
+            # Same for a benchmark that raises (or a file without `run_benchmark`).
+            logger.error(f"error running benchmarks for {entry.name}: {e}")
--- a/benchmark/default.yml
+++ b/benchmark/default.yml
@ -0,0 +1,10 @@
+apiVersion: 1
+
+providers:
+  - name: 'Transformers Benchmarks'
+    orgId: 1
+    type: file
+    updateIntervalSeconds: 10
+    allowUiUpdates: true
+    options:
+      path: /etc/grafana/dashboards
--- a/benchmark/grafana_dashboard.json
+++ b/benchmark/grafana_dashboard.json
@ -30,7 +30,7 @@
      "title": "Go to data",
      "tooltip": "Go to data",
      "type": "link",
-      "url": "http://transformers-benchmarks.huggingface.co/d/fdz33iyzln9c0a/transformers-benchmarks?orgId=1&from=${StartTime}&to=${EndTime}"
+      "url": "http://transformers-benchmarks.hf.co/d/fdz33iyzln9c0a/transformers-benchmarks?orgId=1&from=${StartTime}&to=${EndTime}"
    }
  ],
  "liveNow": true,
@ -77,7 +77,7 @@
            "properties": [
              {
                "id": "custom.width",
-                "value": 196
+                "value": 202
              }
            ]
          },
@ -101,7 +101,7 @@
            "properties": [
              {
                "id": "custom.width",
-                "value": 581
+                "value": 524
              }
            ]
          },
@ -113,7 +113,19 @@
            "properties": [
              {
                "id": "custom.width",
-                "value": 379
+                "value": 353
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "model_id"
+            },
+            "properties": [
+              {
+                "id": "custom.width",
+                "value": 216
              }
            ]
          }
@ -143,12 +155,14 @@
      "targets": [
        {
          "datasource": {
-            "type": "grafana-postgresql-datasource"
+            "default": true,
+            "type": "grafana-postgresql-datasource",
+            "uid": "be28nkzirtb0gd"
          },
          "editorMode": "code",
          "format": "table",
          "rawQuery": true,
-          "rawSql": "SELECT commit_id as commit_id, commit_message, gpu_name, created_at AS date FROM benchmarks WHERE branch = '${branch}' ORDER BY benchmark_id DESC LIMIT ${last_n_commits};",
+          "rawSql": "SELECT commit_id, commit_message, metadata->>'gpu_name' as gpu_name, metadata->>'model_id' as model_id, created_at AS date FROM benchmarks WHERE branch = '${branch}' AND metadata->>'gpu_name' = '${gpu_name}' ORDER BY benchmark_id DESC LIMIT ${last_n_commits};",
          "refId": "A",
          "sql": {
            "columns": [
@ -306,13 +320,14 @@
      "targets": [
        {
          "datasource": {
+            "default": true,
            "type": "grafana-postgresql-datasource",
-            "uid": "bdz2yss7sxo1sc"
+            "uid": "be28nkzirtb0gd"
          },
          "editorMode": "code",
          "format": "table",
          "rawQuery": true,
-          "rawSql": "SELECT CAST(m.measurements->'first_eager_forward_pass_time_secs' AS double precision) AS first_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
+          "rawSql": "SELECT CAST(m.measurements->'first_eager_forward_pass_time_secs' AS double precision) AS first_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
          "refId": "A",
          "sql": {
            "columns": [
@ -431,13 +446,14 @@
      "targets": [
        {
          "datasource": {
+            "default": true,
            "type": "grafana-postgresql-datasource",
-            "uid": "bdz2yss7sxo1sc"
+            "uid": "be28nkzirtb0gd"
          },
          "editorMode": "code",
          "format": "table",
          "rawQuery": true,
-          "rawSql": "SELECT CAST(m.measurements->'second_eager_forward_pass_time_secs' AS double precision) AS second_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
+          "rawSql": "SELECT CAST(m.measurements->'second_eager_forward_pass_time_secs' AS double precision) AS second_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
          "refId": "A",
          "sql": {
            "columns": [
@ -565,13 +581,14 @@
      "targets": [
        {
          "datasource": {
+            "default": true,
            "type": "grafana-postgresql-datasource",
-            "uid": "bdz2yss7sxo1sc"
+            "uid": "be28nkzirtb0gd"
          },
          "editorMode": "code",
          "format": "table",
          "rawQuery": true,
-          "rawSql": "SELECT CAST(m.measurements->'time_to_first_token_secs' AS double precision) AS time_to_first_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
+          "rawSql": "SELECT CAST(m.measurements->'time_to_first_token_secs' AS double precision) AS time_to_first_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
          "refId": "A",
          "sql": {
            "columns": [
@ -686,13 +703,14 @@
      "targets": [
        {
          "datasource": {
+            "default": true,
            "type": "grafana-postgresql-datasource",
-            "uid": "bdz2yss7sxo1sc"
+            "uid": "be28nkzirtb0gd"
          },
          "editorMode": "code",
          "format": "table",
          "rawQuery": true,
-          "rawSql": "SELECT CAST(m.measurements->'time_to_second_token_secs' AS double precision) AS time_to_second_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
+          "rawSql": "SELECT CAST(m.measurements->'time_to_second_token_secs' AS double precision) AS time_to_second_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
          "refId": "A",
          "sql": {
            "columns": [
@ -807,13 +825,14 @@
      "targets": [
        {
          "datasource": {
+            "default": true,
            "type": "grafana-postgresql-datasource",
-            "uid": "bdz2yss7sxo1sc"
+            "uid": "be28nkzirtb0gd"
          },
          "editorMode": "code",
          "format": "table",
          "rawQuery": true,
-          "rawSql": "SELECT CAST(m.measurements->'time_to_third_token_secs' AS double precision) AS time_to_third_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
+          "rawSql": "SELECT CAST(m.measurements->'time_to_third_token_secs' AS double precision) AS time_to_third_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
          "refId": "A",
          "sql": {
            "columns": [
@ -928,13 +947,14 @@
      "targets": [
        {
          "datasource": {
+            "default": true,
            "type": "grafana-postgresql-datasource",
-            "uid": "bdz2yss7sxo1sc"
+            "uid": "be28nkzirtb0gd"
          },
          "editorMode": "code",
          "format": "table",
          "rawQuery": true,
-          "rawSql": "SELECT CAST(m.measurements->'time_to_next_token_mean_secs' AS double precision) AS time_to_next_token_mean_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
+          "rawSql": "SELECT CAST(m.measurements->'time_to_next_token_mean_secs' AS double precision) AS time_to_next_token_mean_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
          "refId": "A",
          "sql": {
            "columns": [
@ -1062,13 +1082,14 @@
      "targets": [
        {
          "datasource": {
+            "default": true,
            "type": "grafana-postgresql-datasource",
-            "uid": "bdz2yss7sxo1sc"
+            "uid": "be28nkzirtb0gd"
          },
          "editorMode": "code",
          "format": "table",
          "rawQuery": true,
-          "rawSql": "SELECT CAST(m.measurements->'first_compile_generate_time_secs' AS double precision) AS first_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
+          "rawSql": "SELECT CAST(m.measurements->'first_compile_generate_time_secs' AS double precision) AS first_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
          "refId": "A",
          "sql": {
            "columns": [
@ -1183,13 +1204,14 @@
      "targets": [
        {
          "datasource": {
+            "default": true,
            "type": "grafana-postgresql-datasource",
-            "uid": "bdz2yss7sxo1sc"
+            "uid": "be28nkzirtb0gd"
          },
          "editorMode": "code",
          "format": "table",
          "rawQuery": true,
-          "rawSql": "SELECT CAST(m.measurements->'second_compile_generate_time_secs' AS double precision) AS second_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
+          "rawSql": "SELECT CAST(m.measurements->'second_compile_generate_time_secs' AS double precision) AS second_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
          "refId": "A",
          "sql": {
            "columns": [
@ -1304,13 +1326,14 @@
      "targets": [
        {
          "datasource": {
+            "default": true,
            "type": "grafana-postgresql-datasource",
-            "uid": "bdz2yss7sxo1sc"
+            "uid": "be28nkzirtb0gd"
          },
          "editorMode": "code",
          "format": "table",
          "rawQuery": true,
-          "rawSql": "SELECT CAST(m.measurements->'third_compile_generate_time_secs' AS double precision) AS third_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
+          "rawSql": "SELECT CAST(m.measurements->'third_compile_generate_time_secs' AS double precision) AS third_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
          "refId": "A",
          "sql": {
            "columns": [
@ -1425,13 +1448,14 @@
      "targets": [
        {
          "datasource": {
+            "default": true,
            "type": "grafana-postgresql-datasource",
-            "uid": "bdz2yss7sxo1sc"
+            "uid": "be28nkzirtb0gd"
          },
          "editorMode": "code",
          "format": "table",
          "rawQuery": true,
-          "rawSql": "SELECT CAST(m.measurements->'fourth_compile_generate_time_secs' AS double precision) AS fourth_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
+          "rawSql": "SELECT CAST(m.measurements->'fourth_compile_generate_time_secs' AS double precision) AS fourth_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
          "refId": "A",
          "sql": {
            "columns": [
@ -1480,11 +1504,7 @@
      "id": 15,
      "panels": [
        {
-          "datasource": {
-            "default": true,
-            "type": "grafana-postgresql-datasource",
-            "uid": "be28nkzirtb0gd"
-          },
+          "datasource": {},
          "fieldConfig": {
            "defaults": {
              "color": {
@ -1528,8 +1548,7 @@
                "mode": "absolute",
                "steps": [
                  {
-                    "color": "green",
-                    "value": null
+                    "color": "green"
                  },
                  {
                    "color": "red",
@ -1563,8 +1582,9 @@
          "targets": [
            {
              "datasource": {
+                "default": true,
                "type": "grafana-postgresql-datasource",
-                "uid": "bdz2yss7sxo1sc"
+                "uid": "be28nkzirtb0gd"
              },
              "editorMode": "code",
              "format": "table",
@ -1665,11 +1685,7 @@
          "type": "timeseries"
        },
        {
-          "datasource": {
-            "default": true,
-            "type": "grafana-postgresql-datasource",
-            "uid": "be28nkzirtb0gd"
-          },
+          "datasource": {},
          "fieldConfig": {
            "defaults": {
              "color": {
@ -1713,8 +1729,7 @@
                "mode": "absolute",
                "steps": [
                  {
-                    "color": "green",
-                    "value": null
+                    "color": "green"
                  },
                  {
                    "color": "red",
@ -1748,8 +1763,9 @@
          "targets": [
            {
              "datasource": {
+                "default": true,
                "type": "grafana-postgresql-datasource",
-                "uid": "bdz2yss7sxo1sc"
+                "uid": "be28nkzirtb0gd"
              },
              "editorMode": "code",
              "format": "table",
@ -1850,11 +1866,7 @@
          "type": "timeseries"
        },
        {
-          "datasource": {
-            "default": true,
-            "type": "grafana-postgresql-datasource",
-            "uid": "be28nkzirtb0gd"
-          },
+          "datasource": {},
          "fieldConfig": {
            "defaults": {
              "color": {
@ -1898,8 +1910,7 @@
                "mode": "absolute",
                "steps": [
                  {
-                    "color": "green",
-                    "value": null
+                    "color": "green"
                  },
                  {
                    "color": "red",
@ -1933,8 +1944,9 @@
          "targets": [
            {
              "datasource": {
+                "default": true,
                "type": "grafana-postgresql-datasource",
-                "uid": "bdz2yss7sxo1sc"
+                "uid": "be28nkzirtb0gd"
              },
              "editorMode": "code",
              "format": "table",
@ -2035,11 +2047,7 @@
          "type": "timeseries"
        },
        {
-          "datasource": {
-            "default": true,
-            "type": "grafana-postgresql-datasource",
-            "uid": "be28nkzirtb0gd"
-          },
+          "datasource": {},
          "fieldConfig": {
            "defaults": {
              "color": {
@ -2083,8 +2091,7 @@
                "mode": "absolute",
                "steps": [
                  {
-                    "color": "green",
-                    "value": null
+                    "color": "green"
                  },
                  {
                    "color": "red",
@ -2118,8 +2125,9 @@
          "targets": [
            {
              "datasource": {
+                "default": true,
                "type": "grafana-postgresql-datasource",
-                "uid": "bdz2yss7sxo1sc"
+                "uid": "be28nkzirtb0gd"
              },
              "editorMode": "code",
              "format": "table",
@ -2224,7 +2232,6 @@
      "type": "row"
    }
  ],
-  "refresh": "",
  "schemaVersion": 39,
  "tags": [],
  "templating": {
@ -2236,6 +2243,7 @@
          "value": "main"
        },
        "datasource": {
+          "default": true,
          "type": "grafana-postgresql-datasource",
          "uid": "be28nkzirtb0gd"
        },
@ -2248,7 +2256,7 @@
        "name": "branch",
        "options": [],
        "query": "SELECT DISTINCT branch FROM benchmarks;",
-        "refresh": 2,
+        "refresh": 1,
        "regex": "",
        "skipUrlSync": false,
        "sort": 0,
@ -2261,6 +2269,7 @@
          "value": "1729701492845"
        },
        "datasource": {
+          "default": true,
          "type": "grafana-postgresql-datasource",
          "uid": "be28nkzirtb0gd"
        },
@ -2281,10 +2290,11 @@
      {
        "current": {
          "selected": false,
-          "text": "1730120430069",
-          "value": "1730120430069"
+          "text": "1730393397577",
+          "value": "1730393397577"
        },
        "datasource": {
+          "default": true,
          "type": "grafana-postgresql-datasource",
          "uid": "be28nkzirtb0gd"
        },
@ -2312,15 +2322,16 @@
          "type": "grafana-postgresql-datasource",
          "uid": "be28nkzirtb0gd"
        },
-        "definition": "SELECT DISTINCT gpu_name FROM benchmarks;",
+        "definition": "SELECT DISTINCT metadata->>'gpu_name' FROM benchmarks;",
+        "description": "",
        "hide": 0,
        "includeAll": false,
        "label": "GPU",
        "multi": false,
        "name": "gpu_name",
        "options": [],
-        "query": "SELECT DISTINCT gpu_name FROM benchmarks;",
-        "refresh": 2,
+        "query": "SELECT DISTINCT metadata->>'gpu_name' FROM benchmarks;",
+        "refresh": 1,
        "regex": "",
        "skipUrlSync": false,
        "sort": 0,
@ -2328,7 +2339,7 @@
      },
      {
        "current": {
-          "selected": false,
+          "selected": true,
          "text": "10",
          "value": "10"
        },
@ -2359,6 +2370,6 @@
  "timezone": "browser",
  "title": "Transformers benchmarks",
  "uid": "fdz33iyzln9c0a",
-  "version": 4,
+  "version": 10,
  "weekStart": ""
 }
--- a/benchmark/grafana_datasource.yaml
+++ b/benchmark/grafana_datasource.yaml
@ -0,0 +1,17 @@
+apiVersion: 1
+datasources:
+  - name: grafana-postgresql-datasource
+    uid: be28nkzirtb0gd
+    type: postgres
+    url: $GRAFANA_POSTGRES_DATASOURCE_URL
+    user: $GRAFANA_POSTGRES_DATASOURCE_USER
+    secureJsonData:
+      password: $GRAFANA_POSTGRES_DATASOURCE_PWD
+    jsonData:
+      database: metrics
+      maxOpenConns: 100
+      maxIdleConns: 100
+      maxIdleConnsAuto: true
+      connMaxLifetime: 14400
+      postgresVersion: 1000
+      timescaledb: false
--- a/benchmark/init_db.sql
+++ b/benchmark/init_db.sql
@ -3,7 +3,7 @@ CREATE TABLE IF NOT EXISTS benchmarks (
  branch VARCHAR(255),
  commit_id VARCHAR(72),
  commit_message VARCHAR(70),
-  gpu_name VARCHAR(255),
+  metadata jsonb,
  created_at timestamp without time zone NOT NULL DEFAULT (current_timestamp AT TIME ZONE 'UTC')
 );

--- a/benchmark/llama.py
+++ b/benchmark/llama.py
@ -1,71 +1,25 @@
-import argparse
-import json
-import logging
+from logging import Logger
 import os
-import sys
-from statistics import mean
 from threading import Event, Thread
 from time import perf_counter, sleep
 from typing import Optional
+from benchmarks_entrypoint import MetricsRecorder
 import gpustat
 import psutil
 import psycopg2
 import torch

 from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, StaticCache
-from psycopg2.extras import Json
-from psycopg2.extensions import register_adapter


 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

-logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)
-
-handler = logging.StreamHandler(sys.stdout)
-handler.setLevel(logging.INFO)
-formatter = logging.Formatter("[%(levelname)s - %(asctime)s] %(message)s")
-handler.setFormatter(formatter)
-logger.addHandler(handler)
-
 os.environ["TOKENIZERS_PARALLELISM"] = "1"
 torch.set_float32_matmul_precision("high")
-register_adapter(dict, Json)


-def parse_arguments():
-    """
-    Parse command line arguments for the benchmarking CLI.
-    """
-    parser = argparse.ArgumentParser(description="CLI for benchmarking the huggingface/transformers.")
-
-    parser.add_argument(
-        "branch",
-        type=str,
-        help="The branch name on which the benchmarking is performed.",
-    )
-
-    parser.add_argument(
-        "commit_id",
-        type=str,
-        help="The commit hash on which the benchmarking is performed.",
-    )
-
-    parser.add_argument(
-        "commit_msg",
-        type=str,
-        help="The commit message associated with the commit, truncated to 70 characters.",
-    )
-
-    args = parser.parse_args()
-
-    return args.branch, args.commit_id, args.commit_msg
-
-
-def collect_metrics(benchmark_id, continue_metric_collection):
+def collect_metrics(benchmark_id, continue_metric_collection, metrics_recorder):
    p = psutil.Process(os.getpid())
-    conn = psycopg2.connect("dbname=metrics")
-    cur = conn.cursor()
    while not continue_metric_collection.is_set():
        with p.oneshot():
            cpu_util = p.cpu_percent()
@ -73,47 +27,41 @@ def collect_metrics(benchmark_id, continue_metric_collection):
        gpu_stats = gpustat.GPUStatCollection.new_query()
        gpu_util = gpu_stats[0]["utilization.gpu"]
        gpu_mem_megabytes = gpu_stats[0]["memory.used"]
-        cur.execute(
-            "INSERT INTO device_measurements (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes) VALUES (%s, %s, %s, %s, %s)",
-            (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes),
+        metrics_recorder.collect_device_measurements(
+            benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes
        )
        sleep(0.01)
-        conn.commit()
-    conn.close()


-def run_benchmark(branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100):
+def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100):
    continue_metric_collection = Event()
    metrics_thread = None
+    model_id = "meta-llama/Llama-2-7b-hf"
+    metrics_recorder = MetricsRecorder(psycopg2.connect("dbname=metrics"), logger, branch, commit_id, commit_msg)
    try:
        gpu_stats = gpustat.GPUStatCollection.new_query()
        gpu_name = gpu_stats[0]["name"]
-        conn = psycopg2.connect("dbname=metrics")
-        cur = conn.cursor()
-        cur.execute(
-            "INSERT INTO benchmarks (branch, commit_id, commit_message, gpu_name) VALUES (%s, %s, %s, %s) RETURNING benchmark_id",
-            (branch, commit_id, commit_msg, gpu_name),
+        benchmark_id = metrics_recorder.initialise_benchmark({"gpu_name": gpu_name, "model_id": model_id})
+        logger.info(f"running benchmark #{benchmark_id} on {gpu_name} for {model_id}")
+        metrics_thread = Thread(
+            target=collect_metrics,
+            args=[benchmark_id, continue_metric_collection, metrics_recorder],
        )
-        conn.commit()
-        benchmark_id = cur.fetchone()[0]
-        logger.info(f"running benchmark #{benchmark_id} on {gpu_name}")
-        metrics_thread = Thread(target=collect_metrics, args=[benchmark_id, continue_metric_collection])
        metrics_thread.start()
        logger.info("started background thread to fetch device metrics")

        os.environ["TOKENIZERS_PARALLELISM"] = "false"  # silence warnings when compiling

        device = "cuda"
-        ckpt = "meta-llama/Llama-2-7b-hf"

        logger.info("downloading weights")
        # This is to avoid counting download in model load time measurement
-        model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16)
+        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)
        gen_config = GenerationConfig(do_sample=False, top_p=1, temperature=1)
        logger.info("loading model")
        start = perf_counter()
        model = AutoModelForCausalLM.from_pretrained(
-            ckpt, torch_dtype=torch.float16, generation_config=gen_config
+            model_id, torch_dtype=torch.float16, generation_config=gen_config
        ).eval()
        model.to(device)
        torch.cuda.synchronize()
@ -121,7 +69,7 @@ def run_benchmark(branch: str, commit_id: str, commit_msg: str, num_tokens_to_ge
        model_load_time = end - start
        logger.info(f"loaded model in: {model_load_time}s")

-        tokenizer = AutoTokenizer.from_pretrained(ckpt)
+        tokenizer = AutoTokenizer.from_pretrained(model_id)

        prompt = "Why dogs are so cute?"
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
@ -368,41 +316,27 @@ def run_benchmark(branch: str, commit_id: str, commit_msg: str, num_tokens_to_ge
            logger.info(f"completed second compile generation in: {fourth_compile_generate_time}s")
            logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")

-        cur.execute(
-            """
-            INSERT INTO model_measurements (
-                benchmark_id,
-                measurements
-            ) VALUES (%s, %s)
-            """,
-            (
-                benchmark_id,
-                {
-                    "model_load_time": model_load_time,
-                    "first_eager_forward_pass_time_secs": first_eager_fwd_pass_time,
-                    "second_eager_forward_pass_time_secs": second_eager_fwd_pass_time,
-                    "first_eager_generate_time_secs": first_eager_generate_time,
-                    "second_eager_generate_time_secs": second_eager_generate_time,
-                    "time_to_first_token_secs": time_to_first_token,
-                    "time_to_second_token_secs": time_to_second_token,
-                    "time_to_third_token_secs": time_to_third_token,
-                    "time_to_next_token_mean_secs": mean_time_to_next_token,
-                    "first_compile_generate_time_secs": first_compile_generate_time,
-                    "second_compile_generate_time_secs": second_compile_generate_time,
-                    "third_compile_generate_time_secs": third_compile_generate_time,
-                    "fourth_compile_generate_time_secs": fourth_compile_generate_time,
-                },
-            ),
+        metrics_recorder.collect_model_measurements(
+            benchmark_id,
+            {
+                "model_load_time": model_load_time,
+                "first_eager_forward_pass_time_secs": first_eager_fwd_pass_time,
+                "second_eager_forward_pass_time_secs": second_eager_fwd_pass_time,
+                "first_eager_generate_time_secs": first_eager_generate_time,
+                "second_eager_generate_time_secs": second_eager_generate_time,
+                "time_to_first_token_secs": time_to_first_token,
+                "time_to_second_token_secs": time_to_second_token,
+                "time_to_third_token_secs": time_to_third_token,
+                "time_to_next_token_mean_secs": mean_time_to_next_token,
+                "first_compile_generate_time_secs": first_compile_generate_time,
+                "second_compile_generate_time_secs": second_compile_generate_time,
+                "third_compile_generate_time_secs": third_compile_generate_time,
+                "fourth_compile_generate_time_secs": fourth_compile_generate_time,
+            },
        )
-        conn.commit()
-        conn.close()
    except Exception as e:
        logger.error(f"Caught exception: {e}")
    continue_metric_collection.set()
    if metrics_thread is not None:
        metrics_thread.join()
-
-
-if __name__ == "__main__":
-    branch, commit_id, commit_msg = parse_arguments()
-    run_benchmark(branch, commit_id, commit_msg, num_tokens_to_generate=20)
+    metrics_recorder.close()