mirror of
https://github.com/huggingface/transformers.git
synced 2025-07-03 04:40:06 +06:00
[Benchmark] Reuse optimum-benchmark (#30615)

* benchmark

* update

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
parent
3b09d3f05f
commit
64e0573a81
7	Makefile
@@ -1,4 +1,4 @@
-.PHONY: deps_table_update modified_only_fixup extra_style_checks quality style fixup fix-copies test test-examples
+.PHONY: deps_table_update modified_only_fixup extra_style_checks quality style fixup fix-copies test test-examples benchmark

# make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!)
export PYTHONPATH = src
@@ -96,6 +96,11 @@ test:
test-examples:
	python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/

+# Run benchmark
+benchmark:
+	python3 benchmark/benchmark.py --config-dir benchmark/config --config-name generation --commit=diff backend.model=google/gemma-2b backend.cache_implementation=null,static backend.torch_compile=false,true --multirun
+
+
# Run tests for SageMaker DLC release
test-sagemaker: # install sagemaker dependencies in advance with pip install .[sagemaker]
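The new target wires the harness into the usual `make` workflow; a minimal usage sketch (flags taken from the Makefile target above and the script's help text — `--commit=diff` benchmarks both the current head and `main`):

```bash
# Run the default sweep defined by the Makefile target above
make benchmark

# Or invoke the harness directly: compare the current head against main on one model
python3 benchmark/benchmark.py --config-dir benchmark/config --config-name generation \
    --commit=diff backend.model=google/gemma-2b --multirun
```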
0	benchmark/__init__.py (new file)
310	benchmark/benchmark.py (new file)
@@ -0,0 +1,310 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Run benchmarks using the `optimum-benchmark` library with some customization in `transformers`.

Assume we are under the `transformers` root directory (make sure the commits are valid commits):
```bash
python benchmark/benchmark.py --config-dir benchmark/config --config-name generation --commit=9b9c7f03da625b13643e99205c691fe046461724 --metrics=decode.latency.mean,per_token.latency.mean,per_token.throughput.value backend.model=google/gemma-2b benchmark.input_shapes.sequence_length=5,7 benchmark.input_shapes.batch_size=1,2 --multirun
```
"""

import argparse
import glob
import json
import os.path
import re
import tempfile
from contextlib import contextmanager
from pathlib import Path

from git import Repo

from optimum_benchmark import Benchmark
from optimum_benchmark_wrapper import main


PATH_TO_REPO = Path(__file__).parent.parent.resolve()


@contextmanager
def checkout_commit(repo: Repo, commit_id: str):
    """
    Context manager that checks out a given commit when entered, but gets back to the reference it was at on exit.

    Args:
        repo (`git.Repo`): A git repository (for instance the Transformers repo).
        commit_id (`str`): The commit reference to checkout inside the context manager.
    """
    current_head = repo.head.commit if repo.head.is_detached else repo.head.ref

    try:
        repo.git.checkout(commit_id)
        yield

    finally:
        repo.git.checkout(current_head)


def summarize(run_dir, metrics, expand_metrics=False):
    """Produce a summary for each optimum-benchmark launched job's output directory found in `run_dir`.

    Each summary's format is as follows (for `expand_metrics=False`):
    ```
    {
        "model": "google/gemma-2b",
        "commit": "3cd6ed22e4d49219f300f5055e71e3929aba20d7",
        "config": "benchmark.input_shapes.batch_size=1,benchmark.input_shapes.sequence_length=5",
        "metrics": {
            "decode.latency.mean": 1.624666809082031,
            "per_token.latency.mean": 0.012843788806628804,
            "per_token.throughput.value": 77.85864553330948
        }
    }
    ```
    """
    reports = glob.glob(os.path.join(run_dir, "**/benchmark_report.json"), recursive=True)
    report_dirs = [str(Path(report).parent) for report in reports]

    summaries = []
    for report_dir in report_dirs:
        commit = re.search(r"/commit=([^/]+)", report_dir).groups()[0]

        if not os.path.isfile(os.path.join(report_dir, "benchmark.json")):
            continue
        benchmark = Benchmark.from_json(os.path.join(report_dir, "benchmark.json"))
        report = benchmark.report

        model = benchmark.config.backend["model"]

        # This looks like `benchmark.input_shapes.batch_size=1,benchmark.input_shapes.sequence_length=5`.
        # (We rely on the usage of hydra's `${hydra.job.override_dirname}`.)
        benchmark_name = re.sub(f"backend.model={model},*", "", report_dir)
        benchmark_name = str(Path(benchmark_name).parts[-1])
        if benchmark_name.startswith("commit="):
            benchmark_name = benchmark.config.name

        metrics_values = {}
        # post-processing of report: show a few selected/important metrics
        for metric in metrics:
            keys = metric.split(".")
            value = report
            current = metrics_values
            for key in keys:
                # Avoid a KeyError when a user-specified metric has a typo.
                # TODO: Give warnings.
                if key not in value:
                    continue
                value = value[key]

                if expand_metrics:
                    if isinstance(value, dict):
                        if key not in current:
                            current[key] = {}
                        current = current[key]
                    else:
                        current[key] = value

            if not expand_metrics:
                metrics_values[metric] = value

        # show some config information
        print(f"model: {model}")
        print(f"commit: {commit}")
        print(f"config: {benchmark_name}")
        if len(metrics_values) > 0:
            print("metrics:")
            if expand_metrics:
                print(metrics_values)
            else:
                for metric, value in metrics_values.items():
                    print(f"  - {metric}: {value}")
        print("-" * 80)

        summary = {
            "model": model,
            "commit": commit,
            "config": benchmark_name,
            "metrics": metrics_values,
        }
        summaries.append(summary)

        with open(os.path.join(report_dir, "summary.json"), "w") as fp:
            json.dump(summary, fp, indent=4)

    # TODO: upload to Hub
    return summaries


def combine_summaries(summaries):
    """Combine a list of summaries obtained from the function `summarize`.

    The combined summary's format is as follows:
    ```
    "google/gemma-2b": {
        "benchmark.input_shapes.batch_size=1,benchmark.input_shapes.sequence_length=5": {
            "3cd6ed22e4d49219f300f5055e71e3929aba20d7": {
                "metrics": {"decode.latency.mean": 1.624666809082031}
            },
            "c97ee28b117c0abe8e08891f402065e4df6d72aa": {
                "metrics": {"decode.latency.mean": 1.6278163452148438}
            }
        },
        "benchmark.input_shapes.batch_size=2,benchmark.input_shapes.sequence_length=5": {
            "3cd6ed22e4d49219f300f5055e71e3929aba20d7": {
                "metrics": {"decode.latency.mean": 1.6947791748046876}
            },
            "c97ee28b117c0abe8e08891f402065e4df6d72aa": {
                "metrics": {"decode.latency.mean": 1.6980519409179688}
            }
        }
    }
    ```
    """
    combined = {}
    for summary in summaries:
        model = summary["model"]
        config = summary["config"]
        commit = summary["commit"]

        if model not in combined:
            combined[model] = {}

        if config not in combined[model]:
            combined[model][config] = {}

        if commit not in combined[model][config]:
            combined[model][config][commit] = {"metrics": summary["metrics"]}

    # NOTE: `exp_run_dir` is the experiment run directory defined in the `__main__` block below.
    with open(os.path.join(exp_run_dir, "summary.json"), "w") as fp:
        json.dump(combined, fp, indent=4)

    # TODO: upload to Hub
    print(json.dumps(combined, indent=4))

    return combined


if __name__ == "__main__":

    def list_str(values):
        return values.split(",")

    parser = argparse.ArgumentParser()

    parser.add_argument("--config-dir", type=str, required=True, help="The path to the config directory.")
    parser.add_argument("--config-name", type=str, required=True, help="The config name.")

    # arguments specific to this wrapper for our own customization
    parser.add_argument("--ensure_empty", type=bool, default=True, help="Whether to create a temporary directory.")
    parser.add_argument(
        "--commit",
        type=list_str,
        default="",
        help="Comma-separated list of branch names and/or commit sha values on which the benchmark will run. If `diff` is specified, it will run on both the current head and the `main` branch.",
    )
    parser.add_argument("--metrics", type=str, help="The metrics to be included in the summary.")
    args, optimum_benchmark_args = parser.parse_known_args()

    repo = Repo(PATH_TO_REPO)

    metrics = [
        "prefill.latency.mean",
        "prefill.throughput.value",
        "decode.latency.mean",
        "decode.throughput.value",
        "per_token.latency.mean",
        "per_token.throughput.value",
    ]
    if args.metrics is not None:
        metrics = args.metrics.split(",")

    # Get `backend.model` in a hacky way: we want to control the experiment flow manually.
    models = [""]
    for idx, arg in enumerate(optimum_benchmark_args):
        if arg.startswith("backend.model="):
            models = arg[len("backend.model=") :]
            models = models.split(",")
            break
    optimum_benchmark_args = [arg for arg in optimum_benchmark_args if not arg.startswith("backend.model=")]

    # Get the commit(s)
    current_head = str(repo.head.commit) if repo.head.is_detached else str(repo.head.ref)
    commits = [x for x in args.commit if x != ""]
    if len(commits) == 0:
        commits = [current_head]
    elif len(commits) == 1 and commits[0] == "diff":
        # compare to `main`
        commits = ["main", current_head]

    # Get the specified run directory
    run_dir_arg_idx, run_dir = -1, None
    sweep_dir_arg_idx, sweep_dir = -1, None
    for idx, arg in enumerate(optimum_benchmark_args):
        if arg.startswith("hydra.run.dir="):
            run_dir = arg[len("hydra.run.dir=") :]
            run_dir_arg_idx = idx
        elif arg.startswith("hydra.sweep.dir="):
            sweep_dir = arg[len("hydra.sweep.dir=") :]
            sweep_dir_arg_idx = idx
    exp_run_dir, arg_idx, arg_name = (
        (sweep_dir, sweep_dir_arg_idx, "hydra.sweep.dir")
        if "--multirun" in optimum_benchmark_args
        else (run_dir, run_dir_arg_idx, "hydra.run.dir")
    )

    # TODO: not hardcoded
    if exp_run_dir is None and args.ensure_empty:
        exp_run_dir = "_benchmark"

    if args.ensure_empty:
        os.makedirs(exp_run_dir, exist_ok=True)
        exp_run_dir = tempfile.mkdtemp(dir=exp_run_dir)

    run_summaries = []
    for commit in commits:
        with checkout_commit(repo, commit):
            commit = str(repo.head.commit)

            commit_run_dir = exp_run_dir
            if exp_run_dir is not None:
                commit_run_dir = os.path.join(exp_run_dir, rf"commit\={commit}")

            print(f"Run benchmark on commit: {commit}")

            for model in models:
                model_arg = [f"backend.model={model}"] if model != "" else []
                dir_args = []
                if commit_run_dir is not None:
                    if arg_idx > -1:
                        optimum_benchmark_args[arg_idx] = f"{arg_name}={commit_run_dir}"
                    else:
                        dir_args = [
                            f"hydra.sweep.dir={commit_run_dir}",
                            f"hydra.run.dir={commit_run_dir}/" + "${hydra.job.override_dirname}",
                        ]
                main(args.config_dir, args.config_name, model_arg + dir_args + optimum_benchmark_args)

            if commit_run_dir is not None:
                # Need to remove the `\` character
                summaries = summarize(commit_run_dir.replace("\\", ""), metrics)
                run_summaries.extend(summaries)

    # aggregate the information across the commits
    if exp_run_dir is not None:
        with open(os.path.join(exp_run_dir, "summaries.json"), "w") as fp:
            json.dump(run_summaries, fp, indent=4)

        combined_summary = combine_summaries(run_summaries)
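Given the `summary.json` / `summaries.json` layouts documented above, downstream comparison can stay very small; a hedged sketch (the temp-directory name is hypothetical — `benchmark.py` creates it under `_benchmark/` by default — while the file name and keys come from the script above):

```python
import json

# Hypothetical path: benchmark.py writes summaries.json into the experiment
# run directory it creates (a tempdir under `_benchmark/` by default).
with open("_benchmark/tmpxyz123/summaries.json") as fp:
    summaries = json.load(fp)

# One line per (model, config, commit): enough to eyeball a regression between
# the two commits produced by `--commit=diff`.
for s in summaries:
    latency = s["metrics"].get("decode.latency.mean")
    print(f"{s['model']} | {s['config']} | {s['commit'][:8]} | decode.latency.mean={latency}")
```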
57	benchmark/config/generation.yaml (new file)
@@ -0,0 +1,57 @@
defaults:
  - benchmark  # inheriting benchmark schema
  - scenario: inference
  - launcher: process
  - backend: pytorch
  - _self_  # for hydra 1.1 compatibility

name: pytorch_generate

launcher:
  start_method: spawn
  device_isolation: true
  device_isolation_action: warn

backend:
  device: cuda
  device_ids: 0
  no_weights: true
  model: meta-llama/Llama-2-7b-hf
  cache_implementation: static
  torch_compile: true
  torch_dtype: float16
  torch_compile_config:
    backend: inductor
    mode: reduce-overhead
    fullgraph: true

scenario:
  input_shapes:
    batch_size: 1
    sequence_length: 7
  generate_kwargs:
    max_new_tokens: 128
    min_new_tokens: 128
    do_sample: false
  memory: true
  latency: true
  iterations: 2
  duration: 0


# hydra/cli specific settings
hydra:
  run:
    # where to store run results
    dir: runs/${name}
  job:
    # change working directory to the run directory
    chdir: true
    env_set:
      # set environment variable OVERRIDE_BENCHMARKS to 1
      # to not skip benchmarks that have been run before
      OVERRIDE_BENCHMARKS: 1
      LOG_LEVEL: WARN
  sweep:
    dir: multirun
    subdir: ${hydra.job.override_dirname}
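Any value in this config can be overridden from the command line via hydra's dotted-key syntax, which is how the Makefile target sweeps over `cache_implementation` and `torch_compile`; a small sketch reusing the input-shape overrides from the script's own docstring:

```bash
# Sweep batch size and sequence length while keeping the rest of generation.yaml;
# --multirun makes hydra launch one job per combination of the comma-separated values.
python3 benchmark/benchmark.py --config-dir benchmark/config --config-name generation \
    backend.model=google/gemma-2b \
    benchmark.input_shapes.batch_size=1,2 benchmark.input_shapes.sequence_length=5,7 \
    --multirun
```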
16	benchmark/optimum_benchmark_wrapper.py (new file)
@@ -0,0 +1,16 @@
import argparse
import subprocess


def main(config_dir, config_name, args):
    subprocess.run(["optimum-benchmark", "--config-dir", f"{config_dir}", "--config-name", f"{config_name}"] + ["hydra/job_logging=disabled", "hydra/hydra_logging=disabled"] + args)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument("--config-dir", type=str, required=True, help="The path to the config directory.")
    parser.add_argument("--config-name", type=str, required=True, help="The config name.")
    args, unknown = parser.parse_known_args()

    main(args.config_dir, args.config_name, unknown)
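The wrapper simply shells out to the `optimum-benchmark` CLI with hydra's own logging silenced, forwarding any unknown arguments verbatim as hydra overrides, so it can also be driven standalone; a minimal sketch:

```bash
# Run optimum-benchmark once with generation.yaml, overriding only the model;
# hydra/job_logging and hydra/hydra_logging are disabled by the wrapper itself.
python3 benchmark/optimum_benchmark_wrapper.py --config-dir benchmark/config \
    --config-name generation backend.model=google/gemma-2b
```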
3	setup.py
@@ -136,6 +136,7 @@ _deps = [
    "onnxruntime-tools>=1.4.2",
    "onnxruntime>=1.4.0",
    "opencv-python",
+    "optimum-benchmark>=0.2.0",
    "optuna",
    "optax>=0.0.8,<=0.1.4",
    "packaging>=20.0",
@@ -410,6 +411,8 @@ extras["agents"] = deps_list(
    "diffusers", "accelerate", "datasets", "torch", "sentencepiece", "opencv-python", "Pillow"
)

+extras["benchmark"] = deps_list("optimum-benchmark")
+
# when modifying the following list, make sure to update src/transformers/dependency_versions_check.py
install_requires = [
    deps["filelock"],  # filesystem locks, e.g., to prevent parallel downloads
src/transformers/dependency_versions_table.py
@@ -42,6 +42,7 @@ deps = {
    "onnxruntime-tools": "onnxruntime-tools>=1.4.2",
    "onnxruntime": "onnxruntime>=1.4.0",
    "opencv-python": "opencv-python",
+    "optimum-benchmark": "optimum-benchmark>=0.2.0",
    "optuna": "optuna",
    "optax": "optax>=0.0.8,<=0.1.4",
    "packaging": "packaging>=20.0",
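With the new `benchmark` extra registered in `setup.py`, the harness dependencies install the same way as the existing extras (cf. the `pip install .[sagemaker]` hint in the Makefile); a minimal sketch:

```bash
# From the transformers checkout root: editable install plus optimum-benchmark>=0.2.0
pip install -e ".[benchmark]"
```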