diff --git a/benchmark/README.md b/benchmark/README.md
index a827da444f0..3935f02b389 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -12,7 +12,7 @@ def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str,
 
 ## Writing metrics to the database
 
-`MetricRecorder` is thread-safe, in the sense of the python [`Thread`](https://docs.python.org/3/library/threading.html#threading.Thread). This means you can start a background thread to do the readings on the device measurements while not blocking the main thread to execute the model measurements.
+`MetricsRecorder` is thread-safe, in the sense of the python [`Thread`](https://docs.python.org/3/library/threading.html#threading.Thread). This means you can start a background thread to do the readings on the device measurements while not blocking the main thread to execute the model measurements.
 
 cf [`llama.py`](./llama.py) to see an example of this in practice.
 
diff --git a/benchmark/benchmarks_entrypoint.py b/benchmark/benchmarks_entrypoint.py
index df348b08523..6c036fdd693 100644
--- a/benchmark/benchmarks_entrypoint.py
+++ b/benchmark/benchmarks_entrypoint.py
@@ -3,7 +3,6 @@ import importlib.util
 import logging
 import os
 from typing import Dict
-import psycopg2
 import sys
 
 from psycopg2.extras import Json
diff --git a/benchmark/llama.py b/benchmark/llama.py
index bbe1afefd5e..c7ad049e6f6 100644
--- a/benchmark/llama.py
+++ b/benchmark/llama.py
@@ -215,7 +215,7 @@ def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str,
         torch.cuda.synchronize()
         end = perf_counter()
         time_to_second_token = end - start
-        logger.info(f"completed second compile generation in: {time_to_first_token}s")
+        logger.info(f"completed second compile generation in: {time_to_second_token}s")
         cache_position += 1
         all_generated_tokens += next_token.clone().detach().cpu().tolist()
 
@@ -227,7 +227,7 @@ def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str,
         torch.cuda.synchronize()
         end = perf_counter()
         time_to_third_token = end - start
-        logger.info(f"completed third compile forward in: {time_to_first_token}s")
+        logger.info(f"completed third compile forward in: {time_to_third_token}s")
         cache_position += 1
         all_generated_tokens += next_token.clone().detach().cpu().tolist()
 
@@ -298,7 +298,7 @@ def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str,
         output = model.generate(**inputs, past_key_values=past_key_values)
         end = perf_counter()
         third_compile_generate_time = end - start
-        logger.info(f"completed second compile generation in: {third_compile_generate_time}s")
+        logger.info(f"completed third compile generation in: {third_compile_generate_time}s")
         logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")
 
         past_key_values = StaticCache(
@@ -313,7 +313,7 @@ def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str,
         output = model.generate(**inputs, past_key_values=past_key_values)
         end = perf_counter()
         fourth_compile_generate_time = end - start
-        logger.info(f"completed second compile generation in: {fourth_compile_generate_time}s")
+        logger.info(f"completed fourth compile generation in: {fourth_compile_generate_time}s")
         logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")
 
         metrics_recorder.collect_model_measurements(
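
For context on the `MetricsRecorder` paragraph corrected in `benchmark/README.md` above, here is a minimal, self-contained sketch of the threading pattern it describes: device readings are sampled in a background `threading.Thread` while the main thread runs the model measurements. The `DummyRecorder` class and its method names are illustrative stand-ins (they only echo the `collect_model_measurements` call visible in the diff), not the actual `MetricsRecorder` API; see `benchmark/llama.py` for the real usage.

```python
import threading
import time


class DummyRecorder:
    """Illustrative stand-in for MetricsRecorder; a real recorder would write to the database."""

    def collect_device_measurements(self, benchmark_id: int) -> None:
        # In the real benchmark this would read device metrics (GPU/CPU utilisation, memory).
        print(f"[{benchmark_id}] device reading")

    def collect_model_measurements(self, benchmark_id: int, measurements: dict) -> None:
        print(f"[{benchmark_id}] model measurements: {measurements}")


recorder = DummyRecorder()
stop = threading.Event()


def poll_device(benchmark_id: int) -> None:
    # Background thread: keep sampling device metrics until the main thread signals completion.
    while not stop.is_set():
        recorder.collect_device_measurements(benchmark_id)
        time.sleep(0.01)


thread = threading.Thread(target=poll_device, args=(0,))
thread.start()

# Main thread: run the model measurements (simulated by a sleep here), then record them.
time.sleep(0.05)
recorder.collect_model_measurements(0, {"time_to_first_token": 0.123})

stop.set()
thread.join()
```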