From 144852fb6bbe584e9ff7d13511180aec42e1b366 Mon Sep 17 00:00:00 2001 From: Luc Georges Date: Fri, 11 Oct 2024 18:03:29 +0200 Subject: [PATCH] refactor: benchmarks (#33896) * refactor: benchmarks Based on a discussion with @LysandreJik & @ArthurZucker, the goal of this PR is to improve transformers' benchmark system. This is a WIP, for the moment the infrastructure required to make things work is not ready. Will update the PR description when it is the case. * feat: add db init in benchmarks CI * fix: pg_config is missing in runner * fix: add psql to the runner * fix: connect info from env vars + PR comments * refactor: set database as env var * fix: invalid working directory * fix: `commit_msg` -> `commit_message` * fix: git marking checked out repo as unsafe * feat: add logging * fix: invalid device * feat: update grafana dashboard for prod grafana * feat: add `commit_id` to header table * feat: commit latest version of dashboard * feat: move measurements into json field * feat: remove drop table migration queries * fix: `torch.arrange` -> `torch.arange` * fix: add missing `s` to `cache_position` positional argument * fix: change model * revert: `cache_positions` -> `cache_position` * fix: set device for `StaticCache` * fix: set `StaticCache` dtype * feat: limit max cache len * fix script * raise error on failure! * not try catch * try to skip generate compilation * update * update docker image! * update * update again!@ * update * updates * ??? * ?? * use `torch.cuda.synchronize()` * fix json * nits * fix * fixed! 
* f**k * feat: add TTNT panels * feat: add try except --------- Co-authored-by: Arthur Zucker --- .github/workflows/benchmark.yml | 73 +- benchmark/grafana_dashboard.json | 2211 ++++++++++++++++++++++++++++++ benchmark/init_db.sql | 26 + benchmark/llama.py | 404 ++++++ benchmark/requirements.txt | 5 + 5 files changed, 2697 insertions(+), 22 deletions(-) create mode 100644 benchmark/grafana_dashboard.json create mode 100644 benchmark/init_db.sql create mode 100644 benchmark/llama.py create mode 100644 benchmark/requirements.txt diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 75a837d693e..c264dfe462a 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -1,43 +1,72 @@ name: Self-hosted runner (benchmark) on: - schedule: - - cron: "17 2 * * *" - workflow_call: + push: + branches: [main] + pull_request: + types: [ opened, labeled, reopened, synchronize ] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true env: HF_HOME: /mnt/cache - TF_FORCE_GPU_ALLOW_GROWTH: true - jobs: benchmark: name: Benchmark - runs-on: + runs-on: group: aws-g5-4xlarge-cache container: - image: huggingface/transformers-all-latest-gpu - options: --gpus all --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + image: huggingface/transformers-pytorch-gpu + options: --gpus all --privileged --ipc host steps: - - name: Update clone - working-directory: /transformers + - name: Get repo + if: github.event_name == 'pull_request' + uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + + - name: Get repo + if: github.event_name == 'push' + uses: actions/checkout@v4 + with: + ref: ${{ github.sha }} + + - name: Install libpq-dev & psql run: | - git fetch && git checkout ${{ github.sha }} + apt update + apt install -y libpq-dev postgresql-client + + - name: Install benchmark script dependencies + run: python3 -m pip install -r 
benchmark/requirements.txt - name: Reinstall transformers in edit mode (remove the one installed during docker image build) working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e ".[torch]" - - name: Benchmark (daily) - if: github.event_name == 'schedule' - working-directory: /transformers + - name: Run database init script run: | - python3 -m pip install optimum-benchmark>=0.3.0 - HF_TOKEN=${{ secrets.TRANSFORMERS_BENCHMARK_TOKEN }} python3 benchmark/benchmark.py --repo_id hf-internal-testing/benchmark_results --path_in_repo $(date +'%Y-%m-%d') --config-dir benchmark/config --config-name generation --commit=${{ github.sha }} backend.model=google/gemma-2b backend.cache_implementation=null,static backend.torch_compile=false,true --multirun + psql -f benchmark/init_db.sql + env: + PGDATABASE: metrics + PGHOST: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGHOST }} + PGUSER: transformers_benchmarks + PGPASSWORD: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGPASSWORD }} - - name: Benchmark (merged to main event) - if: github.event_name == 'push' && github.ref_name == 'main' - working-directory: /transformers + - name: Run benchmark run: | - python3 -m pip install optimum-benchmark>=0.3.0 - HF_TOKEN=${{ secrets.TRANSFORMERS_BENCHMARK_TOKEN }} python3 benchmark/benchmark.py --repo_id hf-internal-testing/benchmark_results_merge_event --path_in_repo $(date +'%Y-%m-%d') --config-dir benchmark/config --config-name generation --commit=${{ github.sha }} backend.model=google/gemma-2b backend.cache_implementation=null,static backend.torch_compile=false,true --multirun + git config --global --add safe.directory /__w/transformers/transformers + if [ "$GITHUB_EVENT_NAME" = "pull_request" ]; then + commit_id=$(echo "${{ github.event.pull_request.head.sha }}") + elif [ "$GITHUB_EVENT_NAME" = "push" ]; then + commit_id=$GITHUB_SHA + fi + commit_msg=$(git show -s 
--format=%s | cut -c1-70) + python3 benchmark/llama.py "${{ github.head_ref || github.ref_name }}" "$commit_id" "$commit_msg" + env: + HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} + PGHOST: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGHOST }} + PGUSER: transformers_benchmarks + PGPASSWORD: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGPASSWORD }} diff --git a/benchmark/grafana_dashboard.json b/benchmark/grafana_dashboard.json new file mode 100644 index 00000000000..be471a6314e --- /dev/null +++ b/benchmark/grafana_dashboard.json @@ -0,0 +1,2211 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 1, + "links": [ + { + "asDropdown": false, + "icon": "external link", + "includeVars": false, + "keepTime": false, + "tags": [], + "targetBlank": false, + "title": "Go to data", + "tooltip": "Go to data", + "type": "link", + "url": "http://transformers-benchmarks.huggingface.co/d/fdz33iyzln9c0a/transformers-benchmarks?orgId=1&from=${StartTime}&to=${EndTime}" + } + ], + "liveNow": true, + "panels": [ + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "left", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "gpu_name" + }, + "properties": [ + { + "id": "custom.width", + "value": 364 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "left" + }, + "properties": [ + { + "id": 
"custom.width", + "value": 407 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "commit_message" + }, + "properties": [ + { + "id": "custom.width", + "value": 708 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "commit_id" + }, + "properties": [ + { + "id": "custom.width", + "value": 388 + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 5, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "11.2.2", + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT commit_id as commit_id, commit_message, gpu_name FROM benchmarks WHERE branch = ${branch};", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [ + { + "name": "commit_id", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_name", + "type": "functionParameter" + } + ], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50, + "whereJsonTree": { + "children1": [ + { + "id": "baaa8aaa-89ab-4cde-b012-31922f96de3f", + "properties": { + "field": "commit_id", + "fieldSrc": "field", + "operator": "equal", + "value": [ + "${commit}" + ], + "valueError": [ + null + ], + "valueSrc": [ + "value" + ], + "valueType": [ + "text" + ] + }, + "type": "rule" + } + ], + "id": "bab88a98-0123-4456-b89a-b1922f7d4f11", + "type": "group" + }, + "whereString": "commit_id = '${commit}'" + }, + "table": "benchmarks" + } + ], + "transparent": true, + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 6 + }, + "id": 13, + "panels": [], + "title": "Eager Forward Pass", + "type": "row" + }, + { + "datasource": { + "default": true, + 
"type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlBl" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 0, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 7 + }, + "id": 7, + "options": { + "barRadius": 0.05, + "barWidth": 0.3, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "orientation": "auto", + "showValue": "auto", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "pluginVersion": "11.2.2", + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "bdz2yss7sxo1sc" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT CAST(m.measurements->'first_eager_forward_pass_time_secs' AS double precision) AS first_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = ${branch} AND gpu_name = '${gpu_name}'", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "First eager forward pass", + "transparent": true, + "type": "barchart" + }, + { + "datasource": 
{ + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlBl" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 0, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 7 + }, + "id": 9, + "options": { + "barRadius": 0.05, + "barWidth": 0.8, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "orientation": "auto", + "showValue": "auto", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "bdz2yss7sxo1sc" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT CAST(m.measurements->'second_eager_forward_pass_time_secs' AS double precision) AS second_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = ${branch} AND gpu_name = '${gpu_name}'", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Second eager forward pass", + "transparent": true, + 
"type": "barchart" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 18 + }, + "id": 16, + "panels": [], + "title": "Time to next token", + "type": "row" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlBl" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 0, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 19 + }, + "id": 17, + "options": { + "barRadius": 0.05, + "barWidth": 0.8, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "orientation": "auto", + "showValue": "always", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "bdz2yss7sxo1sc" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT CAST(m.measurements->'time_to_first_token_secs' AS double precision) AS time_to_first_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = ${branch} AND gpu_name = '${gpu_name}'", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": 
{ + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Time to first token", + "transparent": true, + "type": "barchart" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlBl" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 0, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 19 + }, + "id": 18, + "options": { + "barRadius": 0.05, + "barWidth": 0.8, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "orientation": "auto", + "showValue": "always", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "bdz2yss7sxo1sc" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT CAST(m.measurements->'time_to_second_token_secs' AS double precision) AS time_to_second_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = ${branch} AND gpu_name = '${gpu_name}'", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + 
}, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Time to second token", + "transparent": true, + "type": "barchart" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlBl" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 0, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 30 + }, + "id": 19, + "options": { + "barRadius": 0.05, + "barWidth": 0.8, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "orientation": "auto", + "showValue": "always", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "bdz2yss7sxo1sc" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT CAST(m.measurements->'time_to_third_token_secs' AS double precision) AS time_to_third_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = ${branch} AND gpu_name = '${gpu_name}'", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } 
+ ], + "limit": 50 + } + } + ], + "title": "Time to third token", + "transparent": true, + "type": "barchart" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlBl" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 0, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 30 + }, + "id": 20, + "options": { + "barRadius": 0.05, + "barWidth": 0.8, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "orientation": "auto", + "showValue": "always", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "bdz2yss7sxo1sc" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT CAST(m.measurements->'time_to_next_token_mean_secs' AS double precision) AS time_to_next_token_mean_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = ${branch} AND gpu_name = '${gpu_name}'", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 
+ } + } + ], + "title": "Time to subsequent next tokens mean", + "transparent": true, + "type": "barchart" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 41 + }, + "id": 14, + "panels": [], + "title": "Compiled Generate", + "type": "row" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlBl" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 0, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 42 + }, + "id": 8, + "options": { + "barRadius": 0.05, + "barWidth": 0.8, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "orientation": "auto", + "showValue": "always", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "bdz2yss7sxo1sc" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT CAST(m.measurements->'first_compile_generate_time_secs' AS double precision) AS first_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = ${branch} AND gpu_name = '${gpu_name}'", + "refId": "A", + "sql": { 
+ "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "First compile generate", + "transparent": true, + "type": "barchart" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlBl" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 0, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 42 + }, + "id": 10, + "options": { + "barRadius": 0.05, + "barWidth": 0.8, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "orientation": "auto", + "showValue": "auto", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "bdz2yss7sxo1sc" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT CAST(m.measurements->'second_compile_generate_time_secs' AS double precision) AS second_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = ${branch} AND gpu_name = '${gpu_name}';", + "refId": "A", + "sql": { + 
"columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Second compile generate", + "transparent": true, + "type": "barchart" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlBl" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 0, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 53 + }, + "id": 11, + "options": { + "barRadius": 0.05, + "barWidth": 0.8, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "orientation": "auto", + "showValue": "auto", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "bdz2yss7sxo1sc" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT CAST(m.measurements->'third_compile_generate_time_secs' AS double precision) AS third_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = ${branch} AND gpu_name = '${gpu_name}';", + "refId": "A", + "sql": { + 
"columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Third compile generate", + "transparent": true, + "type": "barchart" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlBl" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 0, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 53 + }, + "id": 12, + "options": { + "barRadius": 0.05, + "barWidth": 0.8, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "orientation": "auto", + "showValue": "auto", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "bdz2yss7sxo1sc" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT CAST(m.measurements->'fourth_compile_generate_time_secs' AS double precision) AS fourth_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = ${branch} AND gpu_name = '${gpu_name}';", + "refId": "A", + "sql": { + 
"columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Fourth compile generate", + "transparent": true, + "type": "barchart" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 64 + }, + "id": 15, + "panels": [], + "title": "Usage metrics", + "type": "row" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": 60000, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 65 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "bdz2yss7sxo1sc" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT\n d.cpu_util,\n d.time\nFROM\n benchmarks AS b\n JOIN device_measurements AS d ON 
b.benchmark_id = d.benchmark_id\nWHERE\n branch = ${branch}", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [ + { + "name": "cpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "\"time\"", + "type": "functionParameter" + } + ], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50, + "whereJsonTree": { + "children1": [ + { + "id": "baa888b8-89ab-4cde-b012-31922f8671e9", + "properties": { + "field": "commit_id", + "fieldSrc": "field", + "operator": "equal", + "value": [ + "${commit}" + ], + "valueError": [ + null + ], + "valueSrc": [ + "value" + ], + "valueType": [ + "text" + ] + }, + "type": "rule" + } + ], + "id": "bab88a98-0123-4456-b89a-b1922f7d4f11", + "type": "group" + }, + "whereString": "commit_id = '${commit}'" + }, + "table": "measurements" + } + ], + "title": "CPU Utilization", + "transparent": true, + "type": "timeseries" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": 60000, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + 
"scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 65 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "bdz2yss7sxo1sc" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT\n b.commit_id,\n d.gpu_util,\n d.time\nFROM\n benchmarks AS b\n JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id\nWHERE\n branch = ${branch}", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [ + { + "name": "cpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "\"time\"", + "type": "functionParameter" + } + ], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50, + "whereJsonTree": { + "children1": [ + { + "id": "baa888b8-89ab-4cde-b012-31922f8671e9", + "properties": { + "field": "commit_id", + "fieldSrc": "field", + "operator": "equal", + "value": [ + "${commit}" + ], + "valueError": [ + null + ], + "valueSrc": 
[ + "value" + ], + "valueType": [ + "text" + ] + }, + "type": "rule" + } + ], + "id": "bab88a98-0123-4456-b89a-b1922f7d4f11", + "type": "group" + }, + "whereString": "commit_id = '${commit}'" + }, + "table": "measurements" + } + ], + "title": "GPU Utilization", + "transparent": true, + "type": "timeseries" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": 60000, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decmbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 74 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "bdz2yss7sxo1sc" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT d.mem_megabytes, d.time FROM benchmarks AS b JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id WHERE branch = ${branch}", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [ + 
{ + "name": "cpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "\"time\"", + "type": "functionParameter" + } + ], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50, + "whereJsonTree": { + "children1": [ + { + "id": "baa888b8-89ab-4cde-b012-31922f8671e9", + "properties": { + "field": "commit_id", + "fieldSrc": "field", + "operator": "equal", + "value": [ + "${commit}" + ], + "valueError": [ + null + ], + "valueSrc": [ + "value" + ], + "valueType": [ + "text" + ] + }, + "type": "rule" + } + ], + "id": "bab88a98-0123-4456-b89a-b1922f7d4f11", + "type": "group" + }, + "whereString": "commit_id = '${commit}'" + }, + "table": "measurements" + } + ], + "title": "Memory usage", + "transparent": true, + "type": "timeseries" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": 60000, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + 
"thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decmbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 74 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "bdz2yss7sxo1sc" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT\n d.gpu_mem_megabytes,\n d.time\nFROM\n benchmarks AS b\n JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id\nWHERE\n branch = ${branch}", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [ + { + "name": "cpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "\"time\"", + "type": "functionParameter" + } + ], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50, + "whereJsonTree": { + "children1": [ + { + "id": "baa888b8-89ab-4cde-b012-31922f8671e9", + "properties": { + "field": "commit_id", + "fieldSrc": "field", + "operator": "equal", + "value": [ + "${commit}" + ], + "valueError": [ + null + ], + "valueSrc": [ + "value" + ], + "valueType": [ + "text" + ] + }, + "type": "rule" + } + ], + "id": "bab88a98-0123-4456-b89a-b1922f7d4f11", + "type": "group" + }, + 
"whereString": "commit_id = '${commit}'" + }, + "table": "measurements" + } + ], + "title": "GPU memory usage", + "transparent": true, + "type": "timeseries" + } + ], + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": true, + "text": [ + "refactor/benchmarks" + ], + "value": [ + "refactor/benchmarks" + ] + }, + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, + "definition": "SELECT DISTINCT branch FROM benchmarks;", + "description": "", + "hide": 0, + "includeAll": false, + "label": "branch", + "multi": false, + "name": "branch", + "options": [], + "query": "SELECT DISTINCT branch FROM benchmarks;", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "1728570853117", + "value": "1728570853117" + }, + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, + "definition": "SELECT created_at - INTERVAL '5 secs' FROM benchmarks WHERE branch = ${branch} ORDER BY benchmark_id ASC LIMIT 1;", + "description": "", + "hide": 2, + "includeAll": false, + "multi": false, + "name": "StartTime", + "options": [], + "query": "SELECT created_at - INTERVAL '5 secs' FROM benchmarks WHERE branch = ${branch} ORDER BY benchmark_id ASC LIMIT 1;", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "1728657828802", + "value": "1728657828802" + }, + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "de0dbhs18ho1sc" + }, + "definition": "SELECT time + INTERVAL '5 secs' FROM benchmarks AS b JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id WHERE branch = ${branch} ORDER BY b.benchmark_id DESC, d.measurement_id DESC LIMIT 1;", + "description": "", + "hide": 2, + "includeAll": false, + "multi": false, + "name": "EndTime", + "options": [], + "query": "SELECT time + 
INTERVAL '5 secs' FROM benchmarks AS b JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id WHERE branch = ${branch} ORDER BY b.benchmark_id DESC, d.measurement_id DESC LIMIT 1;", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "NVIDIA A10G", + "value": "NVIDIA A10G" + }, + "definition": "SELECT DISTINCT gpu_name FROM benchmarks;", + "hide": 0, + "includeAll": false, + "label": "GPU", + "multi": false, + "name": "gpu_name", + "options": [], + "query": "SELECT DISTINCT gpu_name FROM benchmarks;", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "2024-10-11T13:10:01.641Z", + "to": "2024-10-11T13:25:21.783Z" + }, + "timepicker": { + "hidden": false + }, + "timezone": "browser", + "title": "Transformers benchmarks", + "uid": "fdz33iyzln9c0a", + "version": 9, + "weekStart": "" +} diff --git a/benchmark/init_db.sql b/benchmark/init_db.sql new file mode 100644 index 00000000000..4381b99cea6 --- /dev/null +++ b/benchmark/init_db.sql @@ -0,0 +1,26 @@ +CREATE TABLE IF NOT EXISTS benchmarks ( + benchmark_id SERIAL PRIMARY KEY, + branch VARCHAR(255), + commit_id VARCHAR(72), + commit_message VARCHAR(70), + gpu_name VARCHAR(255), + created_at timestamp without time zone NOT NULL DEFAULT (current_timestamp AT TIME ZONE 'UTC') +); + +CREATE TABLE IF NOT EXISTS device_measurements ( + measurement_id SERIAL PRIMARY KEY, + benchmark_id int REFERENCES benchmarks (benchmark_id), + cpu_util double precision, + mem_megabytes double precision, + gpu_util double precision, + gpu_mem_megabytes double precision, + time timestamp without time zone NOT NULL DEFAULT (current_timestamp AT TIME ZONE 'UTC') +); + +CREATE TABLE IF NOT EXISTS model_measurements ( + measurement_id SERIAL PRIMARY KEY, + benchmark_id int REFERENCES benchmarks (benchmark_id), + measurements jsonb, + time timestamp without time zone NOT NULL 
DEFAULT (current_timestamp AT TIME ZONE 'UTC') +); + diff --git a/benchmark/llama.py b/benchmark/llama.py new file mode 100644 index 00000000000..a926f903486 --- /dev/null +++ b/benchmark/llama.py @@ -0,0 +1,404 @@ +import argparse +import json +import logging +import os +import sys +from statistics import mean +from threading import Event, Thread +from time import perf_counter, sleep +from typing import Optional +import gpustat +import psutil +import psycopg2 +import torch + +from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, StaticCache +from psycopg2.extras import Json +from psycopg2.extensions import register_adapter + + +os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +handler = logging.StreamHandler(sys.stdout) +handler.setLevel(logging.INFO) +formatter = logging.Formatter("[%(levelname)s - %(asctime)s] %(message)s") +handler.setFormatter(formatter) +logger.addHandler(handler) + +os.environ["TOKENIZERS_PARALLELISM"] = "1" +torch.set_float32_matmul_precision("high") +register_adapter(dict, Json) + + +def parse_arguments(): + """ + Parse command line arguments for the benchmarking CLI. 
+ """ + parser = argparse.ArgumentParser(description="CLI for benchmarking the huggingface/transformers.") + + parser.add_argument( + "branch", + type=str, + help="The branch name on which the benchmarking is performed.", + ) + + parser.add_argument( + "commit_id", + type=str, + help="The commit hash on which the benchmarking is performed.", + ) + + parser.add_argument( + "commit_msg", + type=str, + help="The commit message associated with the commit, truncated to 70 characters.", + ) + + args = parser.parse_args() + + return args.branch, args.commit_id, args.commit_msg + + +def collect_metrics(benchmark_id, continue_metric_collection): + p = psutil.Process(os.getpid()) + conn = psycopg2.connect("dbname=metrics") + cur = conn.cursor() + while not continue_metric_collection.is_set(): + with p.oneshot(): + cpu_util = p.cpu_percent() + mem_megabytes = p.memory_info().rss / (1024 * 1024) + gpu_stats = gpustat.GPUStatCollection.new_query() + gpu_util = gpu_stats[0]["utilization.gpu"] + gpu_mem_megabytes = gpu_stats[0]["memory.used"] + cur.execute( + "INSERT INTO device_measurements (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes) VALUES (%s, %s, %s, %s, %s)", + (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes), + ) + sleep(0.01) + conn.commit() + conn.close() + + +def run_benchmark(branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100): + continue_metric_collection = Event() + metrics_thread = None + try: + gpu_stats = gpustat.GPUStatCollection.new_query() + gpu_name = gpu_stats[0]["name"] + conn = psycopg2.connect("dbname=metrics") + cur = conn.cursor() + cur.execute( + "INSERT INTO benchmarks (branch, commit_id, commit_message, gpu_name) VALUES (%s, %s, %s, %s) RETURNING benchmark_id", + (branch, commit_id, commit_msg, gpu_name), + ) + conn.commit() + benchmark_id = cur.fetchone()[0] + metrics_thread = Thread(target=collect_metrics, args=[benchmark_id, continue_metric_collection]) + metrics_thread.start() + + 
os.environ["TOKENIZERS_PARALLELISM"] = "false" # silence warnings when compiling + + device = "cuda" + ckpt = "meta-llama/Llama-2-7b-hf" + + # This is to avoid counting download in model load time measurement + model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16) + gen_config = GenerationConfig(do_sample=False, top_p=1, temperature=1) + start = perf_counter() + model = AutoModelForCausalLM.from_pretrained( + ckpt, torch_dtype=torch.float16, generation_config=gen_config + ).eval() + model.to(device) + torch.cuda.synchronize() + end = perf_counter() + model_load_time = end - start + logger.info(f"loaded model in: {model_load_time}s") + + tokenizer = AutoTokenizer.from_pretrained(ckpt) + + prompt = "Why dogs are so cute?" + inputs = tokenizer(prompt, return_tensors="pt").to(device) + + # Specify the max length (including both the prompt and the response) + # When calling `generate` with `cache_implementation="static" later, this is also used to create a `StaticCache` object + # with sequence length = `max_length`. 
The longer the more you will re-use it + seq_length = inputs["input_ids"].shape[1] + model.generation_config.max_length = seq_length + num_tokens_to_generate + batch_size = inputs["input_ids"].shape[0] + + # Copied from the gpt-fast repo + def multinomial_sample_one_no_sync(probs_sort): # Does multinomial sampling without a cuda synchronization + q = torch.empty_like(probs_sort).exponential_(1) + return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int) + + def logits_to_probs(logits, temperature: float = 1.0, top_k: Optional[int] = None): + logits = logits / max(temperature, 1e-5) + + if top_k is not None: + v, _ = torch.topk(logits, min(top_k, logits.size(-1))) + pivot = v.select(-1, -1).unsqueeze(-1) + logits = torch.where(logits < pivot, -float("Inf"), logits) + probs = torch.nn.functional.softmax(logits, dim=-1) + return probs + + def sample(logits, temperature: float = 1.0, top_k: Optional[int] = None): + probs = logits_to_probs(logits[:, -1], temperature, top_k) + idx_next = multinomial_sample_one_no_sync(probs) + return idx_next, probs + + def decode_one_token(model, cur_token, cache_position, past_key_values): + logits = model( + cur_token, + cache_position=cache_position, + past_key_values=past_key_values, + return_dict=False, + use_cache=True, + )[0] + new_token = sample(logits, temperature=0.6, top_k=5)[0] + return new_token + + ######### + # Eager # + ######### + with torch.no_grad(): + past_key_values = StaticCache( + model.config, + batch_size=batch_size, + device=device, + dtype=torch.float16, + max_cache_len=seq_length + num_tokens_to_generate, + ) + cache_position = torch.arange(seq_length, device=device) + start = perf_counter() + model( + **inputs, + cache_position=cache_position, + past_key_values=past_key_values, + return_dict=False, + use_cache=True, + ) + end = perf_counter() + first_eager_fwd_pass_time = end - start + logger.info(f"completed first eager fwd pass in: {first_eager_fwd_pass_time}s") + start = perf_counter() 
+ output = model.generate(**inputs, do_sample=False) + end = perf_counter() + first_eager_generate_time = end - start + logger.info(f"completed first eager generation in: {first_eager_generate_time}s") + logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}") + + past_key_values = StaticCache( + model.config, + batch_size=batch_size, + device=device, + dtype=torch.float16, + max_cache_len=seq_length + num_tokens_to_generate, + ) + cache_position = torch.arange(seq_length, device=device) + start = perf_counter() + model( + **inputs, + cache_position=cache_position, + past_key_values=past_key_values, + return_dict=False, + use_cache=True, + ) + end = perf_counter() + second_eager_fwd_pass_time = end - start + logger.info(f"completed second eager fwd pass in: {second_eager_fwd_pass_time}s") + start = perf_counter() + model.generate(**inputs, do_sample=False) + end = perf_counter() + second_eager_generate_time = end - start + logger.info(f"completed second eager generation in: {second_eager_generate_time}s") + logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}") + + torch.compiler.reset() + + ################ + # Forward pass # + ################ + + # `torch.compile(model, ...)` is not recommended as you compile callbacks + # and full generate. We recommend compiling only the forward for now. + # "reduce-overhead" will use cudagraphs. 
+ generated_ids = torch.zeros( + (batch_size, num_tokens_to_generate + seq_length), dtype=torch.int, device=device + ) + + generated_ids[:, :seq_length] = inputs["input_ids"] + decode_one_token = torch.compile(decode_one_token, mode="reduce-overhead", fullgraph=True) + # model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True) + # TODO use decode_one_token(model, input_id.clone(), cache_position) for verification + past_key_values = StaticCache( + model.config, + batch_size=batch_size, + device=device, + dtype=torch.float16, + max_cache_len=seq_length + num_tokens_to_generate + 10, + ) + cache_position = torch.arange(seq_length, device=device) + all_generated_tokens = [] + ### First compile, prefill + start = perf_counter() + next_token = decode_one_token( + model, inputs["input_ids"], cache_position=cache_position, past_key_values=past_key_values + ) + torch.cuda.synchronize() + end = perf_counter() + time_to_first_token = end - start + logger.info(f"completed first compile generation in: {time_to_first_token}s") + cache_position += 1 + all_generated_tokens += next_token.clone().detach().cpu().tolist() + + cache_position = torch.tensor([seq_length], device=device) + ### First compile, decoding + start = perf_counter() + next_token = decode_one_token( + model, next_token.clone(), cache_position=cache_position, past_key_values=past_key_values + ) + torch.cuda.synchronize() + end = perf_counter() + time_to_second_token = end - start + logger.info(f"completed second compile generation in: {time_to_second_token}s") + cache_position += 1 + all_generated_tokens += next_token.clone().detach().cpu().tolist() + + ### Second compile, decoding + start = perf_counter() + next_token = decode_one_token( + model, next_token.clone(), cache_position=cache_position, past_key_values=past_key_values + ) + torch.cuda.synchronize() + end = perf_counter() + time_to_third_token = end - start + logger.info(f"completed third compile forward in: 
{time_to_third_token}s") + cache_position += 1 + all_generated_tokens += next_token.clone().detach().cpu().tolist() + + ### Using cuda graphs decoding + + start = perf_counter() + for _ in range(1, num_tokens_to_generate): + all_generated_tokens += next_token.clone().detach().cpu().tolist() + next_token = decode_one_token( + model, next_token.clone(), cache_position=cache_position, past_key_values=past_key_values + ) + cache_position += 1 + torch.cuda.synchronize() + end = perf_counter() + mean_time_to_next_token = (end - start) / num_tokens_to_generate + logger.info(f"completed next compile generation in: {mean_time_to_next_token}s") + logger.info(f"generated: {tokenizer.batch_decode(all_generated_tokens)}") + + #################### + # Generate compile # + #################### + torch.compiler.reset() + # we will not compile full generate as it's too intensive, though we measure full forward! + + past_key_values = StaticCache( + model.config, + batch_size=batch_size, + device=device, + dtype=torch.float16, + max_cache_len=seq_length + 128, + ) + + # 1st call + start = perf_counter() + output = model.generate(**inputs, past_key_values=past_key_values) + torch.cuda.synchronize() + end = perf_counter() + first_compile_generate_time = end - start + logger.info(f"completed first compile generation in: {first_compile_generate_time}s") + logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}") + + past_key_values = StaticCache( + model.config, + batch_size=batch_size, + device=device, + dtype=torch.float16, + max_cache_len=seq_length + 128, + ) + # 2nd call + start = perf_counter() + output = model.generate(**inputs, past_key_values=past_key_values) + torch.cuda.synchronize() + end = perf_counter() + second_compile_generate_time = end - start + logger.info(f"completed second compile generation in: {second_compile_generate_time}s") + logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}") + + past_key_values = StaticCache( + 
model.config, + batch_size=batch_size, + device=device, + dtype=torch.float16, + max_cache_len=seq_length + 128, + ) + + # 3rd call + start = perf_counter() + output = model.generate(**inputs, past_key_values=past_key_values) + end = perf_counter() + third_compile_generate_time = end - start + logger.info(f"completed third compile generation in: {third_compile_generate_time}s") + logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}") + + past_key_values = StaticCache( + model.config, + batch_size=batch_size, + device=device, + dtype=torch.float16, + max_cache_len=seq_length + 128, + ) + # 4th call + start = perf_counter() + output = model.generate(**inputs, past_key_values=past_key_values) + end = perf_counter() + fourth_compile_generate_time = end - start + logger.info(f"completed fourth compile generation in: {fourth_compile_generate_time}s") + logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}") + + cur.execute( + """ + INSERT INTO model_measurements ( + benchmark_id, + measurements + ) VALUES (%s, %s) + """, + ( + benchmark_id, + { + "model_load_time": model_load_time, + "first_eager_forward_pass_time_secs": first_eager_fwd_pass_time, + "second_eager_forward_pass_time_secs": second_eager_fwd_pass_time, + "first_eager_generate_time_secs": first_eager_generate_time, + "second_eager_generate_time_secs": second_eager_generate_time, + "time_to_first_token_secs": time_to_first_token, + "time_to_second_token_secs": time_to_second_token, + "time_to_third_token_secs": time_to_third_token, + "time_to_next_token_mean_secs": mean_time_to_next_token, + "first_compile_generate_time_secs": first_compile_generate_time, + "second_compile_generate_time_secs": second_compile_generate_time, + "third_compile_generate_time_secs": third_compile_generate_time, + "fourth_compile_generate_time_secs": fourth_compile_generate_time, + }, + ), + ) + conn.commit() + conn.close() + except Exception as e: + logger.error(f"Caught exception: {e}") + 
continue_metric_collection.set() + if metrics_thread is not None: + metrics_thread.join() + + +if __name__ == "__main__": + branch, commit_id, commit_msg = parse_arguments() + run_benchmark(branch, commit_id, commit_msg, num_tokens_to_generate=20) diff --git a/benchmark/requirements.txt b/benchmark/requirements.txt new file mode 100644 index 00000000000..50e9dfaddfa --- /dev/null +++ b/benchmark/requirements.txt @@ -0,0 +1,5 @@ +gpustat==1.1.1 +psutil==6.0.0 +psycopg2==2.9.9 +torch>=2.4.0 +hf_transfer \ No newline at end of file