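# Grafana Tempo configuration for a docker-compose observability stack: Tempo
# serves its query API over HTTP, ingests traces via Jaeger, Zipkin, OTLP, and
# OpenCensus receivers, and generates span metrics that are remote-written to
# Prometheus. `stream_over_http_enabled` below turns on streaming query
# responses over the HTTP port.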
stream_over_http_enabled: true
server:
  http_listen_port: 3200
  log_level: info
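
# Query-path cache: results are flushed asynchronously by 5 background
# writeback goroutines. The `frontend-search` role caches search results in
# memcached, resolved through DNS service discovery (the `dns+` prefix).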
cache:
  background:
    writeback_goroutines: 5
  caches:
    - roles:
        - frontend-search
      memcached:
        addresses: dns+memcached:11211
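
# Query-frontend SLOs: a request counts as within-SLO when it completes inside
# `duration_slo` or sustains `throughput_bytes_slo` (1.073741824e+09 bytes is
# exactly 1 GiB).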
query_frontend:
  search:
    duration_slo: 5s
    throughput_bytes_slo: 1.073741824e+09
    metadata_slo:
      duration_slo: 5s
      throughput_bytes_slo: 1.073741824e+09
  trace_by_id:
    duration_slo: 100ms
  metrics:
    max_duration: 200h  # maximum duration of a metrics query, increase for local setups
    query_backend_after: 5m
    duration_slo: 5s
    throughput_bytes_slo: 1.073741824e+09
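
# Trace ingestion. Each receiver below listens on its protocol's conventional
# port (Jaeger thrift/gRPC, Zipkin 9411, OTLP 4317/4318, OpenCensus 55678);
# the `tempo:` host is the service name on the compose network.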
distributor:
  receivers:  # this configuration will listen on all ports and protocols that Tempo is capable of.
    jaeger:   # the receivers all come from the OpenTelemetry Collector; more configuration information can
      protocols:  # be found at: https://github.com/open-telemetry/opentelemetry-collector/tree/main/receiver
        thrift_http:
          endpoint: "tempo:14268"  # for a production deployment you should only enable the receivers you need!
        grpc:
          endpoint: "tempo:14250"
        thrift_binary:
          endpoint: "tempo:6832"
        thrift_compact:
          endpoint: "tempo:6831"
    zipkin:
      endpoint: "tempo:9411"
    otlp:
      protocols:
        grpc:
          endpoint: "tempo:4317"
        http:
          endpoint: "tempo:4318"
    opencensus:
      endpoint: "tempo:55678"
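
# Illustrative client setting (an assumption, not part of this file): an
# OpenTelemetry SDK exporting over OTLP/gRPC from a sibling compose service
# would target the receiver above, e.g.
#   OTEL_EXPORTER_OTLP_ENDPOINT=http://tempo:4317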
ingester:
  max_block_duration: 5m  # cut the headblock when this much time passes. this is being set for demo purposes and should probably be left alone normally
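
# Retention: 720h of block retention below is 30 days of traces.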
compactor:
  compaction:
    block_retention: 720h  # overall Tempo trace retention. set for demo purposes
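
# Derives metrics from ingested spans. The processors themselves are enabled
# per-tenant under `overrides` at the bottom of this file; the resulting series
# are remote-written to Prometheus with exemplars so metrics can link back to
# the originating traces.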
metrics_generator:
  registry:
    external_labels:
      source: tempo
      cluster: docker-compose
  storage:
    path: /var/tempo/generator/wal
    remote_write:
      - url: http://prometheus:9090/api/v1/write
        send_exemplars: true
  traces_storage:
    path: /var/tempo/generator/traces
  processor:
    local_blocks:
      filter_server_spans: false
      flush_to_storage: true
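
# Trace block storage. The `local` backend keeps blocks on the node's
# filesystem, which suits a single-node demo; a production deployment would
# typically swap this stanza for an object-store backend such as s3 or gcs.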
storage:
  trace:
    backend: local  # backend configuration to use
    wal:
      path: /var/tempo/wal  # where to store the wal locally
    local:
      path: /var/tempo/blocks
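
# Per-tenant defaults. `generate_native_histograms: both` asks the generator to
# emit classic and native Prometheus histograms side by side (useful while
# migrating; native histograms need a Prometheus build with support enabled).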
overrides:
  defaults:
    metrics_generator:
      processors: [service-graphs, span-metrics, local-blocks]  # enables metrics generator
      generate_native_histograms: both