[cursor] tmp commit
This commit is contained in:
parent 379dc36e2f
commit c89d956cef
@@ -66,7 +66,7 @@ vllm serve Qwen/Qwen2.5-1.5B-Instruct \
## Serve CLI

> [!WARNING]
-> This section is experimental and subject to changes in future versions
+> This section is experimental and subject to change in future versions

<!-- TODO: LLMs -> models, after we add audio/image input/output support -->
You can serve `transformers`-compatible LLMs with `transformers serve`. The server has a chat completion API compatible with the OpenAI SDK, so you can also quickly experiment with `transformers` models on existing applications. To launch a server, use the `transformers serve` CLI:
@@ -78,12 +78,28 @@ transformers serve
<!-- TODO: either fully align the two APIs, or link to the `transformers` version instead -->
This server takes an extended version of the [`ChatCompletionInput`](https://huggingface.co/docs/huggingface_hub/v0.33.1/en/package_reference/inference_types#huggingface_hub.ChatCompletionInput), accepting a serialized `GenerationConfig` in its `extra_body` field for full `generate` parameterization. The CLI will dynamically load a new model as needed, following the `model` field in the request.
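
For illustration, here is a minimal sketch of such a request from Python using the OpenAI client. The `generation_config` key inside `extra_body` and the JSON serialization are assumptions made for the sake of the example; check the server's request schema for the exact field it expects.

```py
# Hypothetical sketch: call `transformers serve` through the OpenAI client,
# passing a serialized GenerationConfig via `extra_body` (key name assumed).
from openai import OpenAI
from transformers import GenerationConfig

client = OpenAI(base_url="http://localhost:8000/v1", api_key="unused")

gen_cfg = GenerationConfig(temperature=0.3, top_p=0.9, max_new_tokens=256)

response = client.chat.completions.create(
    model="Qwen/Qwen2.5-0.5B-Instruct",
    messages=[{"role": "user", "content": "hello"}],
    extra_body={"generation_config": gen_cfg.to_json_string()},  # assumed key
)
print(response.choices[0].message.content)
```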

The simplest way to interact with the server is by sending an HTTP request with `cURL`, e.g.

```shell
curl -X POST http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{"messages": [{"role": "system", "content": "hello"}], "temperature": 0.9, "max_tokens": 1000, "stream": true, "model": "Qwen/Qwen2.5-0.5B-Instruct"}'
```

from which you'll receive multiple chunks

```shell
data: {"object": "chat.completion.chunk", "id": "req_0", "created": 1751377863, "model": "Qwen/Qwen2.5-0.5B-Instruct", "system_fingerprint": "", "choices": [{"delta": {"role": "assistant", "content": "", "tool_call_id": null, "tool_calls": null}, "index": 0, "finish_reason": null, "logprobs": null}]}

data: {"object": "chat.completion.chunk", "id": "req_0", "created": 1751377863, "model": "Qwen/Qwen2.5-0.5B-Instruct", "system_fingerprint": "", "choices": [{"delta": {"role": "assistant", "content": "", "tool_call_id": null, "tool_calls": null}, "index": 0, "finish_reason": null, "logprobs": null}]}

(...)
```
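
If you want to consume this stream from Python rather than from a terminal, here is a minimal sketch with `requests`, assuming the `data:`-prefixed JSON chunks shown above (and, as in other OpenAI-compatible servers, possibly a final `[DONE]` sentinel):

```py
# Hypothetical sketch: stream chunks from the local server and print the deltas.
import json

import requests

payload = {
    "messages": [{"role": "user", "content": "hello"}],
    "stream": True,
    "model": "Qwen/Qwen2.5-0.5B-Instruct",
}
with requests.post("http://localhost:8000/v1/chat/completions", json=payload, stream=True) as resp:
    for line in resp.iter_lines():
        if not line or not line.startswith(b"data:"):
            continue
        data = line[len(b"data:"):].strip()
        if data == b"[DONE]":  # sentinel may or may not be sent; ignore it if present
            break
        chunk = json.loads(data)
        delta = chunk["choices"][0]["delta"].get("content") or ""
        print(delta, end="", flush=True)
```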

This server is also an MCP client: it can receive information from available MCP servers (i.e. tools), massage their information into the model prompt, and prepare calls to these tools when the model decides to use them. Naturally, this requires a model that is trained to use tools.

> [!TIP]
> At the moment, MCP tool usage in `transformers` is limited to the `qwen` family of models.

-<!-- TODO: section with a minimal python example -->
+<!-- TODO: example with a minimal python example -->

### Example 1: `tiny-agents` and MCP Tools

@@ -133,15 +133,17 @@ def create_generation_config_from_req(req: "ChatCompletionInput") -> "GenerationConfig":
    generation_config = GenerationConfig()

    if req.frequency_penalty is not None:
-        generation_config.repetition_penalty = req.frequency_penalty
+        generation_config.repetition_penalty = float(req.frequency_penalty)
    if req.logit_bias is not None:
        generation_config.sequence_bias = req.logit_bias
    if req.stop is not None:
        generation_config.stop_strings = req.stop
    if req.temperature is not None:
-        generation_config.temperature = req.temperature
+        generation_config.temperature = float(req.temperature)
+        if float(req.temperature) == 0.0:
+            generation_config.do_sample = False
    if req.top_p is not None:
-        generation_config.top_p = req.top_p
+        generation_config.top_p = float(req.top_p)
    if req.seed is not None:
        torch.manual_seed(req.seed)

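For context, the effect of these casts and of the new `temperature == 0.0` guard can be sketched in isolation. This is a hypothetical, simplified helper (the real function above also maps `logit_bias`, `stop`, and `seed`), shown only to illustrate that an explicit zero temperature yields greedy decoding:

```py
# Hypothetical sketch of the mapping above: OpenAI-style fields are cast to float
# and temperature == 0.0 disables sampling (greedy decoding).
from transformers import GenerationConfig

def config_from_fields(temperature=None, top_p=None, frequency_penalty=None):
    cfg = GenerationConfig()
    if frequency_penalty is not None:
        cfg.repetition_penalty = float(frequency_penalty)
    if temperature is not None:
        cfg.temperature = float(temperature)
        if float(temperature) == 0.0:
            cfg.do_sample = False
    if top_p is not None:
        cfg.top_p = float(top_p)
    return cfg

print(config_from_fields(temperature=0.9).temperature)  # 0.9, stored as a float
print(config_from_fields(temperature=0.0).do_sample)    # False: sampling stays off at temperature 0.0
```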
@@ -256,6 +258,9 @@ class ServeCommand(BaseTransformersCLICommand):
        finish_reason: Optional[str] = None,
        tool_calls: Optional[list[ChatCompletionStreamOutputDeltaToolCall]] = None,
    ) -> str:
+        print("role", role)
+        print("content", content)
+        print("finish_reason", finish_reason)
        payload = {
            "object": "chat.completion.chunk",
            "id": request_id,
@@ -280,6 +285,16 @@ class ServeCommand(BaseTransformersCLICommand):
    def run(self):
        app = FastAPI()
+
+        from fastapi.middleware.cors import CORSMiddleware
+
+        app.add_middleware(
+            CORSMiddleware,
+            allow_origins=['*'],
+            allow_credentials=True,
+            allow_methods=["*"],
+            allow_headers=["*"],
+        )

        if self.use_continuous_batching:
            self.continuous_batching(app)
        else:
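The added middleware opens the chat completion endpoints to browser-based clients on any origin. As a hedged sanity check (the origin value is illustrative, and the exact `Access-Control-Allow-Origin` value depends on how Starlette combines the wildcard with `allow_credentials`), a preflight request can be simulated once the server is running:

```py
# Hypothetical sketch: simulate a browser CORS preflight against the local server.
import requests

resp = requests.options(
    "http://localhost:8000/v1/chat/completions",
    headers={
        "Origin": "http://localhost:3000",            # assumed web UI origin
        "Access-Control-Request-Method": "POST",
    },
)
# With the middleware enabled, expect an allow-origin header to be present
# (either "*" or the echoed origin) and POST to be listed among allowed methods.
print(resp.headers.get("access-control-allow-origin"))
print(resp.headers.get("access-control-allow-methods"))
```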
@@ -401,8 +416,8 @@ class ServeCommand(BaseTransformersCLICommand):
        # No cached messages: this is a new request
        if self.last_messages is None:
            req_continues_last_messages = False
-        # The new request has fewer rounds of conversation: this is a new request
-        elif len(self.last_messages) > len(req.messages):
+        # The new request has no new rounds of conversation: this is a new request
+        elif len(self.last_messages) >= len(req.messages):
            req_continues_last_messages = False
        # Otherwise, check that the last messages are a subset of the new request
        else:
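The switch from `>` to `>=` means a request with the same number of messages as the cached conversation is also treated as a new request rather than a continuation. A small hedged sketch of the decision in isolation (the real code additionally verifies that the cached messages actually match the start of the new request):

```py
# Hypothetical sketch: does a new request continue the previously cached conversation?
def continues_last_messages(last_messages, new_messages):
    if last_messages is None:
        return False  # no cached messages: new request
    if len(last_messages) >= len(new_messages):
        return False  # no new rounds of conversation: new request
    # otherwise the cached messages must be a prefix of the new request
    return new_messages[: len(last_messages)] == last_messages

cached = [{"role": "user", "content": "hi"}, {"role": "assistant", "content": "hello!"}]
print(continues_last_messages(cached, cached))  # False: same length, nothing new
print(continues_last_messages(cached, cached + [{"role": "user", "content": "and you?"}]))  # True: one new round
```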
@@ -1773,6 +1773,10 @@ class GenerationMixin(ContinuousMixin):
                ):
                    modified_values[key] = model_gen_config_value
                    setattr(generation_config, key, model_gen_config_value)
+            # edge case: we may set `temperature=0.0` and `do_sample=False`, but the model defaults to
+            # `do_sample=True`
+            if generation_config.temperature == 0.0:
+                generation_config.do_sample = False
            if use_model_defaults is None and len(modified_values) > 0:
                logger.warning_once(
                    f"`generation_config` default values have been modified to match model-specific defaults: "