[cursor] tmp commit

Joao Gante 2025-07-01 16:53:48 +00:00
parent 379dc36e2f
commit c89d956cef
3 changed files with 42 additions and 7 deletions

View File

@@ -66,7 +66,7 @@ vllm serve Qwen/Qwen2.5-1.5B-Instruct \
## Serve CLI
> [!WARNING]
> This section is experimental and subject to changes in future versions
> This section is experimental and subject to change in future versions
<!-- TODO: LLMs -> models, after we add audio/image input/output support -->
You can serve `transformers`-compatible LLMs with `transformers serve`. The server has a chat completion API compatible with the OpenAI SDK, so you can also quickly experiment with `transformers` models in existing applications. To launch a server, use the `transformers serve` CLI:
@@ -78,12 +78,28 @@ transformers serve
<!-- TODO: either fully align the two APIs, or link to the `transformers` version instead -->
This server takes an extended version of the [`ChatCompletionInput`](https://huggingface.co/docs/huggingface_hub/v0.33.1/en/package_reference/inference_types#huggingface_hub.ChatCompletionInput), accepting a serialized `GenerationConfig` in its `extra_body` field for full `generate` parameterization. The CLI will dynamically load a new model as needed, following the `model` field in the request.
The simplest way to interact with the server is by sending an HTTP request with `cURL`, e.g.
```shell
curl -X POST http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{"messages": [{"role": "system", "content": "hello"}], "temperature": 0.9, "max_tokens": 1000, "stream": true, "model": "Qwen/Qwen2.5-0.5B-Instruct"}'
```
from which you'll receive multiple chunks
```shell
data: {"object": "chat.completion.chunk", "id": "req_0", "created": 1751377863, "model": "Qwen/Qwen2.5-0.5B-Instruct", "system_fingerprint": "", "choices": [{"delta": {"role": "assistant", "content": "", "tool_call_id": null, "tool_calls": null}, "index": 0, "finish_reason": null, "logprobs": null}]}
data: {"object": "chat.completion.chunk", "id": "req_0", "created": 1751377863, "model": "Qwen/Qwen2.5-0.5B-Instruct", "system_fingerprint": "", "choices": [{"delta": {"role": "assistant", "content": "", "tool_call_id": null, "tool_calls": null}, "index": 0, "finish_reason": null, "logprobs": null}]}
(...)
```
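Because the chat completion API is OpenAI-compatible, you can also talk to the server with the OpenAI Python SDK instead of `cURL`. The snippet below is a minimal sketch, not part of this commit: it assumes `openai>=1.0` is installed, the server is running on the default port, and no API key is enforced (the key below is a placeholder).
```python
# Minimal sketch: stream a chat completion from `transformers serve` via the OpenAI SDK.
from openai import OpenAI

# Placeholder key -- the local server is assumed not to check it.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# Mirrors the cURL request above; the model is loaded on demand from the `model` field.
stream = client.chat.completions.create(
    model="Qwen/Qwen2.5-0.5B-Instruct",
    messages=[{"role": "system", "content": "hello"}],
    temperature=0.9,
    max_tokens=1000,
    stream=True,
)

# Print the generated text as the chunks arrive.
for chunk in stream:
    delta = chunk.choices[0].delta
    if delta.content:
        print(delta.content, end="", flush=True)
```
The serialized `GenerationConfig` described above can be attached through the request's `extra_body` field in the same way; its exact payload shape is defined by the extended `ChatCompletionInput` and is not reproduced here.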
This server is also an MCP client: it can receive information from available MCP servers (i.e. tools), massage their information into the model prompt, and prepare calls to these tools when the model requests them. Naturally, this requires a model that is trained to use tools.
> [!TIP]
> At the moment, MCP tool usage in `transformers` is limited to the `qwen` family of models.
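To make the tool side of this concrete, here is a minimal MCP tool server sketch. It uses `FastMCP` from the `mcp` Python SDK, which is not part of this commit, and how `transformers serve` is configured to discover such a server is not shown here; treat it purely as an illustration of what an MCP server (i.e. a tool provider) looks like.
```python
# Minimal sketch of an MCP server exposing one tool, using the `mcp` Python SDK
# (`pip install mcp`). Illustrative only -- not part of this commit.
from mcp.server.fastmcp import FastMCP

mcp = FastMCP("demo-tools")

@mcp.tool()
def add(a: int, b: int) -> int:
    """Add two integers and return the result."""
    return a + b

if __name__ == "__main__":
    # Runs over stdio by default, so an MCP client can spawn it as a subprocess.
    mcp.run()
```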
<!-- TODO: section with a minimal python example -->
<!-- TODO: example with a minimal python example -->
### Example 1: `tiny-agents` and MCP Tools

View File

@@ -133,15 +133,17 @@ def create_generation_config_from_req(req: "ChatCompletionInput") -> "Generation
generation_config = GenerationConfig()
if req.frequency_penalty is not None:
generation_config.repetition_penalty = req.frequency_penalty
generation_config.repetition_penalty = float(req.frequency_penalty)
if req.logit_bias is not None:
generation_config.sequence_bias = req.logit_bias
if req.stop is not None:
generation_config.stop_strings = req.stop
if req.temperature is not None:
generation_config.temperature = req.temperature
generation_config.temperature = float(req.temperature)
if float(req.temperature) == 0.0:
generation_config.do_sample = False
if req.top_p is not None:
generation_config.top_p = req.top_p
generation_config.top_p = float(req.top_p)
if req.seed is not None:
torch.manual_seed(req.seed)
@@ -256,6 +258,9 @@ class ServeCommand(BaseTransformersCLICommand):
finish_reason: Optional[str] = None,
tool_calls: Optional[list[ChatCompletionStreamOutputDeltaToolCall]] = None,
) -> str:
print("role", role)
print("content", content)
print("finish_reason", finish_reason)
payload = {
"object": "chat.completion.chunk",
"id": request_id,
@@ -280,6 +285,16 @@ class ServeCommand(BaseTransformersCLICommand):
def run(self):
app = FastAPI()
from fastapi.middleware.cors import CORSMiddleware
app.add_middleware(
CORSMiddleware,
allow_origins=['*'],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
if self.use_continuous_batching:
self.continuous_batching(app)
else:
@@ -401,8 +416,8 @@ class ServeCommand(BaseTransformersCLICommand):
# No cached messages: this is a new request
if self.last_messages is None:
req_continues_last_messages = False
# The new request has fewer rounds of conversation: this is a new request
elif len(self.last_messages) > len(req.messages):
# The new request has no new rounds of conversation: this is a new request
elif len(self.last_messages) >= len(req.messages):
req_continues_last_messages = False
# Otherwise, check that the last messages are a subset of the new request
else:

View File

@@ -1773,6 +1773,10 @@ class GenerationMixin(ContinuousMixin):
):
modified_values[key] = model_gen_config_value
setattr(generation_config, key, model_gen_config_value)
# edge case: we may set `temperature=0.0` and `do_sample=False`, but the model defaults to
# `do_sample=True`
if generation_config.temperature == 0.0:
generation_config.do_sample = False
if use_model_defaults is None and len(modified_values) > 0:
logger.warning_once(
f"`generation_config` default values have been modified to match model-specific defaults: "