Implement AsyncTextIteratorStreamer for asynchronous streaming (#34931)
* Add AsyncTextIteratorStreamer class
* export AsyncTextIteratorStreamer
* export AsyncTextIteratorStreamer
* improve docs
* missing import
* missing import
* doc example fix
* doc example output fix
* add pytest-asyncio
* first attempt at tests
* missing import
* add pytest-asyncio
* fallback to wait_for and raise TimeoutError on timeout
* check for TimeoutError
* autodoc
* reorder imports
* fix style

---------

Co-authored-by: Arthur Zucker <arthur.zucker@gmail.com>
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
This commit is contained in:
parent: b5a557e5fe
commit: eafbb0eca7
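The heart of the change, per the bullets above, is a thread-to-event-loop handoff: `model.generate()` keeps running in a worker thread while decoded text is pushed onto an `asyncio.Queue` owned by the consumer's event loop. A minimal, stdlib-only sketch of that pattern (hypothetical names; not code from this commit):

```python
import asyncio
import threading
import time


async def main():
    loop = asyncio.get_running_loop()
    queue: "asyncio.Queue" = asyncio.Queue()
    stop = object()  # sentinel marking the end of the stream

    def produce():
        # Stand-in for model.generate() emitting text from a worker thread.
        for chunk in ["one, ", "two, ", "three"]:
            time.sleep(0.01)
            # asyncio.Queue is not thread-safe, so hand items to the loop's
            # thread instead of calling queue.put_nowait() directly.
            loop.call_soon_threadsafe(queue.put_nowait, chunk)
        loop.call_soon_threadsafe(queue.put_nowait, stop)

    threading.Thread(target=produce).start()
    while (value := await queue.get()) is not stop:
        print(value, end="", flush=True)


asyncio.run(main())
```

This is also why the new class must be constructed inside a coroutine: it needs `asyncio.get_running_loop()` to know which loop to hand text to.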
docs/source/en/internal/generation_utils.md
@@ -352,6 +352,8 @@ A [`Constraint`] can be used to force the generation to include specific tokens
 
 [[autodoc]] TextIteratorStreamer
 
+[[autodoc]] AsyncTextIteratorStreamer
+
 ## Caches
 
 [[autodoc]] Cache
setup.py
@@ -148,6 +148,7 @@ _deps = [
     "pyyaml>=5.1",
     "pydantic",
     "pytest>=7.2.0,<8.0.0",
+    "pytest-asyncio",
     "pytest-timeout",
     "pytest-xdist",
     "python>=3.9.0",
@@ -319,6 +320,7 @@ extras["tiktoken"] = deps_list("tiktoken", "blobfile")
 extras["testing"] = (
     deps_list(
         "pytest",
+        "pytest-asyncio",
         "pytest-rich",
         "pytest-xdist",
         "timeout-decorator",
src/transformers/__init__.py
@@ -122,6 +122,7 @@ _import_structure = {
     "feature_extraction_utils": ["BatchFeature", "FeatureExtractionMixin"],
     "file_utils": [],
     "generation": [
+        "AsyncTextIteratorStreamer",
         "CompileConfig",
         "GenerationConfig",
         "TextIteratorStreamer",
@@ -5055,7 +5056,14 @@ if TYPE_CHECKING:
     from .feature_extraction_utils import BatchFeature, FeatureExtractionMixin
 
     # Generation
-    from .generation import CompileConfig, GenerationConfig, TextIteratorStreamer, TextStreamer, WatermarkingConfig
+    from .generation import (
+        AsyncTextIteratorStreamer,
+        CompileConfig,
+        GenerationConfig,
+        TextIteratorStreamer,
+        TextStreamer,
+        WatermarkingConfig,
+    )
     from .hf_argparser import HfArgumentParser
 
     # Integrations
src/transformers/dependency_versions_table.py
@@ -54,6 +54,7 @@ deps = {
     "pyyaml": "pyyaml>=5.1",
     "pydantic": "pydantic",
     "pytest": "pytest>=7.2.0,<8.0.0",
+    "pytest-asyncio": "pytest-asyncio",
     "pytest-timeout": "pytest-timeout",
     "pytest-xdist": "pytest-xdist",
     "python": "python>=3.9.0",
src/transformers/generation/__init__.py
@@ -26,7 +26,7 @@ _import_structure = {
         "SynthIDTextWatermarkingConfig",
         "WatermarkingConfig",
     ],
-    "streamers": ["TextIteratorStreamer", "TextStreamer"],
+    "streamers": ["AsyncTextIteratorStreamer", "TextIteratorStreamer", "TextStreamer"],
 }
 
 try:
@@ -199,7 +199,7 @@ if TYPE_CHECKING:
         SynthIDTextWatermarkingConfig,
         WatermarkingConfig,
     )
-    from .streamers import TextIteratorStreamer, TextStreamer
+    from .streamers import AsyncTextIteratorStreamer, TextIteratorStreamer, TextStreamer
 
     try:
         if not is_torch_available():
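Each new symbol shows up twice in these `__init__` diffs because transformers loads lazily: names are registered in `_import_structure` for runtime resolution and imported again under `TYPE_CHECKING` for static analyzers. A condensed sketch of the pattern (abridged and illustrative, not the actual file contents):

```python
import sys
from typing import TYPE_CHECKING

_import_structure = {"streamers": ["AsyncTextIteratorStreamer", "TextIteratorStreamer", "TextStreamer"]}

if TYPE_CHECKING:
    # Seen only by type checkers; no import cost at runtime.
    from .streamers import AsyncTextIteratorStreamer, TextIteratorStreamer, TextStreamer
else:
    from ..utils import _LazyModule

    # Attributes resolve on first access instead of at import time.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
```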
src/transformers/generation/streamers.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import asyncio
 from queue import Queue
 from typing import TYPE_CHECKING, Optional
 
@@ -225,3 +226,91 @@ class TextIteratorStreamer(TextStreamer):
             raise StopIteration()
         else:
             return value
+
+
+class AsyncTextIteratorStreamer(TextStreamer):
+    """
+    Streamer that stores print-ready text in a queue, to be used by a downstream application as an async iterator.
+    This is useful for applications that benefit from accessing the generated text asynchronously (e.g. in an
+    interactive Gradio demo).
+
+    <Tip warning={true}>
+
+    The API for the streamer classes is still under development and may change in the future.
+
+    </Tip>
+
+    Parameters:
+        tokenizer (`AutoTokenizer`):
+            The tokenizer used to decode the tokens.
+        skip_prompt (`bool`, *optional*, defaults to `False`):
+            Whether to skip the prompt to `.generate()` or not. Useful e.g. for chatbots.
+        timeout (`float`, *optional*):
+            The timeout for the text queue. If `None`, the queue will block indefinitely. Useful to handle exceptions
+            in `.generate()`, when it is called in a separate thread.
+        decode_kwargs (`dict`, *optional*):
+            Additional keyword arguments to pass to the tokenizer's `decode` method.
+
+    Raises:
+        TimeoutError: If token generation time exceeds timeout value.
+
+    Examples:
+
+    ```python
+    >>> from transformers import AutoModelForCausalLM, AutoTokenizer, AsyncTextIteratorStreamer
+    >>> from threading import Thread
+    >>> import asyncio
+
+    >>> tok = AutoTokenizer.from_pretrained("openai-community/gpt2")
+    >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
+    >>> inputs = tok(["An increasing sequence: one,"], return_tensors="pt")
+
+    >>> # Run the generation in a separate thread, so that we can fetch the generated text in a non-blocking way.
+    >>> async def main():
+    ...     # Important: AsyncTextIteratorStreamer must be initialized inside a coroutine!
+    ...     streamer = AsyncTextIteratorStreamer(tok)
+    ...     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=20)
+    ...     thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    ...     thread.start()
+    ...     generated_text = ""
+    ...     async for new_text in streamer:
+    ...         generated_text += new_text
+    ...     print(generated_text)
+    >>> asyncio.run(main())
+    An increasing sequence: one, two, three, four, five, six, seven, eight, nine, ten, eleven,
+    ```
+    """
+
+    def __init__(
+        self, tokenizer: "AutoTokenizer", skip_prompt: bool = False, timeout: Optional[float] = None, **decode_kwargs
+    ):
+        super().__init__(tokenizer, skip_prompt, **decode_kwargs)
+        self.text_queue = asyncio.Queue()
+        self.stop_signal = None
+        self.timeout = timeout
+        self.loop = asyncio.get_running_loop()
+        self.has_asyncio_timeout = hasattr(asyncio, "timeout")
+
+    def on_finalized_text(self, text: str, stream_end: bool = False):
+        """Put the new text in the queue. If the stream is ending, also put a stop signal in the queue."""
+        self.loop.call_soon_threadsafe(self.text_queue.put_nowait, text)
+        if stream_end:
+            self.loop.call_soon_threadsafe(self.text_queue.put_nowait, self.stop_signal)
+
+    def __aiter__(self):
+        return self
+
+    async def __anext__(self):
+        try:
+            if self.has_asyncio_timeout:
+                async with asyncio.timeout(self.timeout):
+                    value = await self.text_queue.get()
+            else:
+                value = await asyncio.wait_for(self.text_queue.get(), timeout=self.timeout)
+        except asyncio.TimeoutError:
+            raise TimeoutError()
+        else:
+            if value == self.stop_signal:
+                raise StopAsyncIteration()
+            else:
+                return value
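The `__anext__` above prefers `asyncio.timeout` (a context manager added in Python 3.11) and falls back to `asyncio.wait_for` on older interpreters, converting `asyncio.TimeoutError` into the builtin `TimeoutError`. A standalone sketch of the same fallback (hypothetical helper, not part of the commit):

```python
import asyncio
from typing import Optional


async def get_with_timeout(queue: "asyncio.Queue[str]", timeout: Optional[float]):
    # Same branching as AsyncTextIteratorStreamer.__anext__: use the
    # asyncio.timeout context manager where it exists (Python 3.11+),
    # otherwise fall back to asyncio.wait_for.
    try:
        if hasattr(asyncio, "timeout"):
            async with asyncio.timeout(timeout):
                return await queue.get()
        return await asyncio.wait_for(queue.get(), timeout=timeout)
    except asyncio.TimeoutError:
        raise TimeoutError("no text arrived within the timeout")


async def demo():
    empty_queue: "asyncio.Queue[str]" = asyncio.Queue()
    try:
        await get_with_timeout(empty_queue, timeout=0.01)  # empty queue -> times out
    except TimeoutError as exc:
        print("caught:", exc)


asyncio.run(demo())
```

Note that the streamer re-raises the builtin `TimeoutError`, which is what the new timeout test below asserts against.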
tests/generation/test_streamers.py
@@ -17,7 +17,15 @@ import unittest
 from queue import Empty
 from threading import Thread
 
-from transformers import AutoTokenizer, TextIteratorStreamer, TextStreamer, is_torch_available
+import pytest
+
+from transformers import (
+    AsyncTextIteratorStreamer,
+    AutoTokenizer,
+    TextIteratorStreamer,
+    TextStreamer,
+    is_torch_available,
+)
 from transformers.testing_utils import CaptureStdout, require_torch, torch_device
 
 from ..test_modeling_common import ids_tensor
@@ -120,3 +128,43 @@ class StreamerTester(unittest.TestCase):
             streamer_text = ""
             for new_text in streamer:
                 streamer_text += new_text
+
+
+@require_torch
+@pytest.mark.asyncio(loop_scope="class")
+class AsyncStreamerTester(unittest.IsolatedAsyncioTestCase):
+    async def test_async_iterator_streamer_matches_non_streaming(self):
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
+        model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device)
+        model.config.eos_token_id = -1
+
+        input_ids = ids_tensor((1, 5), vocab_size=model.config.vocab_size).to(torch_device)
+        greedy_ids = model.generate(input_ids, max_new_tokens=10, do_sample=False)
+        greedy_text = tokenizer.decode(greedy_ids[0])
+
+        streamer = AsyncTextIteratorStreamer(tokenizer)
+        generation_kwargs = {"input_ids": input_ids, "max_new_tokens": 10, "do_sample": False, "streamer": streamer}
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+        streamer_text = ""
+        async for new_text in streamer:
+            streamer_text += new_text
+
+        self.assertEqual(streamer_text, greedy_text)
+
+    async def test_async_iterator_streamer_timeout(self):
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
+        model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device)
+        model.config.eos_token_id = -1
+
+        input_ids = ids_tensor((1, 5), vocab_size=model.config.vocab_size).to(torch_device)
+        streamer = AsyncTextIteratorStreamer(tokenizer, timeout=0.001)
+        generation_kwargs = {"input_ids": input_ids, "max_new_tokens": 10, "do_sample": False, "streamer": streamer}
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+
+        # The streamer will timeout after 0.001 seconds, so TimeoutError will be raised
+        with self.assertRaises(TimeoutError):
+            streamer_text = ""
+            async for new_text in streamer:
+                streamer_text += new_text
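One detail worth noting in the test class above: `AsyncTextIteratorStreamer` captures the running event loop in `__init__`, so the streamer must be created and iterated on the same loop, and `loop_scope="class"` (a pytest-asyncio option) keeps a single loop for the whole class. Assuming a standard dev install with the `testing` extra, an invocation along the lines of `python -m pytest tests/generation/test_streamers.py -k async` (illustrative, not from the commit) should select the new tests.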