diff --git a/invokeai/app/api/routers/utilities.py b/invokeai/app/api/routers/utilities.py
index 568546603ab..06f3e4a44c2 100644
--- a/invokeai/app/api/routers/utilities.py
+++ b/invokeai/app/api/routers/utilities.py
@@ -74,6 +74,9 @@ class ExpandPromptRequest(BaseModel):
     model_key: str
     max_tokens: int = Field(default=300, ge=1, le=2048)
     system_prompt: str | None = None
+    task_id: str | None = Field(
+        default=None, description="Client-supplied task ID used to correlate socket progress events to this request"
+    )
 
 
 class ExpandPromptResponse(BaseModel):
@@ -90,14 +93,25 @@ def _resolve_model_path(model_config_path: str) -> Path:
     return (base_models_path / model_path).resolve()
 
 
-def _run_expand_prompt(prompt: str, model_key: str, max_tokens: int, system_prompt: str | None) -> str:
+def _run_expand_prompt(
+    prompt: str,
+    model_key: str,
+    max_tokens: int,
+    system_prompt: str | None,
+    task_id: str | None,
+    user_id: str,
+) -> str:
     """Run text LLM inference synchronously (called from thread)."""
     model_manager = ApiDependencies.invoker.services.model_manager
+    events = ApiDependencies.invoker.services.events
     model_config = model_manager.store.get_model(model_key)
 
     if model_config.type != ModelType.TextLLM:
         raise ValueError(f"Model '{model_key}' is not a TextLLM model (got {model_config.type})")
 
+    if task_id is not None:
+        events.emit_llm_task_progress(task_id=task_id, user_id=user_id, phase="loading_model", message="Loading model")
+
     with _model_load_lock:
         loaded_model = model_manager.load.load_model(model_config)
 
@@ -107,12 +121,28 @@ def _run_expand_prompt(prompt: str, model_key: str, max_tokens: int, system_prom
 
         pipeline = TextLLMPipeline(model, tokenizer)
         model_device = next(model.parameters()).device
+
+        progress_callback = None
+        if task_id is not None:
+
+            def progress_callback(current: int, total: int) -> None:
+                events.emit_llm_task_progress(
+                    task_id=task_id,
+                    user_id=user_id,
+                    phase="generating",
+                    message="Generating",
+                    percentage=(current / total) if total > 0 else None,
+                    current_tokens=current,
+                    total_tokens=total,
+                )
+
         output = pipeline.run(
             prompt=prompt,
             system_prompt=system_prompt or DEFAULT_SYSTEM_PROMPT,
             max_new_tokens=max_tokens,
             device=model_device,
             dtype=TorchDevice.choose_torch_dtype(),
+            progress_callback=progress_callback,
         )
 
     return output
@@ -127,6 +157,7 @@ def _run_expand_prompt(prompt: str, model_key: str, max_tokens: int, system_prom
 )
 async def expand_prompt(current_user: CurrentUserOrDefault, body: ExpandPromptRequest) -> ExpandPromptResponse:
     """Expand a brief prompt into a detailed image generation prompt using a text LLM."""
+    events = ApiDependencies.invoker.services.events
     try:
         expanded = await asyncio.to_thread(
             _run_expand_prompt,
@@ -134,13 +165,23 @@ async def expand_prompt(current_user: CurrentUserOrDefault, body: ExpandPromptRe
             body.model_key,
             body.max_tokens,
             body.system_prompt,
+            body.task_id,
+            current_user.user_id,
         )
+        if body.task_id is not None:
+            events.emit_llm_task_complete(task_id=body.task_id, user_id=current_user.user_id)
         return ExpandPromptResponse(expanded_prompt=expanded)
     except UnknownModelException:
+        if body.task_id is not None:
+            events.emit_llm_task_error(task_id=body.task_id, user_id=current_user.user_id, error="Model not found")
         raise HTTPException(status_code=404, detail=f"Model '{body.model_key}' not found")
     except ValueError as e:
+        if body.task_id is not None:
+            events.emit_llm_task_error(task_id=body.task_id, user_id=current_user.user_id, error=str(e))
         raise HTTPException(status_code=422, detail=str(e))
     except Exception as e:
+        if body.task_id is not None:
+            events.emit_llm_task_error(task_id=body.task_id, user_id=current_user.user_id, error=str(e))
         logger.error(f"Error expanding prompt: {e}")
         raise HTTPException(status_code=500, detail=str(e))
 
@@ -152,6 +193,9 @@ class ImageToPromptRequest(BaseModel):
     image_name: str
     model_key: str
     instruction: str = "Describe this image in detail for use as an AI image generation prompt."
+    task_id: str | None = Field(
+        default=None, description="Client-supplied task ID used to correlate socket progress events to this request"
+    )
 
 
 class ImageToPromptResponse(BaseModel):
@@ -159,14 +203,24 @@ class ImageToPromptResponse(BaseModel):
     error: str | None = None
 
 
-def _run_image_to_prompt(image_name: str, model_key: str, instruction: str) -> str:
+def _run_image_to_prompt(
+    image_name: str,
+    model_key: str,
+    instruction: str,
+    task_id: str | None,
+    user_id: str,
+) -> str:
     """Run LLaVA OneVision inference synchronously (called from thread)."""
     model_manager = ApiDependencies.invoker.services.model_manager
+    events = ApiDependencies.invoker.services.events
     model_config = model_manager.store.get_model(model_key)
 
     if model_config.type != ModelType.LlavaOnevision:
         raise ValueError(f"Model '{model_key}' is not a LLaVA OneVision model (got {model_config.type})")
 
+    if task_id is not None:
+        events.emit_llm_task_progress(task_id=task_id, user_id=user_id, phase="loading_model", message="Loading model")
+
     with _model_load_lock:
         loaded_model = model_manager.load.load_model(model_config)
 
@@ -185,11 +239,27 @@ def _run_image_to_prompt(image_name: str, model_key: str, instruction: str) -> s
 
         pipeline = LlavaOnevisionPipeline(model, processor)
         model_device = next(model.parameters()).device
+
+        progress_callback = None
+        if task_id is not None:
+
+            def progress_callback(current: int, total: int) -> None:
+                events.emit_llm_task_progress(
+                    task_id=task_id,
+                    user_id=user_id,
+                    phase="generating",
+                    message="Generating",
+                    percentage=(current / total) if total > 0 else None,
+                    current_tokens=current,
+                    total_tokens=total,
+                )
+
         output = pipeline.run(
             prompt=instruction,
             images=[image],
             device=model_device,
             dtype=TorchDevice.choose_torch_dtype(),
+            progress_callback=progress_callback,
         )
 
     return output
@@ -208,20 +278,33 @@ async def image_to_prompt(current_user: CurrentUserOrDefault, body: ImageToPromp
     # via this endpoint (mirrors the policy in routers/images.py).
     assert_image_read_access(body.image_name, current_user)
 
+    events = ApiDependencies.invoker.services.events
     try:
         prompt = await asyncio.to_thread(
             _run_image_to_prompt,
             body.image_name,
             body.model_key,
             body.instruction,
+            body.task_id,
+            current_user.user_id,
         )
+        if body.task_id is not None:
+            events.emit_llm_task_complete(task_id=body.task_id, user_id=current_user.user_id)
         return ImageToPromptResponse(prompt=prompt)
     except UnknownModelException:
+        if body.task_id is not None:
+            events.emit_llm_task_error(task_id=body.task_id, user_id=current_user.user_id, error="Model not found")
         raise HTTPException(status_code=404, detail=f"Model '{body.model_key}' not found")
     except ImageFileNotFoundException:
+        if body.task_id is not None:
+            events.emit_llm_task_error(task_id=body.task_id, user_id=current_user.user_id, error="Image not found")
         raise HTTPException(status_code=404, detail=f"Image '{body.image_name}' not found")
     except (ValueError, TypeError) as e:
+        if body.task_id is not None:
+            events.emit_llm_task_error(task_id=body.task_id, user_id=current_user.user_id, error=str(e))
         raise HTTPException(status_code=422, detail=str(e))
     except Exception as e:
+        if body.task_id is not None:
+            events.emit_llm_task_error(task_id=body.task_id, user_id=current_user.user_id, error=str(e))
         logger.error(f"Error generating prompt from image: {e}")
         raise HTTPException(status_code=500, detail=str(e))
diff --git a/invokeai/app/api/sockets.py b/invokeai/app/api/sockets.py
index 5783b804c0b..2c6e428d8f0 100644
--- a/invokeai/app/api/sockets.py
+++ b/invokeai/app/api/sockets.py
@@ -24,6 +24,10 @@
     InvocationErrorEvent,
     InvocationProgressEvent,
     InvocationStartedEvent,
+    LLMTaskCompleteEvent,
+    LLMTaskErrorEvent,
+    LLMTaskEventBase,
+    LLMTaskProgressEvent,
     ModelEventBase,
     ModelInstallCancelledEvent,
     ModelInstallCompleteEvent,
@@ -87,6 +91,8 @@ class BulkDownloadSubscriptionEvent(BaseModel):
 
 BULK_DOWNLOAD_EVENTS = {BulkDownloadStartedEvent, BulkDownloadCompleteEvent, BulkDownloadErrorEvent}
 
+LLM_TASK_EVENTS = {LLMTaskProgressEvent, LLMTaskCompleteEvent, LLMTaskErrorEvent}
+
 
 class SocketIO:
     _sub_queue = "subscribe_queue"
@@ -115,6 +121,7 @@ def __init__(self, app: FastAPI):
         register_events(QUEUE_EVENTS, self._handle_queue_event)
         register_events(MODEL_EVENTS, self._handle_model_event)
         register_events(BULK_DOWNLOAD_EVENTS, self._handle_bulk_image_download_event)
+        register_events(LLM_TASK_EVENTS, self._handle_llm_task_event)
 
     async def _handle_connect(self, sid: str, environ: dict, auth: dict | None) -> bool:
         """Handle socket connection and authenticate the user.
@@ -345,6 +352,18 @@ async def _handle_queue_event(self, event: FastAPIEvent[QueueEventBase]):
     async def _handle_model_event(self, event: FastAPIEvent[ModelEventBase | DownloadEventBase]) -> None:
         await self._sio.emit(event=event[0], data=event[1].model_dump(mode="json"))
 
+    async def _handle_llm_task_event(self, event: FastAPIEvent[LLMTaskEventBase]) -> None:
+        """Route LLM utility task events privately to the originating user + admins.
+
+        The payload is emitted to the initiating user's `user:<user_id>` room and to the
+        `admin` room only; these per-request events must not be broadcast to other users.
+        """
+        event_name, event_data = event
+        user_room = f"user:{event_data.user_id}"
+        payload = event_data.model_dump(mode="json")
+        await self._sio.emit(event=event_name, data=payload, room=user_room)
+        await self._sio.emit(event=event_name, data=payload, room="admin")
+
     async def _handle_bulk_image_download_event(self, event: FastAPIEvent[BulkDownloadEventBase]) -> None:
         event_name, event_data = event
         # Route to user-specific + admin rooms so that other authenticated
diff --git a/invokeai/app/services/events/events_base.py b/invokeai/app/services/events/events_base.py
index 935b422a732..1dfd02728da 100644
--- a/invokeai/app/services/events/events_base.py
+++ b/invokeai/app/services/events/events_base.py
@@ -1,7 +1,7 @@
 # Copyright (c) 2022 Kyle Schouviller (https://github.com/kyle0654)
 
 
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING, Literal, Optional
 
 from invokeai.app.services.events.events_common import (
     BatchEnqueuedEvent,
@@ -19,6 +19,9 @@
     InvocationErrorEvent,
     InvocationProgressEvent,
     InvocationStartedEvent,
+    LLMTaskCompleteEvent,
+    LLMTaskErrorEvent,
+    LLMTaskProgressEvent,
     ModelInstallCancelledEvent,
     ModelInstallCompleteEvent,
     ModelInstallDownloadProgressEvent,
@@ -191,6 +194,41 @@ def emit_model_install_error(self, job: "ModelInstallJob") -> None:
 
     # endregion
 
+    # region LLM utility tasks
+
+    def emit_llm_task_progress(
+        self,
+        task_id: str,
+        user_id: str,
+        phase: Literal["loading_model", "generating"],
+        message: str,
+        percentage: float | None = None,
+        current_tokens: int | None = None,
+        total_tokens: int | None = None,
+    ) -> None:
+        """Report progress of an LLM utility task (expand-prompt, image-to-prompt) by dispatching an event."""
+        progress_event = LLMTaskProgressEvent(
+            task_id=task_id,
+            user_id=user_id,
+            phase=phase,
+            message=message,
+            percentage=percentage,
+            current_tokens=current_tokens,
+            total_tokens=total_tokens,
+        )
+        # Hand the fully-built event to `dispatch` for delivery.
+        self.dispatch(progress_event)
+
+    def emit_llm_task_complete(self, task_id: str, user_id: str) -> None:
+        """Dispatch an `LLMTaskCompleteEvent` for the given task and user."""
+        self.dispatch(LLMTaskCompleteEvent(user_id=user_id, task_id=task_id))
+
+    def emit_llm_task_error(self, task_id: str, user_id: str, error: str) -> None:
+        """Dispatch an `LLMTaskErrorEvent` carrying the error message for the given task and user."""
+        self.dispatch(LLMTaskErrorEvent(error=error, user_id=user_id, task_id=task_id))
+
+    # endregion
+
     # region Bulk image download
 
     def emit_bulk_download_started(
diff --git a/invokeai/app/services/events/events_common.py b/invokeai/app/services/events/events_common.py
index 0c530f9a2f7..6b06c7be060 100644
--- a/invokeai/app/services/events/events_common.py
+++ b/invokeai/app/services/events/events_common.py
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING, Any, ClassVar, Coroutine, Generic, Optional, Protocol, TypeAlias, TypeVar
+from typing import TYPE_CHECKING, Any, ClassVar, Coroutine, Generic, Literal, Optional, Protocol, TypeAlias, TypeVar
 
 from fastapi_events.handlers.local import local_handler
 from fastapi_events.registry.payload_schema import registry as payload_schema
@@ -689,6 +689,51 @@ def build(
         )
 
 
+class LLMTaskEventBase(EventBase):
+    """Base class for LLM utility task events (expand-prompt, image-to-prompt).
+
+    These events are correlated to a specific HTTP request via a client-supplied
+    task_id. They are meant to be delivered only to the originating user (and admins),
+    not broadcast; they carry progress/status and error fields for that one request.
+    """
+
+    task_id: str = Field(description="Client-supplied task ID correlating events to a single request")
+    user_id: str = Field(default="system", description="ID of the user who initiated the task")
+
+
+@payload_schema.register
+class LLMTaskProgressEvent(LLMTaskEventBase):
+    """Event model for llm_task_progress"""
+
+    __event_name__ = "llm_task_progress"
+
+    phase: Literal["loading_model", "generating"] = Field(description="Which phase of the task is in progress")
+    message: str = Field(description="A short message describing the current phase")
+    percentage: float | None = Field(
+        default=None, ge=0, le=1, description="Progress fraction in [0, 1]; omit for indeterminate progress"
+    )
+    current_tokens: int | None = Field(default=None, description="Number of tokens generated so far (generating phase)")
+    total_tokens: int | None = Field(
+        default=None, description="Max tokens the request will generate (generating phase)"
+    )
+
+
+@payload_schema.register
+class LLMTaskCompleteEvent(LLMTaskEventBase):
+    """Event model for llm_task_complete"""
+
+    __event_name__ = "llm_task_complete"
+
+
+@payload_schema.register
+class LLMTaskErrorEvent(LLMTaskEventBase):
+    """Event model for llm_task_error"""
+
+    __event_name__ = "llm_task_error"
+
+    error: str = Field(description="The error message")
+
+
 @payload_schema.register
 class RecallParametersUpdatedEvent(QueueEventBase):
     """Event model for recall_parameters_updated"""
diff --git a/invokeai/backend/llava_onevision_pipeline.py b/invokeai/backend/llava_onevision_pipeline.py
index 93614f40654..abb136ba9fe 100644
--- a/invokeai/backend/llava_onevision_pipeline.py
+++ b/invokeai/backend/llava_onevision_pipeline.py
@@ -1,6 +1,11 @@
+import threading
+from typing import Callable
+
 import torch
 from PIL.Image import Image
-from transformers import LlavaOnevisionForConditionalGeneration, LlavaOnevisionProcessor
+from transformers import LlavaOnevisionForConditionalGeneration, LlavaOnevisionProcessor, TextIteratorStreamer
+
+ProgressCallback = Callable[[int, int], None]
 
 
 class LlavaOnevisionPipeline:
@@ -10,7 +15,15 @@ def __init__(self, vllm_model: LlavaOnevisionForConditionalGeneration, processor
         self._vllm_model = vllm_model
         self._processor = processor
 
-    def run(self, prompt: str, images: list[Image], device: torch.device, dtype: torch.dtype) -> str:
+    def run(
+        self,
+        prompt: str,
+        images: list[Image],
+        device: torch.device,
+        dtype: torch.dtype,
+        max_new_tokens: int = 400,
+        progress_callback: ProgressCallback | None = None,
+    ) -> str:
         # TODO(ryand): Tune the max number of images that are useful for the model.
         if len(images) > 3:
             raise ValueError(
@@ -21,15 +34,51 @@ def run(self, prompt: str, images: list[Image], device: torch.device, dtype: tor
         # Define a chat history and use `apply_chat_template` to get correctly formatted prompt.
         # "content" is a list of dicts with types "text" or "image".
         content = [{"type": "text", "text": prompt}]
-        # Add the correct number of images.
         for _ in images:
             content.append({"type": "image"})
 
         conversation = [{"role": "user", "content": content}]
-        prompt = self._processor.apply_chat_template(conversation, add_generation_prompt=True)
-        inputs = self._processor(images=images or None, text=prompt, return_tensors="pt").to(device=device, dtype=dtype)
-        output = self._vllm_model.generate(**inputs, max_new_tokens=400, do_sample=False)
-        output_str: str = self._processor.decode(output[0][2:], skip_special_tokens=True)
-        # The output_str will include the prompt, so we extract the response.
-        response = output_str.split("assistant\n", 1)[1].strip()
-        return response
+        formatted_prompt = self._processor.apply_chat_template(conversation, add_generation_prompt=True)
+        inputs = self._processor(images=images or None, text=formatted_prompt, return_tensors="pt").to(
+            device=device, dtype=dtype
+        )
+
+        tokenizer = self._processor.tokenizer
+        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = dict(
+            **inputs,
+            max_new_tokens=max_new_tokens,
+            do_sample=False,
+            streamer=streamer,
+        )
+
+        # generate() blocks until done; run it on a worker thread so this thread can
+        # drain the streamer and report progress while tokens are produced.
+        generation_error: list[BaseException] = []
+
+        def _generate() -> None:
+            try:
+                self._vllm_model.generate(**generation_kwargs)
+            except BaseException as e:
+                generation_error.append(e)
+                # generate() only ends the stream when it succeeds. Push the stop signal
+                # ourselves so the consumer loop below cannot block forever on the queue.
+                streamer.on_finalized_text("", stream_end=True)
+
+        thread = threading.Thread(target=_generate, daemon=True)
+        thread.start()
+
+        chunks: list[str] = []
+        for chunk in streamer:
+            if not chunk:
+                continue
+            chunks.append(chunk)
+            if progress_callback is not None:
+                token_count = len(tokenizer.encode("".join(chunks), add_special_tokens=False))
+                progress_callback(min(token_count, max_new_tokens), max_new_tokens)
+
+        thread.join()
+        if generation_error:
+            raise generation_error[0]
+
+        return "".join(chunks).strip()
diff --git a/invokeai/backend/text_llm_pipeline.py b/invokeai/backend/text_llm_pipeline.py
index 69815c1a7f7..d0eb534adb4 100644
--- a/invokeai/backend/text_llm_pipeline.py
+++ b/invokeai/backend/text_llm_pipeline.py
@@ -1,5 +1,8 @@
+import threading
+from typing import Callable
+
 import torch
-from transformers import PreTrainedModel, PreTrainedTokenizerBase
+from transformers import PreTrainedModel, PreTrainedTokenizerBase, TextIteratorStreamer
 
 DEFAULT_SYSTEM_PROMPT = (
     "You are an expert prompt writer for AI image generation. "
@@ -8,6 +11,9 @@
 )
 
 
+ProgressCallback = Callable[[int, int], None]
+
+
 class TextLLMPipeline:
     """A wrapper for a causal language model + tokenizer for text generation."""
 
@@ -22,6 +28,7 @@ def run(
         max_new_tokens: int = 300,
         device: torch.device = torch.device("cpu"),
         dtype: torch.dtype = torch.float16,
+        progress_callback: ProgressCallback | None = None,
     ) -> str:
         # Build messages for chat template if supported, otherwise use raw prompt.
         if hasattr(self._tokenizer, "apply_chat_template") and self._tokenizer.chat_template is not None:
@@ -33,24 +40,53 @@ def run(
                 messages, tokenize=False, add_generation_prompt=True
             )
         else:
-            # Fallback for models without chat template
             if system_prompt:
                 formatted_prompt = f"{system_prompt}\n\nUser: {prompt}\nAssistant:"
             else:
                 formatted_prompt = prompt
 
         inputs = self._tokenizer(formatted_prompt, return_tensors="pt").to(device=device)
-        output = self._model.generate(
+
+        streamer = TextIteratorStreamer(self._tokenizer, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = dict(
             **inputs,
             max_new_tokens=max_new_tokens,
             do_sample=True,
             temperature=0.7,
             top_p=0.9,
+            streamer=streamer,
         )
 
-        # Decode only the newly generated tokens (exclude the input prompt tokens).
-        input_length = inputs["input_ids"].shape[1]
-        generated_tokens = output[0][input_length:]
-        response = self._tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
+        # model.generate blocks until done; run it in a thread so we can consume the
+        # streamer iteratively and emit progress.
+        generation_error: list[BaseException] = []
+
+        def _generate() -> None:
+            try:
+                self._model.generate(**generation_kwargs)
+            except BaseException as e:
+                generation_error.append(e)
+                # generate() only ends the stream when it succeeds. Push the stop signal
+                # ourselves so the consumer loop below cannot block forever on the queue.
+                streamer.on_finalized_text("", stream_end=True)
+
+        thread = threading.Thread(target=_generate, daemon=True)
+        thread.start()
+
+        chunks: list[str] = []
+        for chunk in streamer:
+            if not chunk:
+                continue
+            chunks.append(chunk)
+            if progress_callback is not None:
+                # The streamer yields decoded text chunks rather than individual tokens, so
+                # approximate the count by re-tokenizing the accumulated text. This is exact
+                # enough for a progress bar, and is skipped entirely when nobody listens.
+                token_count = len(self._tokenizer.encode("".join(chunks), add_special_tokens=False))
+                progress_callback(min(token_count, max_new_tokens), max_new_tokens)
+
+        thread.join()
+        if generation_error:
+            raise generation_error[0]
 
-        return response
+        return "".join(chunks).strip()
diff --git a/invokeai/frontend/web/openapi.json b/invokeai/frontend/web/openapi.json
index e13946511e2..e44571afc1d 100644
--- a/invokeai/frontend/web/openapi.json
+++ b/invokeai/frontend/web/openapi.json
@@ -22178,6 +22178,18 @@
               }
             ],
             "title": "System Prompt"
+          },
+          "task_id": {
+            "anyOf": [
+              {
+                "type": "string"
+              },
+              {
+                "type": "null"
+              }
+            ],
+            "title": "Task Id",
+            "description": "Client-supplied task ID used to correlate socket progress events to this request"
           }
         },
         "type": "object",
@@ -34684,6 +34696,18 @@
             "type": "string",
             "title": "Instruction",
             "default": "Describe this image in detail for use as an AI image generation prompt."
+          },
+          "task_id": {
+            "anyOf": [
+              {
+                "type": "string"
+              },
+              {
+                "type": "null"
+              }
+            ],
+            "title": "Task Id",
+            "description": "Client-supplied task ID used to correlate socket progress events to this request"
           }
         },
         "type": "object",
@@ -42623,6 +42647,144 @@
         "type": "object"
       },
       "JsonValue": {},
+      "LLMTaskCompleteEvent": {
+        "description": "Event model for llm_task_complete",
+        "properties": {
+          "timestamp": {
+            "description": "The timestamp of the event",
+            "title": "Timestamp",
+            "type": "integer"
+          },
+          "task_id": {
+            "description": "Client-supplied task ID correlating events to a single request",
+            "title": "Task Id",
+            "type": "string"
+          },
+          "user_id": {
+            "default": "system",
+            "description": "ID of the user who initiated the task",
+            "title": "User Id",
+            "type": "string"
+          }
+        },
+        "required": ["timestamp", "task_id", "user_id"],
+        "title": "LLMTaskCompleteEvent",
+        "type": "object"
+      },
+      "LLMTaskErrorEvent": {
+        "description": "Event model for llm_task_error",
+        "properties": {
+          "timestamp": {
+            "description": "The timestamp of the event",
+            "title": "Timestamp",
+            "type": "integer"
+          },
+          "task_id": {
+            "description": "Client-supplied task ID correlating events to a single request",
+            "title": "Task Id",
+            "type": "string"
+          },
+          "user_id": {
+            "default": "system",
+            "description": "ID of the user who initiated the task",
+            "title": "User Id",
+            "type": "string"
+          },
+          "error": {
+            "description": "The error message",
+            "title": "Error",
+            "type": "string"
+          }
+        },
+        "required": ["timestamp", "task_id", "user_id", "error"],
+        "title": "LLMTaskErrorEvent",
+        "type": "object"
+      },
+      "LLMTaskProgressEvent": {
+        "description": "Event model for llm_task_progress",
+        "properties": {
+          "timestamp": {
+            "description": "The timestamp of the event",
+            "title": "Timestamp",
+            "type": "integer"
+          },
+          "task_id": {
+            "description": "Client-supplied task ID correlating events to a single request",
+            "title": "Task Id",
+            "type": "string"
+          },
+          "user_id": {
+            "default": "system",
+            "description": "ID of the user who initiated the task",
+            "title": "User Id",
+            "type": "string"
+          },
+          "phase": {
+            "description": "Which phase of the task is in progress",
+            "enum": ["loading_model", "generating"],
+            "title": "Phase",
+            "type": "string"
+          },
+          "message": {
+            "description": "A short message describing the current phase",
+            "title": "Message",
+            "type": "string"
+          },
+          "percentage": {
+            "anyOf": [
+              {
+                "maximum": 1,
+                "minimum": 0,
+                "type": "number"
+              },
+              {
+                "type": "null"
+              }
+            ],
+            "default": null,
+            "description": "Progress fraction in [0, 1]; omit for indeterminate progress",
+            "title": "Percentage"
+          },
+          "current_tokens": {
+            "anyOf": [
+              {
+                "type": "integer"
+              },
+              {
+                "type": "null"
+              }
+            ],
+            "default": null,
+            "description": "Number of tokens generated so far (generating phase)",
+            "title": "Current Tokens"
+          },
+          "total_tokens": {
+            "anyOf": [
+              {
+                "type": "integer"
+              },
+              {
+                "type": "null"
+              }
+            ],
+            "default": null,
+            "description": "Max tokens the request will generate (generating phase)",
+            "title": "Total Tokens"
+          }
+        },
+        "required": [
+          "timestamp",
+          "task_id",
+          "user_id",
+          "phase",
+          "message",
+          "percentage",
+          "current_tokens",
+          "total_tokens"
+        ],
+        "title": "LLMTaskProgressEvent",
+        "type": "object"
+      },
       "LaMaInfillInvocation": {
         "category": "inpaint",
         "class": "invocation",
diff --git a/invokeai/frontend/web/public/locales/en.json b/invokeai/frontend/web/public/locales/en.json
index 3e88d460e55..da256e17fb1 100644
--- a/invokeai/frontend/web/public/locales/en.json
+++ b/invokeai/frontend/web/public/locales/en.json
@@ -392,7 +392,9 @@
         "noTextLLMInstalledDescription": "Prompt expansion needs a Text LLM (causal language model). We recommend Qwen2.5-1.5B-Instruct (~3 GB) — small, fast, and available as a starter model.",
         "noVisionModelInstalledTitle": "No vision model installed",
         "noVisionModelInstalledDescription": "Image-to-prompt needs a vision-language model (e.g. LLaVA Onevision). The 0.5B starter (~1 GB) is the lightweight default.",
-        "openModelManager": "Open Model Manager"
+        "openModelManager": "Open Model Manager",
+        "llmTaskLoadingModel": "Loading model…",
+        "llmTaskGenerating": "Generating…"
     },
     "queue": {
         "queue": "Queue",
diff --git a/invokeai/frontend/web/src/features/prompt/ExpandPromptButton.tsx b/invokeai/frontend/web/src/features/prompt/ExpandPromptButton.tsx
index e0f035963a2..272b7f6810b 100644
--- a/invokeai/frontend/web/src/features/prompt/ExpandPromptButton.tsx
+++ b/invokeai/frontend/web/src/features/prompt/ExpandPromptButton.tsx
@@ -18,6 +18,7 @@ import { useDisclosure } from 'common/hooks/useBoolean';
 import { positivePromptChanged, selectPositivePrompt } from 'features/controlLayers/store/paramsSlice';
 import { setInstallModelsTabByName } from 'features/modelManagerV2/store/installModelsStore';
 import { ModelPicker } from 'features/parameters/components/ModelPicker';
+import { LLMTaskProgressDisplay } from 'features/prompt/LLMTaskProgressDisplay';
 import { setPromptUndo } from 'features/prompt/promptUndo';
 import { navigationApi } from 'features/ui/layouts/navigation-api';
 import { memo, useCallback, useState } from 'react';
@@ -26,6 +27,8 @@ import { PiSparkleBold } from 'react-icons/pi';
 import { useExpandPromptMutation } from 'services/api/endpoints/utilities';
 import { useTextLLMModels } from 'services/api/hooks/modelsByType';
 import type { AnyModelConfig } from 'services/api/types';
+import { clearLLMTaskState } from 'services/events/stores';
+import { v4 as uuidv4 } from 'uuid';
 
 const loadingStyles: SystemStyleObject = {
   svg: { animation: spinAnimation },
@@ -38,6 +41,7 @@ export const ExpandPromptButton = memo(() => {
   const [modelConfigs] = useTextLLMModels();
   const popover = useDisclosure(false);
   const [selectedModel, setSelectedModel] = useState<AnyModelConfig | undefined>(undefined);
+  const [taskId, setTaskId] = useState<string | null>(null);
   const [expandPrompt, { isLoading }] = useExpandPromptMutation();
 
   const hasModels = modelConfigs.length > 0;
@@ -50,10 +54,13 @@ export const ExpandPromptButton = memo(() => {
     if (!selectedModel || !prompt.trim()) {
       return;
     }
+    const newTaskId = uuidv4();
+    setTaskId(newTaskId);
     try {
       const result = await expandPrompt({
         prompt,
         model_key: selectedModel.key,
+        task_id: newTaskId,
       }).unwrap();
       if (result.expanded_prompt) {
         setPromptUndo(prompt);
@@ -62,6 +69,9 @@ export const ExpandPromptButton = memo(() => {
       popover.close();
     } catch {
       // Error is handled by RTK Query
+    } finally {
+      clearLLMTaskState(newTaskId);
+      setTaskId(null);
     }
   }, [selectedModel, prompt, expandPrompt, dispatch, popover]);
 
@@ -110,6 +120,7 @@ export const ExpandPromptButton = memo(() => {
                   onChange={handleModelChange}
                   placeholder={t('prompt.selectTextLLM')}
                 />
+                {isLoading ? <LLMTaskProgressDisplay taskId={taskId} /> : null}
                 <Button
                   size="sm"
                   colorScheme="invokeBlue"
diff --git a/invokeai/frontend/web/src/features/prompt/ImageToPromptButton.tsx b/invokeai/frontend/web/src/features/prompt/ImageToPromptButton.tsx
index fb4dbb668c8..5818883c2c6 100644
--- a/invokeai/frontend/web/src/features/prompt/ImageToPromptButton.tsx
+++ b/invokeai/frontend/web/src/features/prompt/ImageToPromptButton.tsx
@@ -20,6 +20,7 @@ import { useImageUploadButton } from 'common/hooks/useImageUploadButton';
 import { positivePromptChanged, selectPositivePrompt } from 'features/controlLayers/store/paramsSlice';
 import { setInstallModelsTabByName } from 'features/modelManagerV2/store/installModelsStore';
 import { ModelPicker } from 'features/parameters/components/ModelPicker';
+import { LLMTaskProgressDisplay } from 'features/prompt/LLMTaskProgressDisplay';
 import { setPromptUndo } from 'features/prompt/promptUndo';
 import { navigationApi } from 'features/ui/layouts/navigation-api';
 import { memo, useCallback, useEffect, useState } from 'react';
@@ -28,6 +29,8 @@ import { PiImageBold } from 'react-icons/pi';
 import { useImageToPromptMutation } from 'services/api/endpoints/utilities';
 import { useLlavaModels } from 'services/api/hooks/modelsByType';
 import type { AnyModelConfig, ImageDTO } from 'services/api/types';
+import { clearLLMTaskState } from 'services/events/stores';
+import { v4 as uuidv4 } from 'uuid';
 
 const loadingStyles: SystemStyleObject = {
   svg: { animation: spinAnimation },
@@ -46,6 +49,7 @@ export const ImageToPromptButton = memo(({ droppedImage, onClearDroppedImage }:
   const popover = useDisclosure(false);
   const [selectedModel, setSelectedModel] = useState<AnyModelConfig | undefined>(undefined);
   const [uploadedImage, setUploadedImage] = useState<ImageDTO | undefined>(undefined);
+  const [taskId, setTaskId] = useState<string | null>(null);
   const [imageToPrompt, { isLoading }] = useImageToPromptMutation();
 
   const hasModels = modelConfigs.length > 0;
@@ -76,10 +80,13 @@ export const ImageToPromptButton = memo(({ droppedImage, onClearDroppedImage }:
     if (!selectedModel || !uploadedImage) {
       return;
     }
+    const newTaskId = uuidv4();
+    setTaskId(newTaskId);
     try {
       const result = await imageToPrompt({
         image_name: uploadedImage.image_name,
         model_key: selectedModel.key,
+        task_id: newTaskId,
       }).unwrap();
       if (result.prompt) {
         setPromptUndo(currentPrompt);
@@ -89,6 +96,9 @@ export const ImageToPromptButton = memo(({ droppedImage, onClearDroppedImage }:
       setUploadedImage(undefined);
     } catch {
       // Error is handled by RTK Query
+    } finally {
+      clearLLMTaskState(newTaskId);
+      setTaskId(null);
     }
   }, [selectedModel, uploadedImage, imageToPrompt, dispatch, popover, currentPrompt]);
 
@@ -157,6 +167,7 @@ export const ImageToPromptButton = memo(({ droppedImage, onClearDroppedImage }:
                     />
                   )}
                 </Flex>
+                {isLoading ? <LLMTaskProgressDisplay taskId={taskId} /> : null}
                 <Button
                   size="sm"
                   colorScheme="invokeBlue"
diff --git a/invokeai/frontend/web/src/features/prompt/LLMTaskProgressDisplay.tsx b/invokeai/frontend/web/src/features/prompt/LLMTaskProgressDisplay.tsx
new file mode 100644
index 00000000000..e548237d253
--- /dev/null
+++ b/invokeai/frontend/web/src/features/prompt/LLMTaskProgressDisplay.tsx
@@ -0,0 +1,51 @@
+import { Flex, Progress, Text } from '@invoke-ai/ui-library';
+import { useStore } from '@nanostores/react';
+import { round } from 'es-toolkit/compat';
+import { memo, useMemo } from 'react';
+import { useTranslation } from 'react-i18next';
+import { $llmTaskStates } from 'services/events/stores';
+
+type Props = { taskId: string | null };
+
+// Shows the stored state of one LLM task: an error text, or a phase label with a progress bar.
+export const LLMTaskProgressDisplay = memo(({ taskId }: Props) => {
+  const { t } = useTranslation();
+  const taskStates = useStore($llmTaskStates);
+  const taskState = useMemo(() => (taskId ? taskStates[taskId] : undefined), [taskStates, taskId]);
+
+  if (!taskId || !taskState || taskState.status === 'complete') {
+    return null;
+  }
+
+  if (taskState.status === 'error') {
+    return (
+      <Text fontSize="xs" color="error.300">
+        {taskState.error}
+      </Text>
+    );
+  }
+
+  const { phase, percentage, current_tokens, total_tokens } = taskState.payload;
+  const isLoadingModel = phase === 'loading_model';
+  const isIndeterminate = isLoadingModel || percentage === null;
+  // Scale the fraction to a percentage with one decimal place; 0 when no percentage was reported.
+  const pct = percentage === null ? 0 : round(percentage * 100, 1);
+
+  return (
+    <Flex flexDir="column" gap={1}>
+      <Flex justifyContent="space-between" alignItems="center">
+        <Text fontSize="xs" color="base.300">
+          {isLoadingModel ? t('prompt.llmTaskLoadingModel') : t('prompt.llmTaskGenerating')}
+        </Text>
+        {phase === 'generating' && current_tokens !== null && total_tokens !== null ? (
+          <Text fontSize="xs" color="base.400">
+            {current_tokens} / {total_tokens}
+          </Text>
+        ) : null}
+      </Flex>
+      <Progress size="xs" value={pct} isIndeterminate={isIndeterminate} colorScheme="invokeBlue" borderRadius="base" />
+    </Flex>
+  );
+});
+
+LLMTaskProgressDisplay.displayName = 'LLMTaskProgressDisplay';
diff --git a/invokeai/frontend/web/src/services/api/endpoints/utilities.ts b/invokeai/frontend/web/src/services/api/endpoints/utilities.ts
index 44e1bedcc26..239cf568ddd 100644
--- a/invokeai/frontend/web/src/services/api/endpoints/utilities.ts
+++ b/invokeai/frontend/web/src/services/api/endpoints/utilities.ts
@@ -17,6 +17,7 @@ type ExpandPromptRequest = {
   model_key: string;
   max_tokens?: number;
   system_prompt?: string | null;
+  task_id?: string | null;
 };
 
 type ExpandPromptResponse = {
@@ -28,6 +29,7 @@ type ImageToPromptRequest = {
   image_name: string;
   model_key: string;
   instruction?: string;
+  task_id?: string | null;
 };
 
 type ImageToPromptResponse = {
diff --git a/invokeai/frontend/web/src/services/api/schema.ts b/invokeai/frontend/web/src/services/api/schema.ts
index 7ca0f26fe9f..9d509099efd 100644
--- a/invokeai/frontend/web/src/services/api/schema.ts
+++ b/invokeai/frontend/web/src/services/api/schema.ts
@@ -9106,6 +9106,11 @@ export type components = {
             max_tokens?: number;
             /** System Prompt */
             system_prompt?: string | null;
+            /**
+             * Task Id
+             * @description Client-supplied task ID used to correlate socket progress events to this request
+             */
+            task_id?: string | null;
         };
         /** ExpandPromptResponse */
         ExpandPromptResponse: {
@@ -14915,6 +14920,11 @@ export type components = {
              * @default Describe this image in detail for use as an AI image generation prompt.
              */
             instruction?: string;
+            /**
+             * Task Id
+             * @description Client-supplied task ID used to correlate socket progress events to this request
+             */
+            task_id?: string | null;
         };
         /** ImageToPromptResponse */
         ImageToPromptResponse: {
@@ -17272,6 +17282,106 @@ export type components = {
             type: "iterate_output";
         };
         JsonValue: unknown;
+        /**
+         * LLMTaskCompleteEvent
+         * @description Event model for llm_task_complete
+         */
+        LLMTaskCompleteEvent: {
+            /**
+             * Timestamp
+             * @description The timestamp of the event
+             */
+            timestamp: number;
+            /**
+             * Task Id
+             * @description Client-supplied task ID correlating events to a single request
+             */
+            task_id: string;
+            /**
+             * User Id
+             * @description ID of the user who initiated the task
+             * @default system
+             */
+            user_id: string;
+        };
+        /**
+         * LLMTaskErrorEvent
+         * @description Event model for llm_task_error
+         */
+        LLMTaskErrorEvent: {
+            /**
+             * Timestamp
+             * @description The timestamp of the event
+             */
+            timestamp: number;
+            /**
+             * Task Id
+             * @description Client-supplied task ID correlating events to a single request
+             */
+            task_id: string;
+            /**
+             * User Id
+             * @description ID of the user who initiated the task
+             * @default system
+             */
+            user_id: string;
+            /**
+             * Error
+             * @description The error message
+             */
+            error: string;
+        };
+        /**
+         * LLMTaskProgressEvent
+         * @description Event model for llm_task_progress
+         */
+        LLMTaskProgressEvent: {
+            /**
+             * Timestamp
+             * @description The timestamp of the event
+             */
+            timestamp: number;
+            /**
+             * Task Id
+             * @description Client-supplied task ID correlating events to a single request
+             */
+            task_id: string;
+            /**
+             * User Id
+             * @description ID of the user who initiated the task
+             * @default system
+             */
+            user_id: string;
+            /**
+             * Phase
+             * @description Which phase of the task is in progress
+             * @enum {string}
+             */
+            phase: "loading_model" | "generating";
+            /**
+             * Message
+             * @description A short message describing the current phase
+             */
+            message: string;
+            /**
+             * Percentage
+             * @description Progress fraction in [0, 1]; omit for indeterminate progress
+             * @default null
+             */
+            percentage: number | null;
+            /**
+             * Current Tokens
+             * @description Number of tokens generated so far (generating phase)
+             * @default null
+             */
+            current_tokens: number | null;
+            /**
+             * Total Tokens
+             * @description Max tokens the request will generate (generating phase)
+             * @default null
+             */
+            total_tokens: number | null;
+        };
         /**
          * LaMa Infill
          * @description Infills transparent areas of an image using the LaMa model
diff --git a/invokeai/frontend/web/src/services/events/setEventListeners.tsx b/invokeai/frontend/web/src/services/events/setEventListeners.tsx
index 1e73abb2027..345ad556e96 100644
--- a/invokeai/frontend/web/src/services/events/setEventListeners.tsx
+++ b/invokeai/frontend/web/src/services/events/setEventListeners.tsx
@@ -53,7 +53,7 @@ import type { ClientToServerEvents, ServerToClientEvents } from 'services/events
 import type { Socket } from 'socket.io-client';
 import type { JsonObject } from 'type-fest';
 
-import { $lastProgressEvent, $loadingModelsCount } from './stores';
+import { $lastProgressEvent, $loadingModelsCount, setLLMTaskState } from './stores';
 
 const log = logger('events');
 
@@ -1017,4 +1017,19 @@ export const setEventListeners = ({ socket, store, setIsConnected }: SetEventLis
       duration: null,
     });
   });
+
+  socket.on('llm_task_progress', (data) => {
+    log.trace({ data } as JsonObject, 'LLM task progress');
+    setLLMTaskState(data.task_id, { status: 'progress', payload: data });
+  });
+
+  socket.on('llm_task_complete', (data) => {
+    log.trace({ data } as JsonObject, 'LLM task complete');
+    setLLMTaskState(data.task_id, { status: 'complete' });
+  });
+
+  socket.on('llm_task_error', (data) => {
+    log.warn({ data } as JsonObject, 'LLM task error');
+    setLLMTaskState(data.task_id, { status: 'error', error: data.error });
+  });
 };
diff --git a/invokeai/frontend/web/src/services/events/stores.ts b/invokeai/frontend/web/src/services/events/stores.ts
index 180f4a3a636..8b45d0a04ff 100644
--- a/invokeai/frontend/web/src/services/events/stores.ts
+++ b/invokeai/frontend/web/src/services/events/stores.ts
@@ -1,13 +1,37 @@
 import { round } from 'es-toolkit/compat';
 import { atom, computed } from 'nanostores';
 import type { S } from 'services/api/types';
-import type { AppSocket } from 'services/events/types';
+import type { AppSocket, LLMTaskProgressEventPayload } from 'services/events/types';
 
 export const $socket = atom<AppSocket | null>(null);
 export const $isConnected = atom<boolean>(false);
 export const $lastProgressEvent = atom<S['InvocationProgressEvent'] | null>(null);
 export const $loadingModelsCount = atom<number>(0);
 
+// LLM utility task progress (expand-prompt, image-to-prompt). Keyed by task_id so
+// concurrent tasks don't clobber each other. Components subscribe and read the
+// entry matching their current task.
+type LLMTaskState =
+  | { status: 'progress'; payload: LLMTaskProgressEventPayload }
+  | { status: 'complete' }
+  | { status: 'error'; error: string };
+
+export const $llmTaskStates = atom<Record<string, LLMTaskState>>({});
+
+// Stores `state` for `taskId` by setting the store to a copy of the current map with that entry replaced.
+export const setLLMTaskState = (taskId: string, state: LLMTaskState): void =>
+  $llmTaskStates.set({ ...$llmTaskStates.get(), [taskId]: state });
+
+// Removes the entry for `taskId`; when there is no such entry the store is not updated.
+export const clearLLMTaskState = (taskId: string): void => {
+  const states = $llmTaskStates.get();
+  if (taskId in states) {
+    const remaining = { ...states };
+    delete remaining[taskId];
+    $llmTaskStates.set(remaining);
+  }
+};
+
 export const $lastProgressMessage = computed($lastProgressEvent, (val) => {
   if (!val) {
     return null;
diff --git a/invokeai/frontend/web/src/services/events/types.ts b/invokeai/frontend/web/src/services/events/types.ts
index 8937dcc451d..05ddca5d7bf 100644
--- a/invokeai/frontend/web/src/services/events/types.ts
+++ b/invokeai/frontend/web/src/services/events/types.ts
@@ -6,6 +6,29 @@ type ClientEmitUnsubscribeQueue = ClientEmitSubscribeQueue;
 type ClientEmitSubscribeBulkDownload = { bulk_download_id: string };
 type ClientEmitUnsubscribeBulkDownload = ClientEmitSubscribeBulkDownload;
 
+// LLM utility task events (expand-prompt, image-to-prompt). Hand-typed; TODO: alias the generated schema types.
+// Fields shared by every LLM task event payload.
+type LLMTaskEventBase = {
+  task_id: string;
+  user_id: string;
+  timestamp: number;
+};
+
+// Progress payload: the task phase plus nullable percentage and token counts.
+export type LLMTaskProgressEventPayload = LLMTaskEventBase & {
+  phase: 'loading_model' | 'generating';
+  message: string;
+  percentage: number | null;
+  current_tokens: number | null;
+  total_tokens: number | null;
+};
+// Completion payload: only the shared fields.
+type LLMTaskCompleteEventPayload = LLMTaskEventBase;
+// Error payload: the shared fields plus the error message.
+type LLMTaskErrorEventPayload = LLMTaskEventBase & {
+  error: string;
+};
+
 export type ServerToClientEvents = {
   invocation_progress: (payload: S['InvocationProgressEvent']) => void;
   invocation_complete: (payload: S['InvocationCompleteEvent']) => void;
@@ -33,6 +56,9 @@ export type ServerToClientEvents = {
   bulk_download_started: (payload: S['BulkDownloadStartedEvent']) => void;
   bulk_download_complete: (payload: S['BulkDownloadCompleteEvent']) => void;
   bulk_download_error: (payload: S['BulkDownloadErrorEvent']) => void;
+  llm_task_progress: (payload: LLMTaskProgressEventPayload) => void;
+  llm_task_complete: (payload: LLMTaskCompleteEventPayload) => void;
+  llm_task_error: (payload: LLMTaskErrorEventPayload) => void;
 };
 
 export type ClientToServerEvents = {
diff --git a/tests/backend/text_llm/test_text_llm_pipeline.py b/tests/backend/text_llm/test_text_llm_pipeline.py
index 82dbd6c4f41..544374be328 100644
--- a/tests/backend/text_llm/test_text_llm_pipeline.py
+++ b/tests/backend/text_llm/test_text_llm_pipeline.py
@@ -1,6 +1,6 @@
 """Tests for the TextLLMPipeline class."""
 
-from unittest.mock import MagicMock
+from unittest.mock import MagicMock, patch
 
 import torch
 
@@ -16,23 +16,38 @@ def _make_mock_tokenizer(has_chat_template: bool = True) -> MagicMock:
     else:
         tokenizer.chat_template = None
 
-    # Simulate tokenizer __call__ returning dict with input_ids
     input_ids = torch.tensor([[1, 2, 3, 4, 5]])
     tokenizer_output = MagicMock()
     tokenizer_output.__getitem__ = lambda self, key: {"input_ids": input_ids}[key]
     tokenizer_output.to.return_value = tokenizer_output
     tokenizer.return_value = tokenizer_output
 
-    tokenizer.decode.return_value = "A detailed landscape with mountains"
+    # Token-counting for progress: pretend each accumulated string is N tokens long.
+    tokenizer.encode.return_value = [10, 11, 12]
     return tokenizer
 
 
 def _make_mock_model() -> MagicMock:
-    """Create a mock causal LM model."""
-    model = MagicMock()
-    # generate returns tensor that includes input + generated tokens
-    model.generate.return_value = torch.tensor([[1, 2, 3, 4, 5, 10, 11, 12]])
-    return model
+    return MagicMock()
+
+
+class FakeStreamer:
+    """Test double for TextIteratorStreamer: each iteration replays the preset text chunks in order."""
+
+    def __init__(self, chunks: list[str]):
+        self._texts = chunks
+
+    def __iter__(self):
+        yield from self._texts
+
+
+def _patch_streamer(chunks: list[str] | None = None):
+    """Patch the pipeline module's TextIteratorStreamer so that calling it returns a FakeStreamer."""
+    if chunks is None:
+        chunks = ["A detailed ", "landscape ", "with mountains"]
+    fake_streamer = FakeStreamer(chunks)
+    target = "invokeai.backend.text_llm_pipeline.TextIteratorStreamer"
+    return patch(target, return_value=fake_streamer)
 
 
 def test_pipeline_uses_chat_template_when_available():
@@ -41,7 +56,8 @@ def test_pipeline_uses_chat_template_when_available():
     model = _make_mock_model()
     pipeline = TextLLMPipeline(model, tokenizer)
 
-    pipeline.run(prompt="a cat", device=torch.device("cpu"), dtype=torch.float32)
+    with _patch_streamer():
+        pipeline.run(prompt="a cat", device=torch.device("cpu"), dtype=torch.float32)
 
     tokenizer.apply_chat_template.assert_called_once()
     call_args = tokenizer.apply_chat_template.call_args
@@ -56,10 +72,10 @@ def test_pipeline_fallback_without_chat_template():
     model = _make_mock_model()
     pipeline = TextLLMPipeline(model, tokenizer)
 
-    pipeline.run(prompt="a cat", system_prompt="Be helpful", device=torch.device("cpu"), dtype=torch.float32)
+    with _patch_streamer():
+        pipeline.run(prompt="a cat", system_prompt="Be helpful", device=torch.device("cpu"), dtype=torch.float32)
 
     tokenizer.apply_chat_template.assert_not_called()
-    # Check that the tokenizer was called with the fallback format
     call_args = tokenizer.call_args[0][0]
     assert "Be helpful" in call_args
     assert "a cat" in call_args
@@ -72,56 +88,62 @@ def test_pipeline_no_system_prompt():
     model = _make_mock_model()
     pipeline = TextLLMPipeline(model, tokenizer)
 
-    pipeline.run(prompt="a dog", system_prompt="", device=torch.device("cpu"), dtype=torch.float32)
+    with _patch_streamer():
+        pipeline.run(prompt="a dog", system_prompt="", device=torch.device("cpu"), dtype=torch.float32)
 
     call_args = tokenizer.apply_chat_template.call_args
     messages = call_args[0][0]
-    # No system message when system_prompt is empty
     assert not any(m["role"] == "system" for m in messages)
     assert any(m["role"] == "user" and m["content"] == "a dog" for m in messages)
 
 
-def test_pipeline_decodes_only_generated_tokens():
-    """Pipeline should strip input tokens and only decode newly generated ones."""
-    tokenizer = _make_mock_tokenizer(has_chat_template=True)
-    model = _make_mock_model()
-    pipeline = TextLLMPipeline(model, tokenizer)
-
-    pipeline.run(prompt="test", device=torch.device("cpu"), dtype=torch.float32)
-
-    # The mock model returns [1,2,3,4,5,10,11,12], input is [1,2,3,4,5]
-    # So decode should be called with [10, 11, 12]
-    decode_call = tokenizer.decode.call_args
-    decoded_tokens = decode_call[0][0]
-    assert decoded_tokens.tolist() == [10, 11, 12]
-    assert decode_call[1]["skip_special_tokens"] is True
-
-
 def test_pipeline_passes_generation_params():
-    """Pipeline should pass max_new_tokens and sampling params to model.generate."""
+    """Pipeline should pass max_new_tokens and sampling params to model.generate, plus a streamer."""
     tokenizer = _make_mock_tokenizer(has_chat_template=True)
     model = _make_mock_model()
     pipeline = TextLLMPipeline(model, tokenizer)
 
-    pipeline.run(prompt="test", max_new_tokens=100, device=torch.device("cpu"), dtype=torch.float32)
+    with _patch_streamer():
+        pipeline.run(prompt="test", max_new_tokens=100, device=torch.device("cpu"), dtype=torch.float32)
 
     generate_kwargs = model.generate.call_args[1]
     assert generate_kwargs["max_new_tokens"] == 100
     assert generate_kwargs["do_sample"] is True
     assert generate_kwargs["temperature"] == 0.7
     assert generate_kwargs["top_p"] == 0.9
+    assert "streamer" in generate_kwargs
 
 
-def test_pipeline_returns_stripped_string():
-    """Pipeline should return a stripped string from the decoded output."""
+def test_pipeline_returns_joined_streamed_chunks():
+    """Pipeline should return the concatenated, stripped streamer output."""
     tokenizer = _make_mock_tokenizer(has_chat_template=True)
-    tokenizer.decode.return_value = "  generated text with spaces  "
     model = _make_mock_model()
     pipeline = TextLLMPipeline(model, tokenizer)
 
-    result = pipeline.run(prompt="test", device=torch.device("cpu"), dtype=torch.float32)
+    with _patch_streamer(["  hello ", "world  "]):
+        result = pipeline.run(prompt="test", device=torch.device("cpu"), dtype=torch.float32)
+
+    assert result == "hello world"
 
-    assert result == "generated text with spaces"
+
+def test_pipeline_invokes_progress_callback_per_chunk():
+    """With three streamed chunks, progress_callback fires three times, each reporting total == max_new_tokens."""
+    pipeline = TextLLMPipeline(_make_mock_model(), _make_mock_tokenizer(has_chat_template=True))
+    reported: list[tuple[int, int]] = []
+
+    def record_progress(current: int, total: int) -> None:
+        reported.append((current, total))
+
+    with _patch_streamer(["a ", "b ", "c"]):
+        pipeline.run(
+            prompt="test",
+            max_new_tokens=50,
+            device=torch.device("cpu"),
+            dtype=torch.float32,
+            progress_callback=record_progress,
+        )
+
+    assert [total for _, total in reported] == [50, 50, 50]
 
 
 def test_default_system_prompt_content():