From ccdd5e86151ac2934389ca5d9d567adec0232d1b Mon Sep 17 00:00:00 2001
From: Quan Truong <quan@deepinfra.com>
Date: Sun, 5 Apr 2026 23:57:43 +0000
Subject: [PATCH 1/6] [None][feat] Add per-request stream_interval_ms support

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Signed-off-by: Quan Truong <quan@deepinfra.com>
---
 .../_torch/auto_deploy/shim/ad_executor.py    |  1 +
 tensorrt_llm/_torch/pyexecutor/llm_request.py |  4 +
 tensorrt_llm/_torch/pyexecutor/py_executor.py | 32 ++++---
 tensorrt_llm/executor/base_worker.py          |  1 +
 tensorrt_llm/llmapi/llm.py                    |  4 +
 tensorrt_llm/llmapi/llm_args.py               | 12 ++-
 tensorrt_llm/sampling_params.py               |  5 ++
 tensorrt_llm/serve/openai_protocol.py         | 24 +++++
 .../references_committed/llm.yaml             |  3 +
 .../references_committed/sampling_params.yaml |  3 +
 tests/unittest/llmapi/test_executor.py        | 89 +++++++++++++++++++
 11 files changed, 160 insertions(+), 18 deletions(-)

diff --git a/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py b/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py
index a4f59a27bcca..9b5cf7e3ca2c 100644
--- a/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py
+++ b/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py
@@ -493,6 +493,7 @@ def __init__(
         self.llm_args.enable_iter_perf_stats = reporting_info.enable_iter_perf_stats
         self.llm_args.enable_iter_req_stats = reporting_info.enable_iter_req_stats
         self.llm_args.stream_interval = 1
+        self.llm_args.stream_interval_ms = 0
         self.llm_args.attention_dp_config = None
         self.llm_args.batch_wait_timeout_ms = 0
         self.llm_args.batch_wait_timeout_iters = 0
diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py
index bbf373ec6f3b..ab5a7df0501e 100644
--- a/tensorrt_llm/_torch/pyexecutor/llm_request.py
+++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py
@@ -622,7 +622,9 @@ def __init__(
         self.py_rewind_draft_token_separate_adjustment = 0
         self.py_decoding_iter = 0
         self.py_stream_interval = None
+        self.py_stream_interval_ms = None
         self.py_last_stream_emit_time = None
+        self.py_iters_since_last_emit = 0
         self.is_attention_dp_dummy = False
         self.is_cuda_graph_dummy = False
         self.py_kv_transfer_start_time = None
@@ -930,6 +932,8 @@ def executor_request_to_llm_request(
 
     llm_request.py_stream_interval = getattr(executor_request,
                                              "py_stream_interval", None)
+    llm_request.py_stream_interval_ms = getattr(executor_request,
+                                                "py_stream_interval_ms", None)
     llm_request.py_disaggregated_params = getattr(executor_request,
                                                   "py_disaggregated_params",
                                                   None)
diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py
index d05cbbc427b6..93227f6fc5cd 100644
--- a/tensorrt_llm/_torch/pyexecutor/py_executor.py
+++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -335,7 +335,7 @@ def __init__(
         self.enable_iter_perf_stats = self.llm_args.enable_iter_perf_stats
         self.enable_iter_req_stats = self.llm_args.enable_iter_req_stats
         self.stream_interval = self.llm_args.stream_interval
-        self.stream_emit_interval_ms = self.llm_args.stream_emit_interval_ms
+        self.stream_interval_ms = self.llm_args.stream_interval_ms
         self.attention_dp_enable_balance = (
             self.llm_args.attention_dp_config is not None
             and self.llm_args.attention_dp_config.enable_balance)
@@ -3210,17 +3210,27 @@ def _handle_responses(self):
                 request.update_perf_metrics(self.iter_counter)
 
             request_done = False
-            now = get_steady_clock_now_in_seconds()
-            should_emit = (
-                request.py_decoding_iter == 1
-                or request.is_finished
-                or request.py_decoding_iter % (request.py_stream_interval or self.stream_interval) == 0
-                or (self.stream_emit_interval_ms > 0
-                    and request.py_last_stream_emit_time
-                    and (now - request.py_last_stream_emit_time) * 1000 >
-                    self.stream_emit_interval_ms)
-            )
+            request.py_last_stream_emit_iter += 1
+            now = time.monotonic()
+
+            # Resolve effective intervals (per-request > engine > default)
+            token_interval = request.py_stream_interval or self.stream_interval
+            time_interval_ms = request.py_stream_interval_ms or self.stream_interval_ms or 0
+
+            # Check triggers
+            token_triggered = request.py_last_stream_emit_iter >= token_interval
+            time_triggered = (
+                time_interval_ms > 0
+                and request.py_last_stream_emit_time is not None
+                and (now - request.py_last_stream_emit_time) * 1000
+                >= time_interval_ms)
+
+            should_emit = (request.py_decoding_iter == 1
+                           or request.is_finished or token_triggered
+                           or time_triggered)
+
             if should_emit:
+                request.py_last_stream_emit_iter = 0
                 request.py_last_stream_emit_time = now
                 response = request.create_response(False, self.dist.rank)
                 if response:
diff --git a/tensorrt_llm/executor/base_worker.py b/tensorrt_llm/executor/base_worker.py
index dad04c26ca53..4b5352ab23c6 100644
--- a/tensorrt_llm/executor/base_worker.py
+++ b/tensorrt_llm/executor/base_worker.py
@@ -568,6 +568,7 @@ def _deduce_max_tokens(request: GenerationRequest,
             executor_request.py_lora_path = py_lora_path
             executor_request.py_logprobs_mode = request.sampling_params.logprobs_mode
             executor_request.py_stream_interval = request.sampling_params.stream_interval
+            executor_request.py_stream_interval_ms = request.sampling_params.stream_interval_ms
 
             # here we add executor_request.py_disaggregated_params= request.disaggregated_params for python cache transceiver
             if self._is_pytorch_backend and request.disaggregated_params is not None:
diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py
index b1216afd7928..6ccadbba8757 100644
--- a/tensorrt_llm/llmapi/llm.py
+++ b/tensorrt_llm/llmapi/llm.py
@@ -789,6 +789,10 @@ def _prepare_sampling_params(
         if sampling_params.stream_interval is None:
             sampling_params.stream_interval = getattr(self.args,
                                                       "stream_interval", 1)
+        if sampling_params.stream_interval_ms is None:
+            engine_val = getattr(self.args, "stream_interval_ms", 0)
+            if engine_val > 0:
+                sampling_params.stream_interval_ms = engine_val
         sampling_params.return_perf_metrics = sampling_params.return_perf_metrics or self.args.return_perf_metrics
         return sampling_params
 
diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py
index d05f670cb494..b8705e599383 100644
--- a/tensorrt_llm/llmapi/llm_args.py
+++ b/tensorrt_llm/llmapi/llm_args.py
@@ -3003,11 +3003,13 @@ class TorchLlmArgs(BaseLlmArgs):
         "The iteration interval to create responses under the streaming mode. "
         "Set this to a larger value when the batch size is large, which helps reduce the streaming overhead.",
     )
-    stream_emit_interval_ms: int = Field(
+    stream_interval_ms: int = Field(
         default=0,
+        ge=0,
         description=
-        "The time interval (milliseconds) to create responses under the streaming mode. "
-        "Set to 0 to disable time-based throttling.",
+        "The time interval in milliseconds to create responses under the streaming mode. "
+        "Set to 0 to disable time-based streaming throttle. "
+        "When both stream_interval and stream_interval_ms are set, whichever triggers first will emit a response.",
     )
 
     force_dynamic_quantization: bool = Field(
@@ -3225,10 +3227,6 @@ def validate_stream_interval(self):
         if self.stream_interval <= 0:
             raise ValueError(
                 f"stream_interval must be positive, got {self.stream_interval}")
-        if self.stream_emit_interval_ms < 0:
-            raise ValueError(
-                "stream_emit_interval_ms must be non-negative, got "
-                f"{self.stream_emit_interval_ms}")
         return self
 
     @model_validator(mode="after")
diff --git a/tensorrt_llm/sampling_params.py b/tensorrt_llm/sampling_params.py
index cdad60846e25..f0cd30faf040 100644
--- a/tensorrt_llm/sampling_params.py
+++ b/tensorrt_llm/sampling_params.py
@@ -292,6 +292,7 @@ class SamplingParams:
     skip_special_tokens: bool = True
     spaces_between_special_tokens: bool = True
     stream_interval: Optional[int] = None
+    stream_interval_ms: Optional[int] = None
 
     def __post_init__(self):
         if self.pad_id is None:
@@ -319,6 +320,10 @@ def _validate(self):
             raise ValueError(
                 f"require stream_interval > 0, got stream_interval={self.stream_interval}"
             )
+        if self.stream_interval_ms is not None and self.stream_interval_ms <= 0:
+            raise ValueError(
+                f"require stream_interval_ms > 0, got stream_interval_ms={self.stream_interval_ms}"
+            )
         if self.top_p is not None and (self.top_p < 0 or self.top_p > 1):
             raise ValueError(f"require 0 <= top_p <= 1, got top_p={self.top_p}")
         if self.top_k is not None and self.top_k < 0:
diff --git a/tensorrt_llm/serve/openai_protocol.py b/tensorrt_llm/serve/openai_protocol.py
index 458f33a4b866..a3b4f52a0a2f 100644
--- a/tensorrt_llm/serve/openai_protocol.py
+++ b/tensorrt_llm/serve/openai_protocol.py
@@ -391,6 +391,13 @@ class CompletionRequest(OpenAIBaseModel):
             "The iteration interval to create responses under the streaming mode. "
             "If not set, the engine-level default is used."),
     )
+    stream_interval_ms: Optional[int] = Field(
+        default=None,
+        gt=0,
+        description=(
+            "The time interval in milliseconds to create responses under the streaming mode. "
+            "If not set, the engine-level default is used."),
+    )
 
     # doc: end-completion-extra-params
 
@@ -439,6 +446,7 @@ def to_sampling_params(self,
             # completion-extra-params
             add_special_tokens=self.add_special_tokens,
             stream_interval=self.stream_interval,
+            stream_interval_ms=self.stream_interval_ms,
 
             # TODO: migrate to use logprobs and prompt_logprobs
             _return_log_probs=bool(self.logprobs),
@@ -761,6 +769,13 @@ class ChatCompletionRequest(OpenAIBaseModel):
             "The iteration interval to create responses under the streaming mode. "
             "If not set, the engine-level default is used."),
     )
+    stream_interval_ms: Optional[int] = Field(
+        default=None,
+        gt=0,
+        description=(
+            "The time interval in milliseconds to create responses under the streaming mode. "
+            "If not set, the engine-level default is used."),
+    )
 
     # doc: end-chat-completion-extra-params
 
@@ -808,6 +823,7 @@ def to_sampling_params(self,
             # chat-completion-extra-params
             add_special_tokens=self.add_special_tokens,
             stream_interval=self.stream_interval,
+            stream_interval_ms=self.stream_interval_ms,
 
             # TODO: migrate to use logprobs and prompt_logprobs
             _return_log_probs=bool(self.logprobs),
@@ -926,6 +942,13 @@ class ResponsesRequest(OpenAIBaseModel):
             "The iteration interval to create responses under the streaming mode. "
             "If not set, the engine-level default is used."),
     )
+    stream_interval_ms: Optional[int] = Field(
+        default=None,
+        gt=0,
+        description=(
+            "The time interval in milliseconds to create responses under the streaming mode. "
+            "If not set, the engine-level default is used."),
+    )
 
     request_id: str = Field(
         default_factory=lambda: f"resp_{str(uuid.uuid4().hex)}",
@@ -972,6 +995,7 @@ def to_sampling_params(
             stop_token_ids=stop_token_ids,
             guided_decoding=guided_decoding,
             stream_interval=self.stream_interval,
+            stream_interval_ms=self.stream_interval_ms,
         )
 
     @model_validator(mode="before")
diff --git a/tests/unittest/api_stability/references_committed/llm.yaml b/tests/unittest/api_stability/references_committed/llm.yaml
index 16481bac6f3e..61d272b55449 100644
--- a/tests/unittest/api_stability/references_committed/llm.yaml
+++ b/tests/unittest/api_stability/references_committed/llm.yaml
@@ -93,6 +93,9 @@ methods:
       stream_interval:
         annotation: int
         default: 1
+      stream_interval_ms:
+        annotation: int
+        default: 0
 
       kwargs:
         annotation: Any
diff --git a/tests/unittest/api_stability/references_committed/sampling_params.yaml b/tests/unittest/api_stability/references_committed/sampling_params.yaml
index 2cc051e5d081..e98875723b85 100644
--- a/tests/unittest/api_stability/references_committed/sampling_params.yaml
+++ b/tests/unittest/api_stability/references_committed/sampling_params.yaml
@@ -126,6 +126,9 @@ methods:
       stream_interval:
         annotation: Optional[int]
         default: null
+      stream_interval_ms:
+        annotation: Optional[int]
+        default: null
       # Returning controls
       logprobs:
         annotation: Optional[int]
diff --git a/tests/unittest/llmapi/test_executor.py b/tests/unittest/llmapi/test_executor.py
index 9e329bbc1475..a9421aef0afa 100644
--- a/tests/unittest/llmapi/test_executor.py
+++ b/tests/unittest/llmapi/test_executor.py
@@ -247,6 +247,95 @@ def test_stream_interval_openai_protocol():
         ResponsesRequest(model="m", input="hi", stream_interval=-1)
 
 
+def test_stream_interval_ms_validation():
+    # Valid values
+    sp = SamplingParams(stream_interval_ms=100)
+    assert sp.stream_interval_ms == 100
+    sp = SamplingParams(stream_interval_ms=1)
+    assert sp.stream_interval_ms == 1
+    sp = SamplingParams(stream_interval_ms=None)
+    assert sp.stream_interval_ms is None
+
+    # Both can be set simultaneously
+    sp = SamplingParams(stream_interval=5, stream_interval_ms=100)
+    assert sp.stream_interval == 5
+    assert sp.stream_interval_ms == 100
+
+    # Invalid values
+    with pytest.raises(ValueError, match="stream_interval_ms"):
+        SamplingParams(stream_interval_ms=0)
+    with pytest.raises(ValueError, match="stream_interval_ms"):
+        SamplingParams(stream_interval_ms=-1)
+
+
+def test_stream_interval_ms_openai_protocol():
+    from pydantic import ValidationError
+
+    from tensorrt_llm.serve.openai_protocol import (
+        ChatCompletionRequest,
+        CompletionRequest,
+        ResponsesRequest,
+    )
+
+    # CompletionRequest: valid stream_interval_ms passes through
+    req = CompletionRequest(model="m", prompt="hi", stream_interval_ms=100)
+    sp = req.to_sampling_params()
+    assert sp.stream_interval_ms == 100
+
+    # CompletionRequest: None leaves stream_interval_ms unset
+    req = CompletionRequest(model="m", prompt="hi")
+    sp = req.to_sampling_params()
+    assert sp.stream_interval_ms is None
+
+    # CompletionRequest: invalid stream_interval_ms rejected at construction
+    with pytest.raises(ValidationError, match="stream_interval_ms"):
+        CompletionRequest(model="m", prompt="hi", stream_interval_ms=-1)
+    with pytest.raises(ValidationError, match="stream_interval_ms"):
+        CompletionRequest(model="m", prompt="hi", stream_interval_ms=0)
+
+    # ChatCompletionRequest: valid stream_interval_ms passes through
+    req = ChatCompletionRequest(model="m",
+                                messages=[{
+                                    "role": "user",
+                                    "content": "hi"
+                                }],
+                                stream_interval_ms=200)
+    sp = req.to_sampling_params()
+    assert sp.stream_interval_ms == 200
+
+    # ChatCompletionRequest: invalid stream_interval_ms rejected at construction
+    with pytest.raises(ValidationError, match="stream_interval_ms"):
+        ChatCompletionRequest(model="m",
+                              messages=[{
+                                  "role": "user",
+                                  "content": "hi"
+                              }],
+                              stream_interval_ms=0)
+
+    # ResponsesRequest: valid stream_interval_ms passes through
+    req = ResponsesRequest(model="m", input="hi", stream_interval_ms=50)
+    sp = req.to_sampling_params()
+    assert sp.stream_interval_ms == 50
+
+    # ResponsesRequest: None leaves stream_interval_ms unset
+    req = ResponsesRequest(model="m", input="hi")
+    sp = req.to_sampling_params()
+    assert sp.stream_interval_ms is None
+
+    # ResponsesRequest: invalid stream_interval_ms rejected at construction
+    with pytest.raises(ValidationError, match="stream_interval_ms"):
+        ResponsesRequest(model="m", input="hi", stream_interval_ms=-1)
+
+    # Both can be set together
+    req = CompletionRequest(model="m",
+                            prompt="hi",
+                            stream_interval=5,
+                            stream_interval_ms=100)
+    sp = req.to_sampling_params()
+    assert sp.stream_interval == 5
+    assert sp.stream_interval_ms == 100
+
+
 @pytest.mark.skipif(torch.cuda.device_count() < 2 or WORLD_SIZE != 2,
                     reason="Must run on 2 MPI ranks with at least 2 GPUs")
 def test_sync_generation_tp_main_node_only(llama_7b_tp2_path: Path):

From 253775fee816394912a2e93760f016fbf9500648 Mon Sep 17 00:00:00 2001
From: Quan Truong <quan@deepinfra.com>
Date: Sun, 5 Apr 2026 23:57:47 +0000
Subject: [PATCH 2/6] change variable name and use different time function

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Signed-off-by: Quan Truong <quan@deepinfra.com>
---
 tensorrt_llm/_torch/pyexecutor/llm_request.py | 2 +-
 tensorrt_llm/_torch/pyexecutor/py_executor.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py
index ab5a7df0501e..4a286a5adfc3 100644
--- a/tensorrt_llm/_torch/pyexecutor/llm_request.py
+++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py
@@ -624,7 +624,7 @@ def __init__(
         self.py_stream_interval = None
         self.py_stream_interval_ms = None
         self.py_last_stream_emit_time = None
-        self.py_iters_since_last_emit = 0
+        self.py_last_stream_emit_iter = 0
         self.is_attention_dp_dummy = False
         self.is_cuda_graph_dummy = False
         self.py_kv_transfer_start_time = None
diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py
index 93227f6fc5cd..db63e331fc2c 100644
--- a/tensorrt_llm/_torch/pyexecutor/py_executor.py
+++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -3211,7 +3211,7 @@ def _handle_responses(self):
 
             request_done = False
             request.py_last_stream_emit_iter += 1
-            now = time.monotonic()
+            now = get_steady_clock_now_in_seconds()
 
             # Resolve effective intervals (per-request > engine > default)
             token_interval = request.py_stream_interval or self.stream_interval

From 00e8cf794bd48a96f985acb18f479701614015b1 Mon Sep 17 00:00:00 2001
From: Quan Truong <quan@deepinfra.com>
Date: Sun, 5 Apr 2026 23:58:09 +0000
Subject: [PATCH 3/6] [None][fix] Use Field constraint for stream_interval
 validation and add type annotation

Replace model_validator with ge=1 on Field for stream_interval.
Add Optional[float] type annotation to py_last_stream_emit_time.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Signed-off-by: Quan Truong <quan@deepinfra.com>
---
 tensorrt_llm/_torch/pyexecutor/llm_request.py | 2 +-
 tensorrt_llm/llmapi/llm_args.py               | 8 +-------
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py
index 4a286a5adfc3..347f2eafdb7a 100644
--- a/tensorrt_llm/_torch/pyexecutor/llm_request.py
+++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py
@@ -623,7 +623,7 @@ def __init__(
         self.py_decoding_iter = 0
         self.py_stream_interval = None
         self.py_stream_interval_ms = None
-        self.py_last_stream_emit_time = None
+        self.py_last_stream_emit_time: Optional[float] = None
         self.py_last_stream_emit_iter = 0
         self.is_attention_dp_dummy = False
         self.is_cuda_graph_dummy = False
diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py
index b8705e599383..80505dced231 100644
--- a/tensorrt_llm/llmapi/llm_args.py
+++ b/tensorrt_llm/llmapi/llm_args.py
@@ -2999,6 +2999,7 @@ class TorchLlmArgs(BaseLlmArgs):
     # TODO: make this a per-request parameter
     stream_interval: int = Field(
         default=1,
+        ge=1,
         description=
         "The iteration interval to create responses under the streaming mode. "
         "Set this to a larger value when the batch size is large, which helps reduce the streaming overhead.",
@@ -3222,13 +3223,6 @@ def validate_speculative_config(self):
 
         return self
 
-    @model_validator(mode="after")
-    def validate_stream_interval(self):
-        if self.stream_interval <= 0:
-            raise ValueError(
-                f"stream_interval must be positive, got {self.stream_interval}")
-        return self
-
     @model_validator(mode="after")
     def validate_checkpoint_format(self):
         if self.checkpoint_format is not None and self.checkpoint_loader is not None:

From a71a3fcdf7e05726dae77056d02a15dc551d2e3e Mon Sep 17 00:00:00 2001
From: Quan Truong <quan@deepinfra.com>
Date: Tue, 7 Apr 2026 08:28:18 +0000
Subject: [PATCH 4/6] Fix toke_interval=1 issue

---
 tensorrt_llm/_torch/pyexecutor/py_executor.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py
index db63e331fc2c..dc44fc8cfc70 100644
--- a/tensorrt_llm/_torch/pyexecutor/py_executor.py
+++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -3218,7 +3218,9 @@ def _handle_responses(self):
             time_interval_ms = request.py_stream_interval_ms or self.stream_interval_ms or 0
 
             # Check triggers
-            token_triggered = request.py_last_stream_emit_iter >= token_interval
+            # If token_interval is 1 (default value), we will try to use time_interval_ms
+            token_triggered = (request.py_last_stream_emit_iter >= token_interval
+                              and not (token_interval == 1 and time_interval_ms > 0))
             time_triggered = (
                 time_interval_ms > 0
                 and request.py_last_stream_emit_time is not None

From 8ddbca2547194c5c4cfe098b06562e81e2f3d8a2 Mon Sep 17 00:00:00 2001
From: Quan Truong <quan@deepinfra.com>
Date: Tue, 7 Apr 2026 23:27:20 +0000
Subject: [PATCH 5/6] reformat

---
 tensorrt_llm/_torch/pyexecutor/py_executor.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py
index dc44fc8cfc70..9520da138fac 100644
--- a/tensorrt_llm/_torch/pyexecutor/py_executor.py
+++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -3220,12 +3220,11 @@ def _handle_responses(self):
             # Check triggers
             # If token_interval is 1 (default value), we will try to use time_interval_ms
             token_triggered = (request.py_last_stream_emit_iter >= token_interval
-                              and not (token_interval == 1 and time_interval_ms > 0))
-            time_triggered = (
-                time_interval_ms > 0
-                and request.py_last_stream_emit_time is not None
-                and (now - request.py_last_stream_emit_time) * 1000
-                >= time_interval_ms)
+                               and not (token_interval == 1 and time_interval_ms > 0))
+            time_triggered = (time_interval_ms > 0
+                              and request.py_last_stream_emit_time is not None
+                              and (now - request.py_last_stream_emit_time) * 1000
+                              >= time_interval_ms)
 
             should_emit = (request.py_decoding_iter == 1
                            or request.is_finished or token_triggered

From 07245111faafa46745566f24e3e2c6679de8a0b8 Mon Sep 17 00:00:00 2001
From: Quan Truong <quan@deepinfra.com>
Date: Tue, 7 Apr 2026 23:40:46 +0000
Subject: [PATCH 6/6] New logic

---
 tensorrt_llm/_torch/pyexecutor/py_executor.py | 19 +++++++++----------
 tensorrt_llm/llmapi/llm_args.py               |  2 +-
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py
index 9520da138fac..9d2cd5d35d14 100644
--- a/tensorrt_llm/_torch/pyexecutor/py_executor.py
+++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -3217,18 +3217,17 @@ def _handle_responses(self):
             token_interval = request.py_stream_interval or self.stream_interval
             time_interval_ms = request.py_stream_interval_ms or self.stream_interval_ms or 0
 
-            # Check triggers
-            # If token_interval is 1 (default value), we will try to use time_interval_ms
-            token_triggered = (request.py_last_stream_emit_iter >= token_interval
-                               and not (token_interval == 1 and time_interval_ms > 0))
-            time_triggered = (time_interval_ms > 0
-                              and request.py_last_stream_emit_time is not None
-                              and (now - request.py_last_stream_emit_time) * 1000
-                              >= time_interval_ms)
+            # Time interval takes priority; fall back to token interval
+            if time_interval_ms > 0:
+                interval_triggered = (
+                    request.py_last_stream_emit_time is not None
+                    and (now - request.py_last_stream_emit_time) * 1000
+                    >= time_interval_ms)
+            else:
+                interval_triggered = request.py_last_stream_emit_iter >= token_interval
 
             should_emit = (request.py_decoding_iter == 1
-                           or request.is_finished or token_triggered
-                           or time_triggered)
+                           or request.is_finished or interval_triggered)
 
             if should_emit:
                 request.py_last_stream_emit_iter = 0
diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py
index 80505dced231..04ccdc6feb93 100644
--- a/tensorrt_llm/llmapi/llm_args.py
+++ b/tensorrt_llm/llmapi/llm_args.py
@@ -3010,7 +3010,7 @@ class TorchLlmArgs(BaseLlmArgs):
         description=
         "The time interval in milliseconds to create responses under the streaming mode. "
         "Set to 0 to disable time-based streaming throttle. "
-        "When both stream_interval and stream_interval_ms are set, whichever triggers first will emit a response.",
+        "When stream_interval_ms is set (> 0), it takes priority over stream_interval.",
     )
 
     force_dynamic_quantization: bool = Field(