From ccdd5e86151ac2934389ca5d9d567adec0232d1b Mon Sep 17 00:00:00 2001 From: Quan Truong Date: Sun, 5 Apr 2026 23:57:43 +0000 Subject: [PATCH 1/6] [None][feat] Add per-request stream_interval_ms support Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Quan Truong --- .../_torch/auto_deploy/shim/ad_executor.py | 1 + tensorrt_llm/_torch/pyexecutor/llm_request.py | 4 + tensorrt_llm/_torch/pyexecutor/py_executor.py | 32 ++++--- tensorrt_llm/executor/base_worker.py | 1 + tensorrt_llm/llmapi/llm.py | 4 + tensorrt_llm/llmapi/llm_args.py | 12 ++- tensorrt_llm/sampling_params.py | 5 ++ tensorrt_llm/serve/openai_protocol.py | 24 +++++ .../references_committed/llm.yaml | 3 + .../references_committed/sampling_params.yaml | 3 + tests/unittest/llmapi/test_executor.py | 89 +++++++++++++++++++ 11 files changed, 160 insertions(+), 18 deletions(-) diff --git a/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py b/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py index a4f59a27bcca..9b5cf7e3ca2c 100644 --- a/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py +++ b/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py @@ -493,6 +493,7 @@ def __init__( self.llm_args.enable_iter_perf_stats = reporting_info.enable_iter_perf_stats self.llm_args.enable_iter_req_stats = reporting_info.enable_iter_req_stats self.llm_args.stream_interval = 1 + self.llm_args.stream_interval_ms = 0 self.llm_args.attention_dp_config = None self.llm_args.batch_wait_timeout_ms = 0 self.llm_args.batch_wait_timeout_iters = 0 diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index bbf373ec6f3b..ab5a7df0501e 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -622,7 +622,9 @@ def __init__( self.py_rewind_draft_token_separate_adjustment = 0 self.py_decoding_iter = 0 self.py_stream_interval = None + self.py_stream_interval_ms = None self.py_last_stream_emit_time = None + self.py_iters_since_last_emit = 0 self.is_attention_dp_dummy = False self.is_cuda_graph_dummy = False self.py_kv_transfer_start_time = None @@ -930,6 +932,8 @@ def executor_request_to_llm_request( llm_request.py_stream_interval = getattr(executor_request, "py_stream_interval", None) + llm_request.py_stream_interval_ms = getattr(executor_request, + "py_stream_interval_ms", None) llm_request.py_disaggregated_params = getattr(executor_request, "py_disaggregated_params", None) diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index d05cbbc427b6..93227f6fc5cd 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -335,7 +335,7 @@ def __init__( self.enable_iter_perf_stats = self.llm_args.enable_iter_perf_stats self.enable_iter_req_stats = self.llm_args.enable_iter_req_stats self.stream_interval = self.llm_args.stream_interval - self.stream_emit_interval_ms = self.llm_args.stream_emit_interval_ms + self.stream_interval_ms = self.llm_args.stream_interval_ms self.attention_dp_enable_balance = ( self.llm_args.attention_dp_config is not None and self.llm_args.attention_dp_config.enable_balance) @@ -3210,17 +3210,27 @@ def _handle_responses(self): request.update_perf_metrics(self.iter_counter) request_done = False - now = get_steady_clock_now_in_seconds() - should_emit = ( - request.py_decoding_iter == 1 - or request.is_finished - or request.py_decoding_iter % (request.py_stream_interval or self.stream_interval) == 0 - or (self.stream_emit_interval_ms > 0 - and request.py_last_stream_emit_time - and (now - request.py_last_stream_emit_time) * 1000 > - self.stream_emit_interval_ms) - ) + request.py_last_stream_emit_iter += 1 + now = time.monotonic() + + # Resolve effective intervals (per-request > engine > default) + token_interval = request.py_stream_interval or self.stream_interval + time_interval_ms = request.py_stream_interval_ms or self.stream_interval_ms or 0 + + # Check triggers + token_triggered = request.py_last_stream_emit_iter >= token_interval + time_triggered = ( + time_interval_ms > 0 + and request.py_last_stream_emit_time is not None + and (now - request.py_last_stream_emit_time) * 1000 + >= time_interval_ms) + + should_emit = (request.py_decoding_iter == 1 + or request.is_finished or token_triggered + or time_triggered) + if should_emit: + request.py_last_stream_emit_iter = 0 request.py_last_stream_emit_time = now response = request.create_response(False, self.dist.rank) if response: diff --git a/tensorrt_llm/executor/base_worker.py b/tensorrt_llm/executor/base_worker.py index dad04c26ca53..4b5352ab23c6 100644 --- a/tensorrt_llm/executor/base_worker.py +++ b/tensorrt_llm/executor/base_worker.py @@ -568,6 +568,7 @@ def _deduce_max_tokens(request: GenerationRequest, executor_request.py_lora_path = py_lora_path executor_request.py_logprobs_mode = request.sampling_params.logprobs_mode executor_request.py_stream_interval = request.sampling_params.stream_interval + executor_request.py_stream_interval_ms = request.sampling_params.stream_interval_ms # here we add executor_request.py_disaggregated_params= request.disaggregated_params for python cache transceiver if self._is_pytorch_backend and request.disaggregated_params is not None: diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py index b1216afd7928..6ccadbba8757 100644 --- a/tensorrt_llm/llmapi/llm.py +++ b/tensorrt_llm/llmapi/llm.py @@ -789,6 +789,10 @@ def _prepare_sampling_params( if sampling_params.stream_interval is None: sampling_params.stream_interval = getattr(self.args, "stream_interval", 1) + if sampling_params.stream_interval_ms is None: + engine_val = getattr(self.args, "stream_interval_ms", 0) + if engine_val > 0: + sampling_params.stream_interval_ms = engine_val sampling_params.return_perf_metrics = sampling_params.return_perf_metrics or self.args.return_perf_metrics return sampling_params diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index d05f670cb494..b8705e599383 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -3003,11 +3003,13 @@ class TorchLlmArgs(BaseLlmArgs): "The iteration interval to create responses under the streaming mode. " "Set this to a larger value when the batch size is large, which helps reduce the streaming overhead.", ) - stream_emit_interval_ms: int = Field( + stream_interval_ms: int = Field( default=0, + ge=0, description= - "The time interval (milliseconds) to create responses under the streaming mode. " - "Set to 0 to disable time-based throttling.", + "The time interval in milliseconds to create responses under the streaming mode. " + "Set to 0 to disable time-based streaming throttle. " + "When both stream_interval and stream_interval_ms are set, whichever triggers first will emit a response.", ) force_dynamic_quantization: bool = Field( @@ -3225,10 +3227,6 @@ def validate_stream_interval(self): if self.stream_interval <= 0: raise ValueError( f"stream_interval must be positive, got {self.stream_interval}") - if self.stream_emit_interval_ms < 0: - raise ValueError( - "stream_emit_interval_ms must be non-negative, got " - f"{self.stream_emit_interval_ms}") return self @model_validator(mode="after") diff --git a/tensorrt_llm/sampling_params.py b/tensorrt_llm/sampling_params.py index cdad60846e25..f0cd30faf040 100644 --- a/tensorrt_llm/sampling_params.py +++ b/tensorrt_llm/sampling_params.py @@ -292,6 +292,7 @@ class SamplingParams: skip_special_tokens: bool = True spaces_between_special_tokens: bool = True stream_interval: Optional[int] = None + stream_interval_ms: Optional[int] = None def __post_init__(self): if self.pad_id is None: @@ -319,6 +320,10 @@ def _validate(self): raise ValueError( f"require stream_interval > 0, got stream_interval={self.stream_interval}" ) + if self.stream_interval_ms is not None and self.stream_interval_ms <= 0: + raise ValueError( + f"require stream_interval_ms > 0, got stream_interval_ms={self.stream_interval_ms}" + ) if self.top_p is not None and (self.top_p < 0 or self.top_p > 1): raise ValueError(f"require 0 <= top_p <= 1, got top_p={self.top_p}") if self.top_k is not None and self.top_k < 0: diff --git a/tensorrt_llm/serve/openai_protocol.py b/tensorrt_llm/serve/openai_protocol.py index 458f33a4b866..a3b4f52a0a2f 100644 --- a/tensorrt_llm/serve/openai_protocol.py +++ b/tensorrt_llm/serve/openai_protocol.py @@ -391,6 +391,13 @@ class CompletionRequest(OpenAIBaseModel): "The iteration interval to create responses under the streaming mode. " "If not set, the engine-level default is used."), ) + stream_interval_ms: Optional[int] = Field( + default=None, + gt=0, + description=( + "The time interval in milliseconds to create responses under the streaming mode. " + "If not set, the engine-level default is used."), + ) # doc: end-completion-extra-params @@ -439,6 +446,7 @@ def to_sampling_params(self, # completion-extra-params add_special_tokens=self.add_special_tokens, stream_interval=self.stream_interval, + stream_interval_ms=self.stream_interval_ms, # TODO: migrate to use logprobs and prompt_logprobs _return_log_probs=bool(self.logprobs), @@ -761,6 +769,13 @@ class ChatCompletionRequest(OpenAIBaseModel): "The iteration interval to create responses under the streaming mode. " "If not set, the engine-level default is used."), ) + stream_interval_ms: Optional[int] = Field( + default=None, + gt=0, + description=( + "The time interval in milliseconds to create responses under the streaming mode. " + "If not set, the engine-level default is used."), + ) # doc: end-chat-completion-extra-params @@ -808,6 +823,7 @@ def to_sampling_params(self, # chat-completion-extra-params add_special_tokens=self.add_special_tokens, stream_interval=self.stream_interval, + stream_interval_ms=self.stream_interval_ms, # TODO: migrate to use logprobs and prompt_logprobs _return_log_probs=bool(self.logprobs), @@ -926,6 +942,13 @@ class ResponsesRequest(OpenAIBaseModel): "The iteration interval to create responses under the streaming mode. " "If not set, the engine-level default is used."), ) + stream_interval_ms: Optional[int] = Field( + default=None, + gt=0, + description=( + "The time interval in milliseconds to create responses under the streaming mode. " + "If not set, the engine-level default is used."), + ) request_id: str = Field( default_factory=lambda: f"resp_{str(uuid.uuid4().hex)}", @@ -972,6 +995,7 @@ def to_sampling_params( stop_token_ids=stop_token_ids, guided_decoding=guided_decoding, stream_interval=self.stream_interval, + stream_interval_ms=self.stream_interval_ms, ) @model_validator(mode="before") diff --git a/tests/unittest/api_stability/references_committed/llm.yaml b/tests/unittest/api_stability/references_committed/llm.yaml index 16481bac6f3e..61d272b55449 100644 --- a/tests/unittest/api_stability/references_committed/llm.yaml +++ b/tests/unittest/api_stability/references_committed/llm.yaml @@ -93,6 +93,9 @@ methods: stream_interval: annotation: int default: 1 + stream_interval_ms: + annotation: int + default: 0 kwargs: annotation: Any diff --git a/tests/unittest/api_stability/references_committed/sampling_params.yaml b/tests/unittest/api_stability/references_committed/sampling_params.yaml index 2cc051e5d081..e98875723b85 100644 --- a/tests/unittest/api_stability/references_committed/sampling_params.yaml +++ b/tests/unittest/api_stability/references_committed/sampling_params.yaml @@ -126,6 +126,9 @@ methods: stream_interval: annotation: Optional[int] default: null + stream_interval_ms: + annotation: Optional[int] + default: null # Returning controls logprobs: annotation: Optional[int] diff --git a/tests/unittest/llmapi/test_executor.py b/tests/unittest/llmapi/test_executor.py index 9e329bbc1475..a9421aef0afa 100644 --- a/tests/unittest/llmapi/test_executor.py +++ b/tests/unittest/llmapi/test_executor.py @@ -247,6 +247,95 @@ def test_stream_interval_openai_protocol(): ResponsesRequest(model="m", input="hi", stream_interval=-1) +def test_stream_interval_ms_validation(): + # Valid values + sp = SamplingParams(stream_interval_ms=100) + assert sp.stream_interval_ms == 100 + sp = SamplingParams(stream_interval_ms=1) + assert sp.stream_interval_ms == 1 + sp = SamplingParams(stream_interval_ms=None) + assert sp.stream_interval_ms is None + + # Both can be set simultaneously + sp = SamplingParams(stream_interval=5, stream_interval_ms=100) + assert sp.stream_interval == 5 + assert sp.stream_interval_ms == 100 + + # Invalid values + with pytest.raises(ValueError, match="stream_interval_ms"): + SamplingParams(stream_interval_ms=0) + with pytest.raises(ValueError, match="stream_interval_ms"): + SamplingParams(stream_interval_ms=-1) + + +def test_stream_interval_ms_openai_protocol(): + from pydantic import ValidationError + + from tensorrt_llm.serve.openai_protocol import ( + ChatCompletionRequest, + CompletionRequest, + ResponsesRequest, + ) + + # CompletionRequest: valid stream_interval_ms passes through + req = CompletionRequest(model="m", prompt="hi", stream_interval_ms=100) + sp = req.to_sampling_params() + assert sp.stream_interval_ms == 100 + + # CompletionRequest: None leaves stream_interval_ms unset + req = CompletionRequest(model="m", prompt="hi") + sp = req.to_sampling_params() + assert sp.stream_interval_ms is None + + # CompletionRequest: invalid stream_interval_ms rejected at construction + with pytest.raises(ValidationError, match="stream_interval_ms"): + CompletionRequest(model="m", prompt="hi", stream_interval_ms=-1) + with pytest.raises(ValidationError, match="stream_interval_ms"): + CompletionRequest(model="m", prompt="hi", stream_interval_ms=0) + + # ChatCompletionRequest: valid stream_interval_ms passes through + req = ChatCompletionRequest(model="m", + messages=[{ + "role": "user", + "content": "hi" + }], + stream_interval_ms=200) + sp = req.to_sampling_params() + assert sp.stream_interval_ms == 200 + + # ChatCompletionRequest: invalid stream_interval_ms rejected at construction + with pytest.raises(ValidationError, match="stream_interval_ms"): + ChatCompletionRequest(model="m", + messages=[{ + "role": "user", + "content": "hi" + }], + stream_interval_ms=0) + + # ResponsesRequest: valid stream_interval_ms passes through + req = ResponsesRequest(model="m", input="hi", stream_interval_ms=50) + sp = req.to_sampling_params() + assert sp.stream_interval_ms == 50 + + # ResponsesRequest: None leaves stream_interval_ms unset + req = ResponsesRequest(model="m", input="hi") + sp = req.to_sampling_params() + assert sp.stream_interval_ms is None + + # ResponsesRequest: invalid stream_interval_ms rejected at construction + with pytest.raises(ValidationError, match="stream_interval_ms"): + ResponsesRequest(model="m", input="hi", stream_interval_ms=-1) + + # Both can be set together + req = CompletionRequest(model="m", + prompt="hi", + stream_interval=5, + stream_interval_ms=100) + sp = req.to_sampling_params() + assert sp.stream_interval == 5 + assert sp.stream_interval_ms == 100 + + @pytest.mark.skipif(torch.cuda.device_count() < 2 or WORLD_SIZE != 2, reason="Must run on 2 MPI ranks with at least 2 GPUs") def test_sync_generation_tp_main_node_only(llama_7b_tp2_path: Path): From 253775fee816394912a2e93760f016fbf9500648 Mon Sep 17 00:00:00 2001 From: Quan Truong Date: Sun, 5 Apr 2026 23:57:47 +0000 Subject: [PATCH 2/6] change variable name and use different time function Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Quan Truong --- tensorrt_llm/_torch/pyexecutor/llm_request.py | 2 +- tensorrt_llm/_torch/pyexecutor/py_executor.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index ab5a7df0501e..4a286a5adfc3 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -624,7 +624,7 @@ def __init__( self.py_stream_interval = None self.py_stream_interval_ms = None self.py_last_stream_emit_time = None - self.py_iters_since_last_emit = 0 + self.py_last_stream_emit_iter = 0 self.is_attention_dp_dummy = False self.is_cuda_graph_dummy = False self.py_kv_transfer_start_time = None diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 93227f6fc5cd..db63e331fc2c 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -3211,7 +3211,7 @@ def _handle_responses(self): request_done = False request.py_last_stream_emit_iter += 1 - now = time.monotonic() + now = get_steady_clock_now_in_seconds() # Resolve effective intervals (per-request > engine > default) token_interval = request.py_stream_interval or self.stream_interval From 00e8cf794bd48a96f985acb18f479701614015b1 Mon Sep 17 00:00:00 2001 From: Quan Truong Date: Sun, 5 Apr 2026 23:58:09 +0000 Subject: [PATCH 3/6] [None][fix] Use Field constraint for stream_interval validation and add type annotation Replace model_validator with ge=1 on Field for stream_interval. Add Optional[float] type annotation to py_last_stream_emit_time. Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Quan Truong --- tensorrt_llm/_torch/pyexecutor/llm_request.py | 2 +- tensorrt_llm/llmapi/llm_args.py | 8 +------- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index 4a286a5adfc3..347f2eafdb7a 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -623,7 +623,7 @@ def __init__( self.py_decoding_iter = 0 self.py_stream_interval = None self.py_stream_interval_ms = None - self.py_last_stream_emit_time = None + self.py_last_stream_emit_time: Optional[float] = None self.py_last_stream_emit_iter = 0 self.is_attention_dp_dummy = False self.is_cuda_graph_dummy = False diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index b8705e599383..80505dced231 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -2999,6 +2999,7 @@ class TorchLlmArgs(BaseLlmArgs): # TODO: make this a per-request parameter stream_interval: int = Field( default=1, + ge=1, description= "The iteration interval to create responses under the streaming mode. " "Set this to a larger value when the batch size is large, which helps reduce the streaming overhead.", @@ -3222,13 +3223,6 @@ def validate_speculative_config(self): return self - @model_validator(mode="after") - def validate_stream_interval(self): - if self.stream_interval <= 0: - raise ValueError( - f"stream_interval must be positive, got {self.stream_interval}") - return self - @model_validator(mode="after") def validate_checkpoint_format(self): if self.checkpoint_format is not None and self.checkpoint_loader is not None: From a71a3fcdf7e05726dae77056d02a15dc551d2e3e Mon Sep 17 00:00:00 2001 From: Quan Truong Date: Tue, 7 Apr 2026 08:28:18 +0000 Subject: [PATCH 4/6] Fix toke_interval=1 issue --- tensorrt_llm/_torch/pyexecutor/py_executor.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index db63e331fc2c..dc44fc8cfc70 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -3218,7 +3218,9 @@ def _handle_responses(self): time_interval_ms = request.py_stream_interval_ms or self.stream_interval_ms or 0 # Check triggers - token_triggered = request.py_last_stream_emit_iter >= token_interval + # If token_interval is 1 (default value), we will try to use time_interval_ms + token_triggered = (request.py_last_stream_emit_iter >= token_interval + and not (token_interval == 1 and time_interval_ms > 0)) time_triggered = ( time_interval_ms > 0 and request.py_last_stream_emit_time is not None From 8ddbca2547194c5c4cfe098b06562e81e2f3d8a2 Mon Sep 17 00:00:00 2001 From: Quan Truong Date: Tue, 7 Apr 2026 23:27:20 +0000 Subject: [PATCH 5/6] reformat --- tensorrt_llm/_torch/pyexecutor/py_executor.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index dc44fc8cfc70..9520da138fac 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -3220,12 +3220,11 @@ def _handle_responses(self): # Check triggers # If token_interval is 1 (default value), we will try to use time_interval_ms token_triggered = (request.py_last_stream_emit_iter >= token_interval - and not (token_interval == 1 and time_interval_ms > 0)) - time_triggered = ( - time_interval_ms > 0 - and request.py_last_stream_emit_time is not None - and (now - request.py_last_stream_emit_time) * 1000 - >= time_interval_ms) + and not (token_interval == 1 and time_interval_ms > 0)) + time_triggered = (time_interval_ms > 0 + and request.py_last_stream_emit_time is not None + and (now - request.py_last_stream_emit_time) * 1000 + >= time_interval_ms) should_emit = (request.py_decoding_iter == 1 or request.is_finished or token_triggered From 07245111faafa46745566f24e3e2c6679de8a0b8 Mon Sep 17 00:00:00 2001 From: Quan Truong Date: Tue, 7 Apr 2026 23:40:46 +0000 Subject: [PATCH 6/6] New logic --- tensorrt_llm/_torch/pyexecutor/py_executor.py | 19 +++++++++---------- tensorrt_llm/llmapi/llm_args.py | 2 +- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 9520da138fac..9d2cd5d35d14 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -3217,18 +3217,17 @@ def _handle_responses(self): token_interval = request.py_stream_interval or self.stream_interval time_interval_ms = request.py_stream_interval_ms or self.stream_interval_ms or 0 - # Check triggers - # If token_interval is 1 (default value), we will try to use time_interval_ms - token_triggered = (request.py_last_stream_emit_iter >= token_interval - and not (token_interval == 1 and time_interval_ms > 0)) - time_triggered = (time_interval_ms > 0 - and request.py_last_stream_emit_time is not None - and (now - request.py_last_stream_emit_time) * 1000 - >= time_interval_ms) + # Time interval takes priority; fall back to token interval + if time_interval_ms > 0: + interval_triggered = ( + request.py_last_stream_emit_time is not None + and (now - request.py_last_stream_emit_time) * 1000 + >= time_interval_ms) + else: + interval_triggered = request.py_last_stream_emit_iter >= token_interval should_emit = (request.py_decoding_iter == 1 - or request.is_finished or token_triggered - or time_triggered) + or request.is_finished or interval_triggered) if should_emit: request.py_last_stream_emit_iter = 0 diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index 80505dced231..04ccdc6feb93 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -3010,7 +3010,7 @@ class TorchLlmArgs(BaseLlmArgs): description= "The time interval in milliseconds to create responses under the streaming mode. " "Set to 0 to disable time-based streaming throttle. " - "When both stream_interval and stream_interval_ms are set, whichever triggers first will emit a response.", + "When stream_interval_ms is set (> 0), it takes priority over stream_interval.", ) force_dynamic_quantization: bool = Field(