From 4a90e5d34b23f667b7baa2f6281069898d5988ad Mon Sep 17 00:00:00 2001
From: Taksh
Date: Thu, 9 Apr 2026 15:36:41 +0530
Subject: [PATCH] Fix off-by-one in completion_tokens count in generate_stream

The loop `for i in range(max_new_tokens)` appends a token before
yielding, but reports `completion_tokens: i` instead of `i + 1`. Since
i is 0-indexed and the token is already appended, this undercounts by 1
on every streaming chunk and on the final response.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 fastchat/serve/inference.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fastchat/serve/inference.py b/fastchat/serve/inference.py
index 6d155aab7..c76850fa0 100644
--- a/fastchat/serve/inference.py
+++ b/fastchat/serve/inference.py
@@ -279,8 +279,8 @@ def generate_stream(
                 "logprobs": ret_logprobs,
                 "usage": {
                     "prompt_tokens": input_echo_len,
-                    "completion_tokens": i,
-                    "total_tokens": input_echo_len + i,
+                    "completion_tokens": i + 1,
+                    "total_tokens": input_echo_len + i + 1,
                 },
                 "finish_reason": None,
             }
@@ -300,8 +300,8 @@ def generate_stream(
                 "logprobs": ret_logprobs,
                 "usage": {
                     "prompt_tokens": input_echo_len,
-                    "completion_tokens": i,
-                    "total_tokens": input_echo_len + i,
+                    "completion_tokens": i + 1,
+                    "total_tokens": input_echo_len + i + 1,
                 },
                 "finish_reason": finish_reason,
             }
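
As a sanity check for reviewers, here is a minimal standalone sketch of
the counting logic (the `stream_usage` helper below is hypothetical, not
FastChat's API). It mirrors the append-then-yield order of the loop, and
its assertions hold only with the corrected `i + 1` arithmetic:

    def stream_usage(max_new_tokens, input_echo_len):
        # Mirrors generate_stream's order: the step-i token is appended
        # first, so len(output_ids) == i + 1 when the usage dict is built.
        output_ids = []
        for i in range(max_new_tokens):
            output_ids.append(object())  # stand-in for the sampled token
            yield {
                "prompt_tokens": input_echo_len,
                "completion_tokens": i + 1,  # == len(output_ids)
                "total_tokens": input_echo_len + i + 1,
            }

    chunks = list(stream_usage(max_new_tokens=3, input_echo_len=10))
    # The final chunk accounts for all 3 generated tokens; the old
    # "completion_tokens": i would have reported 2 here.
    assert chunks[-1]["completion_tokens"] == 3
    assert chunks[-1]["total_tokens"] == 13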