From 4a90e5d34b23f667b7baa2f6281069898d5988ad Mon Sep 17 00:00:00 2001
From: Taksh
Date: Thu, 9 Apr 2026 15:36:41 +0530
Subject: [PATCH] Fix off-by-one in completion_tokens count in generate_stream

The loop `for i in range(max_new_tokens)` appends a token before
yielding, but reports `completion_tokens: i` instead of `i + 1`. Since
i is 0-indexed and the token is already appended, this undercounts by 1
on every streaming chunk and on the final response.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 fastchat/serve/inference.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fastchat/serve/inference.py b/fastchat/serve/inference.py
index 6d155aab7..c76850fa0 100644
--- a/fastchat/serve/inference.py
+++ b/fastchat/serve/inference.py
@@ -279,8 +279,8 @@ def generate_stream(
                 "logprobs": ret_logprobs,
                 "usage": {
                     "prompt_tokens": input_echo_len,
-                    "completion_tokens": i,
-                    "total_tokens": input_echo_len + i,
+                    "completion_tokens": i + 1,
+                    "total_tokens": input_echo_len + i + 1,
                 },
                 "finish_reason": None,
             }
@@ -300,8 +300,8 @@ def generate_stream(
                 "logprobs": ret_logprobs,
                 "usage": {
                     "prompt_tokens": input_echo_len,
-                    "completion_tokens": i,
-                    "total_tokens": input_echo_len + i,
+                    "completion_tokens": i + 1,
+                    "total_tokens": input_echo_len + i + 1,
                 },
                 "finish_reason": finish_reason,
             }
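
As a sanity check for reviewers, here is a minimal standalone sketch of
the counting logic (the `stream_usage` helper below is hypothetical, not
FastChat's API). It mirrors the append-then-yield order of the loop, and
its assertions hold only with the corrected `i + 1` arithmetic:

    def stream_usage(max_new_tokens, input_echo_len):
        # Mirrors generate_stream's order: the step-i token is appended
        # first, so len(output_ids) == i + 1 when the usage dict is built.
        output_ids = []
        for i in range(max_new_tokens):
            output_ids.append(object())  # stand-in for the sampled token
            yield {
                "prompt_tokens": input_echo_len,
                "completion_tokens": i + 1,  # == len(output_ids)
                "total_tokens": input_echo_len + i + 1,
            }

    chunks = list(stream_usage(max_new_tokens=3, input_echo_len=10))
    # The final chunk accounts for all 3 generated tokens; the old
    # "completion_tokens": i would have reported 2 here.
    assert chunks[-1]["completion_tokens"] == 3
    assert chunks[-1]["total_tokens"] == 13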