Skip to content

Commit f42844b

Browse files
kirklandsign authored and meta-codesync[bot] committed
Fix dead condition, division-by-zero, and uninitialized members in LLM stats (#18819)
Summary: Pull Request resolved: #18819 Three issues fixed: 1. text_llm_runner.cpp: The condition num_generated_tokens == max_new_tokens was always false because TextTokenGenerator::generate() receives max_new_tokens - 1. Fixed to compare against max_new_tokens - 1. 2. stats.h print_report(): Division by zero when inference/prefill/decode time is zero (e.g., during very fast warmup runs). Added guards matching the pattern already used in stats_to_json_string(). 3. stats.h Stats: Added default initializers (= 0) to all timestamp and counter members to prevent undefined behavior from uninitialized reads. Reviewed By: manuelcandales Differential Revision: D99708774
1 parent 5e8a0df commit f42844b

File tree

2 files changed

+21
-20
lines changed

2 files changed

+21
-20
lines changed

extension/llm/runner/stats.h

Lines changed: 20 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -23,32 +23,32 @@ struct ET_EXPERIMENTAL Stats {
2323
const long SCALING_FACTOR_UNITS_PER_SECOND = 1000;
2424
// Time stamps for the different stages of the execution
2525
// model_load_start_ms: Start of model loading.
26-
long model_load_start_ms;
26+
long model_load_start_ms = 0;
2727
// model_load_end_ms: End of model loading.
28-
long model_load_end_ms;
28+
long model_load_end_ms = 0;
2929
// inference_start_ms: Immediately after the model is loaded (or we check
3030
// for model load), measure the inference time.
3131
// NOTE: It's actually the tokenizer encode + model execution time.
32-
long inference_start_ms;
32+
long inference_start_ms = 0;
3333
// End of the tokenizer encode time.
34-
long token_encode_end_ms;
34+
long token_encode_end_ms = 0;
3535
// Start of the model execution (forward function) time.
36-
long model_execution_start_ms;
36+
long model_execution_start_ms = 0;
3737
// End of the model execution (forward function) time.
38-
long model_execution_end_ms;
38+
long model_execution_end_ms = 0;
3939
// prompt_eval_end_ms: Prompt array allocation and tokenization. Ends right
4040
// before the inference loop starts
41-
long prompt_eval_end_ms;
41+
long prompt_eval_end_ms = 0;
4242
// first_token: Timestamp when the first generated token is emitted
43-
long first_token_ms;
43+
long first_token_ms = 0;
4444
// inference_end_ms: End of inference/generation.
45-
long inference_end_ms;
45+
long inference_end_ms = 0;
4646
// Keep a running total of the time spent in sampling.
4747
long aggregate_sampling_time_ms = 0;
4848
// Token count from prompt
49-
int64_t num_prompt_tokens;
49+
int64_t num_prompt_tokens = 0;
5050
// Token count from generated (total - prompt)
51-
int64_t num_generated_tokens;
51+
int64_t num_generated_tokens = 0;
5252
// GPU memory stats (optional; may be zero if not available)
5353
// GPU memory stats (optional). Use sentinel UINT64_MAX / -1.0 to indicate
5454
// "not available".
@@ -171,18 +171,18 @@ inline void print_report(const Stats& stats) {
171171
Info,
172172
"\tTotal inference time:\t\t%f (seconds)\t\t Rate: \t%f (tokens/second)",
173173
inference_time_ms / stats.SCALING_FACTOR_UNITS_PER_SECOND,
174-
175-
(stats.num_generated_tokens) /
176-
(double)(stats.inference_end_ms - stats.inference_start_ms) *
177-
stats.SCALING_FACTOR_UNITS_PER_SECOND);
174+
inference_time_ms > 0 ? (stats.num_generated_tokens) / inference_time_ms *
175+
stats.SCALING_FACTOR_UNITS_PER_SECOND
176+
: 0.0);
178177
double prompt_eval_time =
179178
(double)(stats.prompt_eval_end_ms - stats.inference_start_ms);
180179
ET_LOG(
181180
Info,
182181
"\t\tPrompt evaluation:\t%f (seconds)\t\t Rate: \t%f (tokens/second)",
183182
prompt_eval_time / stats.SCALING_FACTOR_UNITS_PER_SECOND,
184-
(stats.num_prompt_tokens) / prompt_eval_time *
185-
stats.SCALING_FACTOR_UNITS_PER_SECOND);
183+
prompt_eval_time > 0 ? (stats.num_prompt_tokens) / prompt_eval_time *
184+
stats.SCALING_FACTOR_UNITS_PER_SECOND
185+
: 0.0);
186186

187187
double eval_time =
188188
(double)(stats.inference_end_ms - stats.prompt_eval_end_ms);
@@ -192,8 +192,9 @@ inline void print_report(const Stats& stats) {
192192
" tokens:\t%f (seconds)\t\t Rate: \t%f (tokens/second)",
193193
stats.num_generated_tokens,
194194
eval_time / stats.SCALING_FACTOR_UNITS_PER_SECOND,
195-
stats.num_generated_tokens / eval_time *
196-
stats.SCALING_FACTOR_UNITS_PER_SECOND);
195+
eval_time > 0 ? stats.num_generated_tokens / eval_time *
196+
stats.SCALING_FACTOR_UNITS_PER_SECOND
197+
: 0.0);
197198

198199
// Time to first token is measured from the start of inference, excluding
199200
// model load time.

extension/llm/runner/text_llm_runner.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,7 @@ Error TextLLMRunner::generate(
233233
"RSS after finishing text generation: %f MiB (0 if unsupported)",
234234
get_rss_bytes() / 1024.0 / 1024.0);
235235

236-
if (num_generated_tokens == max_new_tokens) {
236+
if (num_generated_tokens == max_new_tokens - 1) {
237237
RUNNER_ET_LOG(config.warming, "Max new tokens %i reached!", max_new_tokens);
238238
}
239239

0 commit comments

Comments (0)