Skip to content

Commit f42844b

Browse files
kirklandsign authored and meta-codesync[bot] committed
Fix dead condition, division-by-zero, and uninitialized members in LLM stats (#18819)
Summary: Pull Request resolved: #18819 Three issues fixed: 1. text_llm_runner.cpp: The condition num_generated_tokens == max_new_tokens was always false because TextTokenGenerator::generate() receives max_new_tokens - 1. Fixed to compare against max_new_tokens - 1. 2. stats.h print_report(): Division by zero when inference/prefill/decode time is zero (e.g., during very fast warmup runs). Added guards matching the pattern already used in stats_to_json_string(). 3. stats.h Stats: Added default initializers (= 0) to all timestamp and counter members to prevent undefined behavior from uninitialized reads. Reviewed By: manuelcandales Differential Revision: D99708774
1 parent 5e8a0df commit f42844b

File tree

2 files changed

+21
-20
lines changed

2 files changed

+21
-20
lines changed

extension/llm/runner/stats.h

Lines changed: 20 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -23,32 +23,32 @@ struct ET_EXPERIMENTAL Stats {
2323
const long SCALING_FACTOR_UNITS_PER_SECOND = 1000;
2424
// Time stamps for the different stages of the execution
2525
// model_load_start_ms: Start of model loading.
26-
long model_load_start_ms;
26+
long model_load_start_ms = 0;
2727
// model_load_end_ms: End of model loading.
28-
long model_load_end_ms;
28+
long model_load_end_ms = 0;
2929
// inference_start_ms: Immediately after the model is loaded (or we check
3030
// for model load), measure the inference time.
3131
// NOTE: It's actually the tokenizer encode + model execution time.
32-
long inference_start_ms;
32+
long inference_start_ms = 0;
3333
// End of the tokenizer encode time.
34-
long token_encode_end_ms;
34+
long token_encode_end_ms = 0;
3535
// Start of the model execution (forward function) time.
36-
long model_execution_start_ms;
36+
long model_execution_start_ms = 0;
3737
// End of the model execution (forward function) time.
38-
long model_execution_end_ms;
38+
long model_execution_end_ms = 0;
3939
// prompt_eval_end_ms: Prompt array allocation and tokenization. Ends right
4040
// before the inference loop starts
41-
long prompt_eval_end_ms;
41+
long prompt_eval_end_ms = 0;
4242
// first_token: Timestamp when the first generated token is emitted
43-
long first_token_ms;
43+
long first_token_ms = 0;
4444
// inference_end_ms: End of inference/generation.
45-
long inference_end_ms;
45+
long inference_end_ms = 0;
4646
// Keep a running total of the time spent in sampling.
4747
long aggregate_sampling_time_ms = 0;
4848
// Token count from prompt
49-
int64_t num_prompt_tokens;
49+
int64_t num_prompt_tokens = 0;
5050
// Token count from generated (total - prompt)
51-
int64_t num_generated_tokens;
51+
int64_t num_generated_tokens = 0;
5252
// GPU memory stats (optional; may be zero if not available)
5353
// GPU memory stats (optional). Use sentinel UINT64_MAX / -1.0 to indicate
5454
// "not available".
@@ -171,18 +171,18 @@ inline void print_report(const Stats& stats) {
171171
Info,
172172
"\tTotal inference time:\t\t%f (seconds)\t\t Rate: \t%f (tokens/second)",
173173
inference_time_ms / stats.SCALING_FACTOR_UNITS_PER_SECOND,
174-
175-
(stats.num_generated_tokens) /
176-
(double)(stats.inference_end_ms - stats.inference_start_ms) *
177-
stats.SCALING_FACTOR_UNITS_PER_SECOND);
174+
inference_time_ms > 0 ? (stats.num_generated_tokens) / inference_time_ms *
175+
stats.SCALING_FACTOR_UNITS_PER_SECOND
176+
: 0.0);
178177
double prompt_eval_time =
179178
(double)(stats.prompt_eval_end_ms - stats.inference_start_ms);
180179
ET_LOG(
181180
Info,
182181
"\t\tPrompt evaluation:\t%f (seconds)\t\t Rate: \t%f (tokens/second)",
183182
prompt_eval_time / stats.SCALING_FACTOR_UNITS_PER_SECOND,
184-
(stats.num_prompt_tokens) / prompt_eval_time *
185-
stats.SCALING_FACTOR_UNITS_PER_SECOND);
183+
prompt_eval_time > 0 ? (stats.num_prompt_tokens) / prompt_eval_time *
184+
stats.SCALING_FACTOR_UNITS_PER_SECOND
185+
: 0.0);
186186

187187
double eval_time =
188188
(double)(stats.inference_end_ms - stats.prompt_eval_end_ms);
@@ -192,8 +192,9 @@ inline void print_report(const Stats& stats) {
192192
" tokens:\t%f (seconds)\t\t Rate: \t%f (tokens/second)",
193193
stats.num_generated_tokens,
194194
eval_time / stats.SCALING_FACTOR_UNITS_PER_SECOND,
195-
stats.num_generated_tokens / eval_time *
196-
stats.SCALING_FACTOR_UNITS_PER_SECOND);
195+
eval_time > 0 ? stats.num_generated_tokens / eval_time *
196+
stats.SCALING_FACTOR_UNITS_PER_SECOND
197+
: 0.0);
197198

198199
// Time to first token is measured from the start of inference, excluding
199200
// model load time.

extension/llm/runner/text_llm_runner.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,7 @@ Error TextLLMRunner::generate(
233233
"RSS after finishing text generation: %f MiB (0 if unsupported)",
234234
get_rss_bytes() / 1024.0 / 1024.0);
235235

236-
if (num_generated_tokens == max_new_tokens) {
236+
if (num_generated_tokens == max_new_tokens - 1) {
237237
RUNNER_ET_LOG(config.warming, "Max new tokens %i reached!", max_new_tokens);
238238
}
239239

0 commit comments

Comments (0)