/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Runner stats for LLM
#pragma once
#include <executorch/extension/llm/runner/util.h>
#include <executorch/runtime/platform/log.h>
#include <cinttypes>
#include <sstream>
#include <string>

namespace executorch {
namespace extension {
namespace llm {

struct ET_EXPERIMENTAL Stats {
  // Scaling factor between timestamp units and seconds; timestamps are
  // recorded in milliseconds.
  const long SCALING_FACTOR_UNITS_PER_SECOND = 1000;
  // Timestamps for the different stages of execution.
  // model_load_start_ms: Start of model loading.
  long model_load_start_ms;
  // model_load_end_ms: End of model loading.
  long model_load_end_ms;
  // inference_start_ms: Start of inference, recorded immediately after the
  // model is loaded (or after the model-load check).
  // NOTE: Inference time actually covers tokenizer encode + model execution.
  long inference_start_ms;
  // token_encode_end_ms: End of the tokenizer encode time.
  long token_encode_end_ms;
  // model_execution_start_ms: Start of the model execution (forward function).
  long model_execution_start_ms;
  // model_execution_end_ms: End of the model execution (forward function).
  long model_execution_end_ms;
  // prompt_eval_end_ms: End of prompt evaluation (prompt array allocation and
  // tokenization); recorded right before the generation loop starts.
  long prompt_eval_end_ms;
  // first_token_ms: Timestamp when the first generated token is emitted.
  long first_token_ms;
  // inference_end_ms: End of inference/generation.
  long inference_end_ms;
  // Running total of the time spent in sampling.
  long aggregate_sampling_time_ms;
  // Number of tokens in the prompt.
  int64_t num_prompt_tokens;
  // Number of generated tokens (total - prompt).
  int64_t num_generated_tokens;
  inline void on_sampling_begin() {
    aggregate_sampling_timer_start_timestamp = time_in_ms();
  }
  inline void on_sampling_end() {
    aggregate_sampling_time_ms +=
        time_in_ms() - aggregate_sampling_timer_start_timestamp;
    aggregate_sampling_timer_start_timestamp = 0;
  }

  void reset(bool all_stats = false) {
    // model_load_start_ms and model_load_end_ms are not reset by default:
    // reset() is typically called after warmup and before the actual run, and
    // the model is not loaded again after warmup, so these timestamps are
    // only cleared when resetting everything.
    if (all_stats) {
      model_load_start_ms = 0;
      model_load_end_ms = 0;
    }
    inference_start_ms = 0;
    prompt_eval_end_ms = 0;
    first_token_ms = 0;
    inference_end_ms = 0;
    aggregate_sampling_time_ms = 0;
    num_prompt_tokens = 0;
    num_generated_tokens = 0;
    aggregate_sampling_timer_start_timestamp = 0;
  }

 private:
  long aggregate_sampling_timer_start_timestamp = 0;
};
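
// Example usage (illustrative sketch only; `load_model`, `encode`, and
// `sample_next_token` are hypothetical runner hooks, while `time_in_ms()`
// comes from util.h above). It shows the order in which a runner would
// typically populate the timestamps and drive the sampling timer:
//
//   Stats stats;
//   stats.model_load_start_ms = time_in_ms();
//   load_model();                                  // hypothetical
//   stats.model_load_end_ms = time_in_ms();
//
//   stats.inference_start_ms = time_in_ms();
//   auto prompt_tokens = encode(prompt);           // hypothetical tokenizer call
//   stats.num_prompt_tokens = prompt_tokens.size();
//   stats.prompt_eval_end_ms = time_in_ms();
//
//   while (!done) {
//     stats.on_sampling_begin();
//     auto token = sample_next_token();            // hypothetical decode/sample step
//     stats.on_sampling_end();
//     if (stats.num_generated_tokens++ == 0) {
//       stats.first_token_ms = time_in_ms();
//     }
//   }
//   stats.inference_end_ms = time_in_ms();
//   print_report(stats);
//
//   stats.reset();       // keep model load times for the next run
//   stats.reset(true);   // or clear everything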

static constexpr auto kTopp = 0.9f;

inline std::string stats_to_json_string(const Stats& stats) {
  std::stringstream ss;
  ss << "{\"prompt_tokens\":" << stats.num_prompt_tokens << ","
     << "\"generated_tokens\":" << stats.num_generated_tokens << ","
     << "\"model_load_start_ms\":" << stats.model_load_start_ms << ","
     << "\"model_load_end_ms\":" << stats.model_load_end_ms << ","
     << "\"inference_start_ms\":" << stats.inference_start_ms << ","
     << "\"inference_end_ms\":" << stats.inference_end_ms << ","
     << "\"prompt_eval_end_ms\":" << stats.prompt_eval_end_ms << ","
     << "\"first_token_ms\":" << stats.first_token_ms << ","
     << "\"aggregate_sampling_time_ms\":" << stats.aggregate_sampling_time_ms
     << "," << "\"SCALING_FACTOR_UNITS_PER_SECOND\":"
     << stats.SCALING_FACTOR_UNITS_PER_SECOND << "}";
  return ss.str();
}
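
// With illustrative values, stats_to_json_string() emits a single-line JSON
// object whose field order matches the stream above, e.g.:
//   {"prompt_tokens":5,"generated_tokens":32,"model_load_start_ms":100,
//    "model_load_end_ms":600,"inference_start_ms":600,"inference_end_ms":2600,
//    "prompt_eval_end_ms":700,"first_token_ms":750,
//    "aggregate_sampling_time_ms":40,"SCALING_FACTOR_UNITS_PER_SECOND":1000}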

inline void print_report(const Stats& stats) {
  printf("PyTorchObserver %s\n", stats_to_json_string(stats).c_str());

  ET_LOG(
      Info,
      "\tPrompt Tokens: %" PRId64 "    Generated Tokens: %" PRId64,
      stats.num_prompt_tokens,
      stats.num_generated_tokens);

  ET_LOG(
      Info,
      "\tModel Load Time:\t\t%f (seconds)",
      ((double)(stats.model_load_end_ms - stats.model_load_start_ms) /
       stats.SCALING_FACTOR_UNITS_PER_SECOND));
  double inference_time_ms =
      (double)(stats.inference_end_ms - stats.inference_start_ms);
  ET_LOG(
      Info,
      "\tTotal inference time:\t\t%f (seconds)\t\t Rate: \t%f (tokens/second)",
      inference_time_ms / stats.SCALING_FACTOR_UNITS_PER_SECOND,
      stats.num_generated_tokens / inference_time_ms *
          stats.SCALING_FACTOR_UNITS_PER_SECOND);
  double prompt_eval_time =
      (double)(stats.prompt_eval_end_ms - stats.inference_start_ms);
  ET_LOG(
      Info,
      "\t\tPrompt evaluation:\t%f (seconds)\t\t Rate: \t%f (tokens/second)",
      prompt_eval_time / stats.SCALING_FACTOR_UNITS_PER_SECOND,
      stats.num_prompt_tokens / prompt_eval_time *
          stats.SCALING_FACTOR_UNITS_PER_SECOND);

  double eval_time =
      (double)(stats.inference_end_ms - stats.prompt_eval_end_ms);
  ET_LOG(
      Info,
      "\t\tGenerated %" PRId64
      " tokens:\t%f (seconds)\t\t Rate: \t%f (tokens/second)",
      stats.num_generated_tokens,
      eval_time / stats.SCALING_FACTOR_UNITS_PER_SECOND,
      stats.num_generated_tokens / eval_time *
          stats.SCALING_FACTOR_UNITS_PER_SECOND);

  // Time to first token is measured from the start of inference, excluding
  // model load time.
  ET_LOG(
      Info,
      "\tTime to first generated token:\t%f (seconds)",
      ((double)(stats.first_token_ms - stats.inference_start_ms) /
       stats.SCALING_FACTOR_UNITS_PER_SECOND));

  ET_LOG(
      Info,
      "\tSampling time over %" PRId64 " tokens:\t%f (seconds)",
      stats.num_prompt_tokens + stats.num_generated_tokens,
      (double)stats.aggregate_sampling_time_ms /
          stats.SCALING_FACTOR_UNITS_PER_SECOND);
}
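
// Worked example of the derived metrics above (assumed, illustrative values):
// with inference_start_ms = 600, prompt_eval_end_ms = 700,
// inference_end_ms = 2600, and num_generated_tokens = 38, the generation
// phase spans eval_time = 2600 - 700 = 1900 ms, so the reported generation
// rate is 38 / 1900 * 1000 = 20 tokens/second, and the total inference time
// is (2600 - 600) / 1000 = 2.0 seconds.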

} // namespace llm
} // namespace extension
} // namespace executorch

namespace executorch {
namespace llm {
// TODO(T197294990): Remove these deprecated aliases once all users have moved
// to the new `::executorch` namespaces.
using ::executorch::extension::llm::kTopp;
using ::executorch::extension::llm::print_report;
using ::executorch::extension::llm::Stats;
} // namespace llm
} // namespace executorch