/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Runner stats for LLM
#pragma once
#include <executorch/extension/llm/runner/util.h>
#include <executorch/runtime/platform/log.h>
#include <cinttypes>
#include <sstream>
#include <string>

namespace executorch {
namespace extension {
namespace llm {

struct ET_EXPERIMENTAL Stats {
  // Scaling factor for timestamps - in this case, we use ms.
  const long SCALING_FACTOR_UNITS_PER_SECOND = 1000;
  // Timestamps for the different stages of execution.
  // model_load_start_ms: Start of model loading.
  long model_load_start_ms;
  // model_load_end_ms: End of model loading.
  long model_load_end_ms;
  // inference_start_ms: Taken immediately after the model is loaded (or after
  // the model-load check), marking the start of inference.
  // NOTE: Inference here covers tokenizer encode + model execution time.
  long inference_start_ms;
  // End of the tokenizer encode time.
  long token_encode_end_ms;
  // Start of the model execution (forward function) time.
  long model_execution_start_ms;
  // End of the model execution (forward function) time.
  long model_execution_end_ms;
  // prompt_eval_end_ms: Prompt array allocation and tokenization. Ends right
  // before the inference loop starts.
  long prompt_eval_end_ms;
  // first_token_ms: Timestamp when the first generated token is emitted.
  long first_token_ms;
  // inference_end_ms: End of inference/generation.
  long inference_end_ms;
  // Keep a running total of the time spent in sampling.
  long aggregate_sampling_time_ms;
  // Number of prompt tokens.
  int64_t num_prompt_tokens;
  // Number of generated tokens (total - prompt).
  int64_t num_generated_tokens;
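  // on_sampling_begin()/on_sampling_end() bracket each sampler invocation so
  // that the elapsed time accumulates into aggregate_sampling_time_ms.
  // Illustrative usage (the sampler call below is hypothetical, not part of
  // this header):
  //
  //   stats.on_sampling_begin();
  //   auto token = sampler.sample(logits);
  //   stats.on_sampling_end();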
  inline void on_sampling_begin() {
    aggregate_sampling_timer_start_timestamp = time_in_ms();
  }
  inline void on_sampling_end() {
    aggregate_sampling_time_ms +=
        time_in_ms() - aggregate_sampling_timer_start_timestamp;
    aggregate_sampling_timer_start_timestamp = 0;
  }

  void reset(bool all_stats = false) {
    // Not resetting model_load_start_ms and model_load_end_ms because reset()
    // is typically called after warmup and before the actual run, and the
    // model is not loaded again for the actual run. So we don't want to reset
    // these timestamps unless we are resetting everything.
    if (all_stats) {
      model_load_start_ms = 0;
      model_load_end_ms = 0;
    }
    inference_start_ms = 0;
    prompt_eval_end_ms = 0;
    first_token_ms = 0;
    inference_end_ms = 0;
    aggregate_sampling_time_ms = 0;
    num_prompt_tokens = 0;
    num_generated_tokens = 0;
    aggregate_sampling_timer_start_timestamp = 0;
  }

 private:
  long aggregate_sampling_timer_start_timestamp = 0;
};

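/*
 * Illustrative sketch (not part of the API): a runner would typically populate
 * the timestamps and token counts itself, roughly along these lines (the
 * tokenizer/sampler names below are hypothetical):
 *
 *   Stats stats;
 *   stats.model_load_start_ms = time_in_ms();
 *   // ... load the model ...
 *   stats.model_load_end_ms = time_in_ms();
 *   stats.inference_start_ms = time_in_ms();
 *   // ... tokenize the prompt ...
 *   stats.prompt_eval_end_ms = time_in_ms();
 *   // ... generation loop: set first_token_ms when the first token is
 *   // emitted, and bracket sampler calls with on_sampling_begin()/end() ...
 *   stats.inference_end_ms = time_in_ms();
 *   stats.num_prompt_tokens = prompt_tokens.size();
 *   stats.num_generated_tokens = generated_tokens.size();
 *   print_report(stats);
 */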
// Default top-p (nucleus sampling) value.
static constexpr auto kTopp = 0.9f;

inline std::string stats_to_json_string(const Stats& stats) {
  std::stringstream ss;
  ss << "{\"prompt_tokens\":" << stats.num_prompt_tokens << ","
     << "\"generated_tokens\":" << stats.num_generated_tokens << ","
     << "\"model_load_start_ms\":" << stats.model_load_start_ms << ","
     << "\"model_load_end_ms\":" << stats.model_load_end_ms << ","
     << "\"inference_start_ms\":" << stats.inference_start_ms << ","
     << "\"inference_end_ms\":" << stats.inference_end_ms << ","
     << "\"prompt_eval_end_ms\":" << stats.prompt_eval_end_ms << ","
     << "\"first_token_ms\":" << stats.first_token_ms << ","
     << "\"aggregate_sampling_time_ms\":" << stats.aggregate_sampling_time_ms
     << "," << "\"SCALING_FACTOR_UNITS_PER_SECOND\":"
     << stats.SCALING_FACTOR_UNITS_PER_SECOND << "}";
  return ss.str();
}
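// Example output (illustrative values only; the actual string is a single
// line, wrapped here for readability):
//
//   {"prompt_tokens":4,"generated_tokens":32,
//    "model_load_start_ms":1000,"model_load_end_ms":1500,
//    "inference_start_ms":1500,"inference_end_ms":2700,
//    "prompt_eval_end_ms":1700,"first_token_ms":1740,
//    "aggregate_sampling_time_ms":25,
//    "SCALING_FACTOR_UNITS_PER_SECOND":1000}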

inline void print_report(const Stats& stats) {
  printf("PyTorchObserver %s\n", stats_to_json_string(stats).c_str());

  ET_LOG(
      Info,
      "\tPrompt Tokens: %" PRId64 " Generated Tokens: %" PRId64,
      stats.num_prompt_tokens,
      stats.num_generated_tokens);

  ET_LOG(
      Info,
      "\tModel Load Time:\t\t%f (seconds)",
      ((double)(stats.model_load_end_ms - stats.model_load_start_ms) /
       stats.SCALING_FACTOR_UNITS_PER_SECOND));
  double inference_time_ms =
      (double)(stats.inference_end_ms - stats.inference_start_ms);
  ET_LOG(
      Info,
      "\tTotal inference time:\t\t%f (seconds)\t\t Rate: \t%f (tokens/second)",
      inference_time_ms / stats.SCALING_FACTOR_UNITS_PER_SECOND,
      stats.num_generated_tokens / inference_time_ms *
          stats.SCALING_FACTOR_UNITS_PER_SECOND);
  double prompt_eval_time =
      (double)(stats.prompt_eval_end_ms - stats.inference_start_ms);
  ET_LOG(
      Info,
      "\t\tPrompt evaluation:\t%f (seconds)\t\t Rate: \t%f (tokens/second)",
      prompt_eval_time / stats.SCALING_FACTOR_UNITS_PER_SECOND,
      stats.num_prompt_tokens / prompt_eval_time *
          stats.SCALING_FACTOR_UNITS_PER_SECOND);

  double eval_time =
      (double)(stats.inference_end_ms - stats.prompt_eval_end_ms);
  ET_LOG(
      Info,
      "\t\tGenerated %" PRId64
      " tokens:\t%f (seconds)\t\t Rate: \t%f (tokens/second)",
      stats.num_generated_tokens,
      eval_time / stats.SCALING_FACTOR_UNITS_PER_SECOND,
      stats.num_generated_tokens / eval_time *
          stats.SCALING_FACTOR_UNITS_PER_SECOND);

  // Time to first token is measured from the start of inference, excluding
  // model load time.
  ET_LOG(
      Info,
      "\tTime to first generated token:\t%f (seconds)",
      ((double)(stats.first_token_ms - stats.inference_start_ms) /
       stats.SCALING_FACTOR_UNITS_PER_SECOND));

  ET_LOG(
      Info,
      "\tSampling time over %" PRId64 " tokens:\t%f (seconds)",
      stats.num_prompt_tokens + stats.num_generated_tokens,
      (double)stats.aggregate_sampling_time_ms /
          stats.SCALING_FACTOR_UNITS_PER_SECOND);
}
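// Worked example for the rates reported above (illustrative numbers): with
// num_generated_tokens = 32 and inference_end_ms - prompt_eval_end_ms = 1000,
// the generation rate is 32 / 1000.0 * 1000 = 32 tokens/second.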

} // namespace llm
} // namespace extension
} // namespace executorch

namespace executorch {
namespace llm {
// TODO(T197294990): Remove these deprecated aliases once all users have moved
// to the new `::executorch` namespaces.
using ::executorch::extension::llm::kTopp;
using ::executorch::extension::llm::print_report;
using ::executorch::extension::llm::Stats;
} // namespace llm
} // namespace executorch