1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/lite/tools/benchmark/benchmark_model.h"
17
18 #include <unistd.h>
19
20 #include <iostream>
21 #include <memory>
22 #include <sstream>
23 #include <string>
24
25 #include "tensorflow/lite/profiling/memory_info.h"
26 #include "tensorflow/lite/profiling/time.h"
27 #include "tensorflow/lite/tools/benchmark/benchmark_utils.h"
28 #include "tensorflow/lite/tools/logging.h"
29
30 namespace tflite {
31 namespace benchmark {
using tensorflow::Stat;

// Default interval (in milliseconds) between two consecutive memory-footprint
// checks when --report_peak_memory_footprint is enabled. Also used as the
// fallback value when the user supplies a non-positive interval.
constexpr int kMemoryCheckIntervalMs = 50;
35
36 #ifdef __linux__
// Reads the process's memory statistics from /proc/self/statm and reports the
// first four fields — total program size, resident set size, resident shared
// pages, and text (code) size — converted from pages to whole MB.
// All outputs are set to 0 if the file cannot be opened or fully parsed.
void GetRssStats(size_t* vsize, size_t* rss, size_t* shared, size_t* code) {
  *vsize = 0;
  *rss = 0;
  *shared = 0;
  *code = 0;
  FILE* fp = fopen("/proc/self/statm", "rt");
  if (fp == nullptr) return;
  // A short read would leave a misleading mix of parsed and zero values, so
  // only keep the results if all four fields were scanned successfully.
  if (fscanf(fp, "%zu %zu %zu %zu", vsize, rss, shared, code) != 4) {
    *vsize = 0;
    *rss = 0;
    *shared = 0;
    *code = 0;
  }
  fclose(fp);
  // statm reports counts in pages; convert pages -> bytes, then bytes -> MB
  // (>> 20).
  const size_t page_size = getpagesize();
  *vsize = *vsize * page_size >> 20;
  *rss = *rss * page_size >> 20;
  *shared = *shared * page_size >> 20;
  *code = *code * page_size >> 20;
}
51 #endif // __linux__
52
DefaultParams()53 BenchmarkParams BenchmarkModel::DefaultParams() {
54 BenchmarkParams params;
55 params.AddParam("num_runs", BenchmarkParam::Create<int32_t>(50));
56 params.AddParam("min_secs", BenchmarkParam::Create<float>(1.0f));
57 params.AddParam("max_secs", BenchmarkParam::Create<float>(150.0f));
58 params.AddParam("run_delay", BenchmarkParam::Create<float>(-1.0f));
59 params.AddParam("run_frequency", BenchmarkParam::Create<float>(-1.0f));
60 params.AddParam("num_threads", BenchmarkParam::Create<int32_t>(-1));
61 params.AddParam("use_caching", BenchmarkParam::Create<bool>(false));
62 params.AddParam("benchmark_name", BenchmarkParam::Create<std::string>(""));
63 params.AddParam("output_prefix", BenchmarkParam::Create<std::string>(""));
64 params.AddParam("warmup_runs", BenchmarkParam::Create<int32_t>(1));
65 params.AddParam("warmup_min_secs", BenchmarkParam::Create<float>(0.5f));
66 params.AddParam("verbose", BenchmarkParam::Create<bool>(false));
67 params.AddParam("dry_run", BenchmarkParam::Create<bool>(false));
68 params.AddParam("report_peak_memory_footprint",
69 BenchmarkParam::Create<bool>(false));
70 params.AddParam("memory_footprint_check_interval_ms",
71 BenchmarkParam::Create<int32_t>(kMemoryCheckIntervalMs));
72 return params;
73 }
74
// Seeds the parameter set with the documented defaults; flag parsing and
// subclasses may override individual values afterwards.
BenchmarkModel::BenchmarkModel() : params_(DefaultParams()) {}
76
OnBenchmarkEnd(const BenchmarkResults & results)77 void BenchmarkLoggingListener::OnBenchmarkEnd(const BenchmarkResults& results) {
78 auto inference_us = results.inference_time_us();
79 auto init_us = results.startup_latency_us();
80 auto warmup_us = results.warmup_time_us();
81 auto init_mem_usage = results.init_mem_usage();
82 auto overall_mem_usage = results.overall_mem_usage();
83 TFLITE_LOG(INFO) << "Inference timings in us: "
84 << "Init: " << init_us << ", "
85 << "First inference: " << warmup_us.first() << ", "
86 << "Warmup (avg): " << warmup_us.avg() << ", "
87 << "Inference (avg): " << inference_us.avg();
88
89 if (!init_mem_usage.IsSupported()) return;
90 TFLITE_LOG(INFO)
91 << "Note: as the benchmark tool itself affects memory footprint, the "
92 "following is only APPROXIMATE to the actual memory footprint of the "
93 "model at runtime. Take the information at your discretion.";
94 TFLITE_LOG(INFO) << "Memory footprint delta from the start of the tool (MB): "
95 << "init=" << init_mem_usage.max_rss_kb / 1024.0
96 << " overall=" << overall_mem_usage.max_rss_kb / 1024.0;
97
98 auto peak_mem_mb = results.peak_mem_mb();
99 if (peak_mem_mb > 0) {
100 TFLITE_LOG(INFO)
101 << "Overall peak memory footprint (MB) via periodic monitoring: "
102 << peak_mem_mb;
103 #ifdef __linux__
104 size_t vsize, rss, shared, code;
105 GetRssStats(&vsize, &rss, &shared, &code);
106 TFLITE_LOG(INFO) << "Memory status at the end of exeution:";
107 TFLITE_LOG(INFO) << "- VmRSS : " << rss << " MB";
108 TFLITE_LOG(INFO) << "+ RssAnnon : " << rss - shared << " MB";
109 TFLITE_LOG(INFO) << "+ RssFile + RssShmem : " << shared << " MB";
110 #endif // __linux_
111 }
112 }
113
GetFlags()114 std::vector<Flag> BenchmarkModel::GetFlags() {
115 return {
116 CreateFlag<int32_t>(
117 "num_runs", ¶ms_,
118 "expected number of runs, see also min_secs, max_secs"),
119 CreateFlag<float>(
120 "min_secs", ¶ms_,
121 "minimum number of seconds to rerun for, potentially making the "
122 "actual number of runs to be greater than num_runs"),
123 CreateFlag<float>(
124 "max_secs", ¶ms_,
125 "maximum number of seconds to rerun for, potentially making the "
126 "actual number of runs to be less than num_runs. Note if --max-secs "
127 "is exceeded in the middle of a run, the benchmark will continue to "
128 "the end of the run but will not start the next run."),
129 CreateFlag<float>("run_delay", ¶ms_, "delay between runs in seconds"),
130 CreateFlag<float>(
131 "run_frequency", ¶ms_,
132 "Execute at a fixed frequency, instead of a fixed delay."
133 "Note if the targeted rate per second cannot be reached, the "
134 "benchmark would start the next run immediately, trying its best to "
135 "catch up. If set, this will override run_delay."),
136 CreateFlag<int32_t>("num_threads", ¶ms_, "number of threads"),
137 CreateFlag<bool>(
138 "use_caching", ¶ms_,
139 "Enable caching of prepacked weights matrices in matrix "
140 "multiplication routines. Currently implies the use of the Ruy "
141 "library."),
142 CreateFlag<std::string>("benchmark_name", ¶ms_, "benchmark name"),
143 CreateFlag<std::string>("output_prefix", ¶ms_,
144 "benchmark output prefix"),
145 CreateFlag<int32_t>(
146 "warmup_runs", ¶ms_,
147 "minimum number of runs performed on initialization, to "
148 "allow performance characteristics to settle, see also "
149 "warmup_min_secs"),
150 CreateFlag<float>(
151 "warmup_min_secs", ¶ms_,
152 "minimum number of seconds to rerun for, potentially making the "
153 "actual number of warm-up runs to be greater than warmup_runs"),
154 CreateFlag<bool>("verbose", ¶ms_,
155 "Whether to log parameters whose values are not set. "
156 "By default, only log those parameters that are set by "
157 "parsing their values from the commandline flags."),
158 CreateFlag<bool>("dry_run", ¶ms_,
159 "Whether to run the tool just with simply loading the "
160 "model, allocating tensors etc. but without actually "
161 "invoking any op kernels."),
162 CreateFlag<bool>(
163 "report_peak_memory_footprint", ¶ms_,
164 "Report the peak memory footprint by periodically checking the "
165 "memory footprint. Internally, a separate thread will be spawned for "
166 "this periodic check. Therefore, the performance benchmark result "
167 "could be affected."),
168 CreateFlag<int32_t>("memory_footprint_check_interval_ms", ¶ms_,
169 "The interval in millisecond between two consecutive "
170 "memory footprint checks. This is only used when "
171 "--report_peak_memory_footprint is set to true.")};
172 }
173
// Logs the current value of every benchmark parameter. When "verbose" is
// false, LOG_BENCHMARK_PARAM only logs parameters that were explicitly set
// via commandline flags; when true, defaults are logged as well.
void BenchmarkModel::LogParams() {
  const bool verbose = params_.Get<bool>("verbose");
  TFLITE_LOG(INFO) << "Log parameter values verbosely: [" << verbose << "]";

  LOG_BENCHMARK_PARAM(int32_t, "num_runs", "Min num runs", verbose);
  LOG_BENCHMARK_PARAM(float, "min_secs", "Min runs duration (seconds)",
                      verbose);
  LOG_BENCHMARK_PARAM(float, "max_secs", "Max runs duration (seconds)",
                      verbose);
  LOG_BENCHMARK_PARAM(float, "run_delay", "Inter-run delay (seconds)", verbose);
  LOG_BENCHMARK_PARAM(float, "run_frequency",
                      "Number of prorated runs per second", verbose);
  LOG_BENCHMARK_PARAM(int32_t, "num_threads", "Num threads", verbose);
  LOG_BENCHMARK_PARAM(bool, "use_caching", "Use caching", verbose);
  LOG_BENCHMARK_PARAM(std::string, "benchmark_name", "Benchmark name", verbose);
  LOG_BENCHMARK_PARAM(std::string, "output_prefix", "Output prefix", verbose);
  LOG_BENCHMARK_PARAM(int32_t, "warmup_runs", "Min warmup runs", verbose);
  LOG_BENCHMARK_PARAM(float, "warmup_min_secs",
                      "Min warmup runs duration (seconds)", verbose);
  LOG_BENCHMARK_PARAM(bool, "dry_run", "Run w/o invoking kernels", verbose);
  LOG_BENCHMARK_PARAM(bool, "report_peak_memory_footprint",
                      "Report the peak memory footprint", verbose);
  LOG_BENCHMARK_PARAM(int32_t, "memory_footprint_check_interval_ms",
                      "Memory footprint check interval (ms)", verbose);
}
199
// Default no-op hook; subclasses override to populate model inputs once
// before the benchmark runs begin.
TfLiteStatus BenchmarkModel::PrepareInputData() { return kTfLiteOk; }
201
// Default no-op hook; subclasses override to reset input/output state before
// each individual run (called at the top of every iteration in Run()).
TfLiteStatus BenchmarkModel::ResetInputsAndOutputs() { return kTfLiteOk; }
203
// Runs RunImpl() repeatedly and collects per-run latency statistics.
//
// The loop continues until BOTH min_num_times iterations have completed AND
// min_secs has elapsed, but stops once max_secs is exceeded (the run in
// flight when the limit trips is finished, not aborted).
//
// Pacing: if "run_frequency" > 0, runs are scheduled against absolute target
// finish times (fixed rate); otherwise a fixed "run_delay" sleep is used.
// The last failing RunImpl() status, if any, is reported via *invoke_status;
// timing stats are accumulated regardless of per-run status.
Stat<int64_t> BenchmarkModel::Run(int min_num_times, float min_secs,
                                  float max_secs, RunType run_type,
                                  TfLiteStatus* invoke_status) {
  Stat<int64_t> run_stats;
  TFLITE_LOG(INFO) << "Running benchmark for at least " << min_num_times
                   << " iterations and at least " << min_secs << " seconds but"
                   << " terminate if exceeding " << max_secs << " seconds.";
  int64_t now_us = profiling::time::NowMicros();
  int64_t min_finish_us = now_us + static_cast<int64_t>(min_secs * 1.e6f);
  int64_t max_finish_us = now_us + static_cast<int64_t>(max_secs * 1.e6f);

  *invoke_status = kTfLiteOk;
  float inter_run_sleep_time = params_.Get<float>("run_delay");
  auto run_frequency = params_.Get<float>("run_frequency");
  // Gap between target finish times, in seconds; only meaningful (and only
  // used) when run_frequency > 0.
  double manual_inter_run_gap = 1.0 / run_frequency;
  // float doesn't have sufficient precision for storing this number
  double next_run_finish_time = now_us * 1e-6 + manual_inter_run_gap;
  for (int run = 0; (run < min_num_times || now_us < min_finish_us) &&
                    now_us <= max_finish_us;
       run++) {
    ResetInputsAndOutputs();
    listeners_.OnSingleRunStart(run_type);
    // Only the RunImpl() call itself is timed; listener overhead and
    // inter-run sleeps are excluded from the stats.
    int64_t start_us = profiling::time::NowMicros();
    TfLiteStatus status = RunImpl();
    int64_t end_us = profiling::time::NowMicros();
    listeners_.OnSingleRunEnd();

    run_stats.UpdateStat(end_us - start_us);
    if (run_frequency > 0) {
      // Fixed-rate mode: sleep until the absolute target time for this run.
      // If we're already behind schedule, the sleep time goes negative and
      // the next run starts immediately (best-effort catch-up).
      inter_run_sleep_time =
          next_run_finish_time - profiling::time::NowMicros() * 1e-6;
      next_run_finish_time += manual_inter_run_gap;
    }
    // Note when "inter_run_sleep_time" is negative or 0.0,
    // the function will return immediately.
    util::SleepForSeconds(inter_run_sleep_time);
    now_us = profiling::time::NowMicros();

    if (status != kTfLiteOk) {
      *invoke_status = status;
    }
  }

  std::stringstream stream;
  run_stats.OutputToStream(&stream);
  TFLITE_LOG(INFO) << stream.str() << std::endl;

  return run_stats;
}
253
ValidateParams()254 TfLiteStatus BenchmarkModel::ValidateParams() {
255 if (params_.Get<bool>("report_peak_memory_footprint")) {
256 const int32_t interval =
257 params_.Get<int32_t>("memory_footprint_check_interval_ms");
258 if (interval <= 0) {
259 TFLITE_LOG(WARN) << "--memory_footprint_check_interval_ms is set to "
260 << interval
261 << " (ms), This value is invalid, and it will be set to "
262 "the default value "
263 << kMemoryCheckIntervalMs << " (ms).";
264 params_.Set<int32_t>("memory_footprint_check_interval_ms",
265 kMemoryCheckIntervalMs);
266 }
267 }
268 return kTfLiteOk;
269 }
270
// Convenience entry point: parses commandline flags into params_, then runs
// the full benchmark. Fails fast (without running) if flag parsing fails or
// --help was requested.
TfLiteStatus BenchmarkModel::Run(int argc, char** argv) {
  TF_LITE_ENSURE_STATUS(ParseFlags(argc, argv));
  return Run();
}
275
// Executes the full benchmark pipeline:
//   validate params -> (optional) start peak-memory monitor -> Init() ->
//   PrepareInputData() -> warmup runs -> measured runs -> report results
//   to all registered listeners via OnBenchmarkEnd().
// Returns the first failing status from initialization/warmup, or the last
// failing status from the measured runs (results are still reported in the
// latter case).
TfLiteStatus BenchmarkModel::Run() {
  TF_LITE_ENSURE_STATUS(ValidateParams());

  LogParams();

  // Start the monitor (if enabled) before Init() so model-load allocations
  // are included in the observed peak.
  auto peak_memory_reporter = MayCreateMemoryUsageMonitor();
  if (peak_memory_reporter != nullptr) peak_memory_reporter->Start();
  const double model_size_mb = MayGetModelFileSize() / 1e6;
  // Memory/time snapshots bracket Init() so its cost can be reported
  // separately from the runs.
  const auto start_mem_usage = profiling::memory::GetMemoryUsage();
  int64_t initialization_start_us = profiling::time::NowMicros();
  TF_LITE_ENSURE_STATUS(Init());
  const auto init_end_mem_usage = profiling::memory::GetMemoryUsage();
  int64_t initialization_end_us = profiling::time::NowMicros();
  int64_t startup_latency_us = initialization_end_us - initialization_start_us;
  const auto init_mem_usage = init_end_mem_usage - start_mem_usage;

  if (model_size_mb > 0) {
    TFLITE_LOG(INFO) << "The input model file size (MB): " << model_size_mb;
  }
  TFLITE_LOG(INFO) << "Initialized session in " << startup_latency_us / 1e3
                   << "ms.";

  TF_LITE_ENSURE_STATUS(PrepareInputData());

  TfLiteStatus status = kTfLiteOk;
  uint64_t input_bytes = ComputeInputBytes();

  // Overwrite certain parameters when --dry_run=true is set.
  if (params_.Get<bool>("dry_run")) {
    params_.Set("warmup_runs", 0);
    params_.Set("warmup_min_secs", -1.0f);
    params_.Set("num_runs", 0);
    params_.Set("min_secs", -1.0f);
  }

  listeners_.OnBenchmarkStart(params_);
  Stat<int64_t> warmup_time_us =
      Run(params_.Get<int32_t>("warmup_runs"),
          params_.Get<float>("warmup_min_secs"), params_.Get<float>("max_secs"),
          WARMUP, &status);
  // A warmup failure aborts without reporting results; a failure during the
  // measured runs below still reports results before returning the status.
  if (status != kTfLiteOk) {
    return status;
  }

  Stat<int64_t> inference_time_us =
      Run(params_.Get<int32_t>("num_runs"), params_.Get<float>("min_secs"),
          params_.Get<float>("max_secs"), REGULAR, &status);
  const auto overall_mem_usage =
      profiling::memory::GetMemoryUsage() - start_mem_usage;

  float peak_mem_mb = profiling::memory::MemoryUsageMonitor::kInvalidMemUsageMB;
  if (peak_memory_reporter != nullptr) {
    peak_memory_reporter->Stop();
    peak_mem_mb = peak_memory_reporter->GetPeakMemUsageInMB();
  }

  listeners_.OnBenchmarkEnd({model_size_mb, startup_latency_us, input_bytes,
                             warmup_time_us, inference_time_us, init_mem_usage,
                             overall_mem_usage, peak_mem_mb});
  return status;
}
337
ParseFlags(int * argc,char ** argv)338 TfLiteStatus BenchmarkModel::ParseFlags(int* argc, char** argv) {
339 auto flag_list = GetFlags();
340 const bool parse_result =
341 Flags::Parse(argc, const_cast<const char**>(argv), flag_list);
342 // "--help" flag is added in tools/delegates/default_execution_provider.cc. As
343 // this is an optional dependency, we need to check whether "--help" exists or
344 // not first.
345 if (!parse_result ||
346 (params_.HasParam("help") && params_.Get<bool>("help"))) {
347 std::string usage = Flags::Usage(argv[0], flag_list);
348 TFLITE_LOG(ERROR) << usage;
349 // Returning kTfLiteError intentionally when "--help=true" is specified so
350 // that the caller could check the return value to decide stopping the
351 // execution.
352 return kTfLiteError;
353 }
354
355 std::string unconsumed_args =
356 Flags::ArgsToString(*argc, const_cast<const char**>(argv));
357 if (!unconsumed_args.empty()) {
358 TFLITE_LOG(WARN) << "Unconsumed cmdline flags: " << unconsumed_args;
359 }
360
361 return kTfLiteOk;
362 }
363
364 std::unique_ptr<profiling::memory::MemoryUsageMonitor>
MayCreateMemoryUsageMonitor() const365 BenchmarkModel::MayCreateMemoryUsageMonitor() const {
366 if (!params_.Get<bool>("report_peak_memory_footprint")) return nullptr;
367
368 return std::make_unique<profiling::memory::MemoryUsageMonitor>(
369
370 params_.Get<int32_t>("memory_footprint_check_interval_ms"));
371 }
372
373 } // namespace benchmark
374 } // namespace tflite
375