// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <cfloat>
#include <cmath>
#include <functional>
#include <random>
#include <vector>

#include <xnnpack.h>

#include <benchmark/benchmark.h>
#include "bench/utils.h"
#ifdef BENCHMARK_TENSORFLOW_LITE
#include "flatbuffers/include/flatbuffers/flatbuffers.h"
#include "tensorflow/lite/interpreter.h"
#include "tensorflow/lite/kernels/register.h"
#include "tensorflow/lite/model.h"
#include "tensorflow/lite/schema/schema_generated.h"
#include "tensorflow/lite/version.h"
#endif  // BENCHMARK_TENSORFLOW_LITE


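// Benchmarks the standalone XNNPACK F32 PReLU operator: the operator is
// created and set up once, and only xnn_run_operator() is timed.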
void xnnpack_prelu_f32(benchmark::State& state, const char* net) {
  const size_t batch_size = state.range(0);
  const size_t height = state.range(1);
  const size_t width = state.range(2);
  const size_t channels = state.range(3);

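  // Inputs are drawn from [-1, 1] so both the positive (identity) and
  // negative (slope-scaled) PReLU branches are exercised; per-channel slopes
  // are drawn from [0.25, 0.75].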
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32irng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));
  auto f32wrng = std::bind(std::uniform_real_distribution<float>(0.25f, 0.75f), std::ref(rng));

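  // The input buffer is padded with XNN_EXTRA_BYTES because XNNPACK kernels
  // may read slightly past the last element.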
  std::vector<float> input(batch_size * height * width * channels + XNN_EXTRA_BYTES / sizeof(float));
  std::generate(input.begin(), input.end(), std::ref(f32irng));
  std::vector<float> slope(channels);
  std::generate(slope.begin(), slope.end(), std::ref(f32wrng));
  std::vector<float> output(batch_size * height * width * channels);

  xnn_status status = xnn_initialize(nullptr /* allocator */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

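  // The per-channel slopes are passed at operator creation time, outside the
  // timed loop.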
  xnn_operator_t prelu_op = nullptr;
  status = xnn_create_prelu_nc_f32(
    channels, channels /* input stride */, channels /* output stride */,
    slope.data(),
    0 /* flags */, nullptr, &prelu_op);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to create FP32 PReLU operator");
    return;
  }

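  // The N, H, and W dimensions are flattened into a single batch dimension:
  // the NC-layout operator processes (N*H*W) rows of `channels` elements.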
  status = xnn_setup_prelu_nc_f32(
    prelu_op,
    batch_size * height * width,
    input.data(), output.data(),
    nullptr /* thread pool */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to setup FP32 PReLU operator");
    return;
  }

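  // Only the operator invocation is measured; creation, setup, and teardown
  // stay outside the benchmark loop.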
  for (auto _ : state) {
    status = xnn_run_operator(prelu_op, nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to run FP32 PReLU operator");
      return;
    }
  }

  status = xnn_delete_operator(prelu_op);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to delete FP32 PReLU operator");
    return;
  }
  prelu_op = nullptr;

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

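  // Report throughput as elements processed per second.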
  const size_t elements_per_iteration = batch_size * height * width * channels;
  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);

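  // Memory traffic per iteration: one input read plus one output write per
  // element, plus one read of the per-channel slopes.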
  const size_t bytes_per_iteration = (2 * elements_per_iteration + channels) * sizeof(float);
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
}

#ifdef BENCHMARK_TENSORFLOW_LITE
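// TensorFlow Lite baseline: builds a single-operator PRELU model in memory
// with FlatBuffers and times Interpreter::Invoke() on the same problem sizes.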
void tflite_prelu_f32(benchmark::State& state, const char* net) {
  const size_t batch_size = state.range(0);
  const size_t height = state.range(1);
  const size_t width = state.range(2);
  const size_t channels = state.range(3);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32irng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));
  auto f32wrng = std::bind(std::uniform_real_distribution<float>(0.25f, 0.75f), std::ref(rng));

  std::vector<float> slope(channels);
  std::generate(slope.begin(), slope.end(), std::ref(f32wrng));

  flatbuffers::FlatBufferBuilder builder;
  flatbuffers::Offset<tflite::OperatorCode> operator_code =
      tflite::CreateOperatorCode(builder, tflite::BuiltinOperator_PRELU);

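  // Buffer 0 is the empty sentinel buffer required by the TFLite schema;
  // buffer 1 carries the constant slope data.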
  flatbuffers::Offset<tflite::Buffer> buffers[2] = {
    tflite::CreateBuffer(builder, builder.CreateVector({})),
    tflite::CreateBuffer(builder, builder.CreateVector(
      reinterpret_cast<const uint8_t*>(slope.data()),
      sizeof(float) * slope.size())),
  };

  const int32_t input_shape[4] = {
    static_cast<int32_t>(batch_size),
    static_cast<int32_t>(height),
    static_cast<int32_t>(width),
    static_cast<int32_t>(channels)
  };
  const int32_t output_shape[4] = {
    static_cast<int32_t>(batch_size),
    static_cast<int32_t>(height),
    static_cast<int32_t>(width),
    static_cast<int32_t>(channels)
  };
  const int32_t slope_shape[1] = {
    static_cast<int32_t>(channels)
  };

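  // Tensor 1 (the slopes) references buffer 1 and is therefore a constant;
  // tensors 0 and 2 default to the empty buffer 0 and stay dynamic.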
  flatbuffers::Offset<tflite::Tensor> tensors[3] = {
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(input_shape, 4),
                         tflite::TensorType_FLOAT32),
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(slope_shape, 1),
                         tflite::TensorType_FLOAT32,
                         1 /* buffer id */),
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(output_shape, 4),
                         tflite::TensorType_FLOAT32),
  };

  const int32_t op_inputs[2] = { 0, 1 };
  const int32_t op_outputs[1] = { 2 };
  flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(
      builder,
      0 /* opcode_index */,
      builder.CreateVector<int32_t>(op_inputs, 2),
      builder.CreateVector<int32_t>(op_outputs, 1));

  const int32_t graph_inputs[1] = { 0 };
  const int32_t graph_outputs[1] = { 2 };
  flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
      builder,
      builder.CreateVector(tensors, 3),
      builder.CreateVector<int32_t>(graph_inputs, 1),
      builder.CreateVector<int32_t>(graph_outputs, 1),
      builder.CreateVector(&op, 1));

  flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("PReLU model");

  flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
      TFLITE_SCHEMA_VERSION,
      builder.CreateVector(&operator_code, 1),
      builder.CreateVector(&subgraph, 1),
      description,
      builder.CreateVector(buffers, 2));

  builder.Finish(model_buffer);

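  // BuiltinOpResolverWithoutDefaultDelegates keeps the default XNNPACK
  // delegate out, so the stock TFLite PRELU kernel is what gets measured.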
  const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
  tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
  tflite::InterpreterBuilder interpreterBuilder(model, resolver);
  std::unique_ptr<tflite::Interpreter> interpreter;
  if (interpreterBuilder(&interpreter) != kTfLiteOk) {
    state.SkipWithError("failed to create TFLite interpreter");
    return;
  }
  if (interpreter == nullptr) {
    state.SkipWithError("TFLite interpreter is null");
    return;
  }
  interpreter->SetNumThreads(1);

  if (interpreter->AllocateTensors() != kTfLiteOk) {
    state.SkipWithError("failed to allocate tensors");
    return;
  }

  std::generate(
    interpreter->typed_tensor<float>(0),
    interpreter->typed_tensor<float>(0) + batch_size * height * width * channels,
    std::ref(f32irng));

  for (auto _ : state) {
    if (interpreter->Invoke() != kTfLiteOk) {
      state.SkipWithError("failed to invoke TFLite interpreter");
      return;
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  const size_t elements_per_iteration = batch_size * height * width * channels;
  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);

  const size_t bytes_per_iteration = (2 * elements_per_iteration + channels) * sizeof(float);
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);

  interpreter.reset();
}
#endif  // BENCHMARK_TENSORFLOW_LITE

// Characteristic arguments for ImageNet classification models
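// (each step halves the spatial resolution and doubles the channel count,
// from 112x112x16 down to 7x7x512).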
static void ImageNet(benchmark::internal::Benchmark* b)
{
  b->ArgNames({"N", "H", "W", "C"});

  int32_t c = 16;
  for (int32_t hw = 224 / 2; hw >= 7; hw /= 2) {
    b->Args({1, hw, hw, c});
    b->Args({1, hw, hw, c * 2});
    c *= 2;
  }
}

BENCHMARK_CAPTURE(xnnpack_prelu_f32, imagenet, "ImageNet 224x224")->Apply(ImageNet)->UseRealTime();

#ifdef BENCHMARK_TENSORFLOW_LITE
BENCHMARK_CAPTURE(tflite_prelu_f32, imagenet, "ImageNet 224x224")->Apply(ImageNet)->UseRealTime();
#endif  // BENCHMARK_TENSORFLOW_LITE

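// With the default benchmark main(), individual benchmarks can be selected at
// run time via Google Benchmark's standard flags, e.g. (binary name below is
// hypothetical):
//   ./prelu_bench --benchmark_filter=xnnpack_prelu_f32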
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif