// xref: /aosp_15_r20/external/XNNPACK/bench/sigmoid.cc (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <array>
#include <cmath>
#include <functional>
#include <limits>
#include <random>
#include <vector>

#include <fp16.h>

#include <xnnpack.h>

#include <benchmark/benchmark.h>
#include "bench/utils.h"
#ifdef BENCHMARK_TENSORFLOW_LITE
#include "flatbuffers/include/flatbuffers/flatbuffers.h"
#include "tensorflow/lite/interpreter.h"
#include "tensorflow/lite/kernels/register.h"
#include "tensorflow/lite/model.h"
#include "tensorflow/lite/schema/schema_generated.h"
#include "tensorflow/lite/version.h"
#endif  // BENCHMARK_TENSORFLOW_LITE


33 #ifndef XNN_NO_F16_OPERATORS
xnnpack_sigmoid_f16(benchmark::State & state)34 static void xnnpack_sigmoid_f16(benchmark::State& state) {
35   const size_t batch_size = state.range(0);
36 
37   std::random_device random_device;
38   auto rng = std::mt19937(random_device());
39   auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
40   auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
41 
42   std::vector<uint16_t> input(batch_size + XNN_EXTRA_BYTES / sizeof(uint16_t));
43   std::vector<uint16_t> output(batch_size);
44   std::generate(input.begin(), input.end(), std::ref(f16rng));
45   std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
46 
47   xnn_status status = xnn_initialize(nullptr /* allocator */);
48   if (status != xnn_status_success) {
49     state.SkipWithError("failed to initialize XNNPACK");
50     return;
51   }
52 
53   xnn_operator_t sigmoid_op = nullptr;
54   status = xnn_create_sigmoid_nc_f16(
55     1 /* channels */, 1 /* input stride */, 1 /* output stride */,
56     0 /* flags */, &sigmoid_op);
57   if (status != xnn_status_success || sigmoid_op == nullptr) {
58     state.SkipWithError("failed to create Sigmoid operator");
59     return;
60   }
61 
62   status = xnn_setup_sigmoid_nc_f16(
63     sigmoid_op, batch_size,
64     input.data(), output.data(),
65     nullptr /* thread pool */);
66   if (status != xnn_status_success) {
67     state.SkipWithError("failed to setup Sigmoid operator");
68     return;
69   }
70 
71   for (auto _ : state) {
72     status = xnn_run_operator(sigmoid_op, nullptr /* thread pool */);
73     if (status != xnn_status_success) {
74       state.SkipWithError("failed to run Sigmoid operator");
75       return;
76     }
77   }
78 
79   status = xnn_delete_operator(sigmoid_op);
80   if (status != xnn_status_success) {
81     state.SkipWithError("failed to delete Sigmoid operator");
82     return;
83   }
84 
85   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
86   if (cpu_frequency != 0) {
87     state.counters["cpufreq"] = cpu_frequency;
88   }
89 
90   state.counters["elements"] =
91     benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
92 
93   const size_t bytes_per_iteration = 2 * batch_size * sizeof(uint16_t);
94   state.counters["bytes"] =
95     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
96 }
97 #endif  // XNN_NO_F16_OPERATORS
98 
xnnpack_sigmoid_f32(benchmark::State & state)99 static void xnnpack_sigmoid_f32(benchmark::State& state) {
100   const size_t batch_size = state.range(0);
101 
102   std::random_device random_device;
103   auto rng = std::mt19937(random_device());
104   auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
105 
106   std::vector<float> input(batch_size + XNN_EXTRA_BYTES / sizeof(float));
107   std::vector<float> output(batch_size);
108   std::generate(input.begin(), input.end(), std::ref(f32rng));
109   std::fill(output.begin(), output.end(), std::nanf(""));
110 
111   xnn_status status = xnn_initialize(nullptr /* allocator */);
112   if (status != xnn_status_success) {
113     state.SkipWithError("failed to initialize XNNPACK");
114     return;
115   }
116 
117   xnn_operator_t sigmoid_op = nullptr;
118   status = xnn_create_sigmoid_nc_f32(
119     1 /* channels */, 1 /* input stride */, 1 /* output stride */,
120     0 /* flags */, &sigmoid_op);
121   if (status != xnn_status_success || sigmoid_op == nullptr) {
122     state.SkipWithError("failed to create Sigmoid operator");
123     return;
124   }
125 
126   status = xnn_setup_sigmoid_nc_f32(
127     sigmoid_op, batch_size,
128     input.data(), output.data(),
129     nullptr /* thread pool */);
130   if (status != xnn_status_success) {
131     state.SkipWithError("failed to setup Sigmoid operator");
132     return;
133   }
134 
135   for (auto _ : state) {
136     status = xnn_run_operator(sigmoid_op, nullptr /* thread pool */);
137     if (status != xnn_status_success) {
138       state.SkipWithError("failed to run Sigmoid operator");
139       return;
140     }
141   }
142 
143   status = xnn_delete_operator(sigmoid_op);
144   if (status != xnn_status_success) {
145     state.SkipWithError("failed to delete Sigmoid operator");
146     return;
147   }
148 
149   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
150   if (cpu_frequency != 0) {
151     state.counters["cpufreq"] = cpu_frequency;
152   }
153 
154   state.counters["elements"] =
155     benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
156 
157   const size_t bytes_per_iteration = 2 * batch_size * sizeof(float);
158   state.counters["bytes"] =
159     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
160 }
161 
162 #ifndef XNN_NO_QS8_OPERATORS
xnnpack_sigmoid_qs8(benchmark::State & state)163 static void xnnpack_sigmoid_qs8(benchmark::State& state) {
164   const size_t batch_size = state.range(0);
165 
166   std::random_device random_device;
167   auto rng = std::mt19937(random_device());
168   auto i8rng = std::bind(
169     std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
170     std::ref(rng));
171 
172   std::vector<int8_t> input(batch_size + XNN_EXTRA_BYTES / sizeof(int8_t));
173   std::vector<int8_t> output(batch_size);
174   std::generate(input.begin(), input.end(), std::ref(i8rng));
175   std::fill(output.begin(), output.end(), INT8_C(0xA5));
176 
177   xnn_status status = xnn_initialize(nullptr /* allocator */);
178   if (status != xnn_status_success) {
179     state.SkipWithError("failed to initialize XNNPACK");
180     return;
181   }
182 
183   xnn_operator_t sigmoid_op = nullptr;
184   status = xnn_create_sigmoid_nc_qs8(
185     1 /* channels */, 1 /* input stride */, 1 /* output stride */,
186     1 /* input zero point */, 1.0f /* input scale */,
187     -128 /* output zero point */, 1.0f / 256.0f /* output scale */,
188     std::numeric_limits<int8_t>::min() /* output min */, std::numeric_limits<int8_t>::max() /* output max */,
189     0 /* flags */, &sigmoid_op);
190   if (status != xnn_status_success || sigmoid_op == nullptr) {
191     state.SkipWithError("failed to create Sigmoid operator");
192     return;
193   }
194 
195   status = xnn_setup_sigmoid_nc_qs8(
196     sigmoid_op, batch_size,
197     input.data(), output.data(),
198     nullptr /* thread pool */);
199   if (status != xnn_status_success) {
200     state.SkipWithError("failed to setup Sigmoid operator");
201     return;
202   }
203 
204   for (auto _ : state) {
205     status = xnn_run_operator(sigmoid_op, nullptr /* thread pool */);
206     if (status != xnn_status_success) {
207       state.SkipWithError("failed to run Sigmoid operator");
208       return;
209     }
210   }
211 
212   status = xnn_delete_operator(sigmoid_op);
213   if (status != xnn_status_success) {
214     state.SkipWithError("failed to delete Sigmoid operator");
215     return;
216   }
217 
218   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
219   if (cpu_frequency != 0) {
220     state.counters["cpufreq"] = cpu_frequency;
221   }
222 
223   state.counters["elements"] =
224     benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
225 
226   const size_t bytes_per_iteration = 2 * batch_size * sizeof(int8_t);
227   state.counters["bytes"] =
228     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
229 }
230 #endif  // XNN_NO_QS8_OPERATORS
231 
232 #ifndef XNN_NO_QU8_OPERATORS
xnnpack_sigmoid_qu8(benchmark::State & state)233 static void xnnpack_sigmoid_qu8(benchmark::State& state) {
234   const size_t batch_size = state.range(0);
235 
236   std::random_device random_device;
237   auto rng = std::mt19937(random_device());
238   auto u8rng = std::bind(
239     std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));
240 
241   std::vector<uint8_t> input(batch_size + XNN_EXTRA_BYTES / sizeof(uint8_t));
242   std::vector<uint8_t> output(batch_size);
243   std::generate(input.begin(), input.end(), std::ref(u8rng));
244   std::fill(output.begin(), output.end(), UINT8_C(0xA5));
245 
246   xnn_status status = xnn_initialize(nullptr /* allocator */);
247   if (status != xnn_status_success) {
248     state.SkipWithError("failed to initialize XNNPACK");
249     return;
250   }
251 
252   xnn_operator_t sigmoid_op = nullptr;
253   status = xnn_create_sigmoid_nc_qu8(
254     1 /* channels */, 1 /* input stride */, 1 /* output stride */,
255     128 /* input zero point */, 1.0f /* input scale */,
256     0 /* output zero point */, 1.0f / 256.0f /* output scale */,
257     std::numeric_limits<uint8_t>::min() /* output min */, std::numeric_limits<uint8_t>::max() /* output max */,
258     0 /* flags */, &sigmoid_op);
259   if (status != xnn_status_success || sigmoid_op == nullptr) {
260     state.SkipWithError("failed to create Sigmoid operator");
261     return;
262   }
263 
264   status = xnn_setup_sigmoid_nc_qu8(
265     sigmoid_op, batch_size,
266     input.data(), output.data(),
267     nullptr /* thread pool */);
268   if (status != xnn_status_success) {
269     state.SkipWithError("failed to setup Sigmoid operator");
270     return;
271   }
272 
273   for (auto _ : state) {
274     status = xnn_run_operator(sigmoid_op, nullptr /* thread pool */);
275     if (status != xnn_status_success) {
276       state.SkipWithError("failed to run Sigmoid operator");
277       return;
278     }
279   }
280 
281   status = xnn_delete_operator(sigmoid_op);
282   if (status != xnn_status_success) {
283     state.SkipWithError("failed to delete Sigmoid operator");
284     return;
285   }
286 
287   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
288   if (cpu_frequency != 0) {
289     state.counters["cpufreq"] = cpu_frequency;
290   }
291 
292   state.counters["elements"] =
293     benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
294 
295   const size_t bytes_per_iteration = 2 * batch_size * sizeof(uint8_t);
296   state.counters["bytes"] =
297     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
298 }
299 #endif  // XNN_NO_QU8_OPERATORS
300 
#ifdef BENCHMARK_TENSORFLOW_LITE
// Benchmarks TensorFlow Lite's built-in float32 LOGISTIC (sigmoid) kernel as
// a baseline for the XNNPACK benchmarks above: builds a single-op model in
// memory, runs it through the interpreter, and reports throughput counters.
static void tflite_sigmoid_f32(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));

  // Assemble a minimal flatbuffer model: one LOGISTIC op with input tensor 0
  // and output tensor 1, both 1-D of shape [batch_size].
  flatbuffers::FlatBufferBuilder builder;
  const flatbuffers::Offset<tflite::OperatorCode> operator_code =
      CreateOperatorCode(builder, tflite::BuiltinOperator_LOGISTIC);

  const std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
    tflite::CreateBuffer(builder, builder.CreateVector({})),
  }};

  const std::array<int32_t, 1> shape{{
    static_cast<int32_t>(batch_size)
  }};

  const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(shape.data(), shape.size()),
                         tflite::TensorType_FLOAT32),
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(shape.data(), shape.size()),
                         tflite::TensorType_FLOAT32),
  }};

  const std::array<int32_t, 1> op_inputs{{ 0 }};
  const std::array<int32_t, 1> op_outputs{{ 1 }};
  flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(
      builder,
      0 /* opcode_index */,
      builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
      builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));

  const std::array<int32_t, 1> graph_inputs{{ 0 }};
  const std::array<int32_t, 1> graph_outputs{{ 1 }};
  const flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
      builder,
      builder.CreateVector(tensors.data(), tensors.size()),
      builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
      builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
      builder.CreateVector(&op, 1));

  const flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
      TFLITE_SCHEMA_VERSION,
      builder.CreateVector(&operator_code, 1),
      builder.CreateVector(&subgraph, 1),
      builder.CreateString("Sigmoid model"),
      builder.CreateVector(buffers.data(), buffers.size()));

  builder.Finish(model_buffer);

  // Resolver without default delegates keeps XNNPACK from being measured
  // under the TFLite label; single thread for a fair comparison.
  const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
  tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
  tflite::InterpreterBuilder interpreterBuilder(model, resolver);
  std::unique_ptr<tflite::Interpreter> interpreter;
  if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
    state.SkipWithError("failed to create TFLite interpreter");
    return;
  }
  interpreter->SetNumThreads(1);

  if (interpreter->AllocateTensors() != kTfLiteOk) {
    state.SkipWithError("failed to allocate tensors");
    return;
  }

  // Fill the input tensor (index 0) with random values in [-10, 10].
  std::generate(
    interpreter->typed_tensor<float>(0),
    interpreter->typed_tensor<float>(0) + batch_size,
    std::ref(f32rng));

  for (auto _ : state) {
    if (interpreter->Invoke() != kTfLiteOk) {
      state.SkipWithError("failed to invoke TFLite interpreter");
      return;
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // Processed elements per second.
  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);

  // Each element is read once and written once, hence the factor of 2.
  const size_t bytes_per_iteration = 2 * batch_size * sizeof(float);
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);

  interpreter.reset();
}

// Benchmarks TensorFlow Lite's signed 8-bit quantized LOGISTIC (sigmoid)
// kernel, mirroring xnnpack_sigmoid_qs8's quantization parameters so the two
// are directly comparable.
static void tflite_sigmoid_qs8(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto i8rng = std::bind(
    std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
    std::ref(rng));

  // Assemble a minimal flatbuffer model: one LOGISTIC op with quantized INT8
  // input tensor 0 and output tensor 1, both 1-D of shape [batch_size].
  flatbuffers::FlatBufferBuilder builder;
  const flatbuffers::Offset<tflite::OperatorCode> operator_code =
      CreateOperatorCode(builder, tflite::BuiltinOperator_LOGISTIC);

  const std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
    tflite::CreateBuffer(builder, builder.CreateVector({})),
  }};

  const std::array<int32_t, 1> shape{{
    static_cast<int32_t>(batch_size)
  }};

  // Input: scale 1.0, zero point 1. Output: scale 1/256, zero point -128 —
  // same as the XNNPACK QS8 benchmark above.
  const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(shape.data(), shape.size()),
                         tflite::TensorType_INT8, 0 /* buffer */, 0 /* name */,
                         tflite::CreateQuantizationParameters(builder,
                           0 /*min*/, 0 /*max*/,
                           builder.CreateVector<float>({1.0f /* scale */}),
                           builder.CreateVector<int64_t>({1 /* zero point */}))),
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(shape.data(), shape.size()),
                         tflite::TensorType_INT8, 0 /* buffer */, 0 /* name */,
                         tflite::CreateQuantizationParameters(builder,
                           0 /*min*/, 0 /*max*/,
                           builder.CreateVector<float>({1.0f / 256.0f /* scale */}),
                           builder.CreateVector<int64_t>({-128 /* zero point */}))),
  }};

  const std::array<int32_t, 1> op_inputs{{ 0 }};
  const std::array<int32_t, 1> op_outputs{{ 1 }};
  flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(
      builder,
      0 /* opcode_index */,
      builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
      builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));

  const std::array<int32_t, 1> graph_inputs{{ 0 }};
  const std::array<int32_t, 1> graph_outputs{{ 1 }};
  const flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
      builder,
      builder.CreateVector(tensors.data(), tensors.size()),
      builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
      builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
      builder.CreateVector(&op, 1));

  const flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
      TFLITE_SCHEMA_VERSION,
      builder.CreateVector(&operator_code, 1),
      builder.CreateVector(&subgraph, 1),
      builder.CreateString("Sigmoid model"),
      builder.CreateVector(buffers.data(), buffers.size()));

  builder.Finish(model_buffer);

  // Resolver without default delegates keeps XNNPACK from being measured
  // under the TFLite label; single thread for a fair comparison.
  const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
  tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
  tflite::InterpreterBuilder interpreterBuilder(model, resolver);
  std::unique_ptr<tflite::Interpreter> interpreter;
  if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
    state.SkipWithError("failed to create TFLite interpreter");
    return;
  }
  interpreter->SetNumThreads(1);

  if (interpreter->AllocateTensors() != kTfLiteOk) {
    state.SkipWithError("failed to allocate tensors");
    return;
  }

  // Fill the input tensor (index 0) with random int8 values.
  std::generate(
    interpreter->typed_tensor<int8_t>(0),
    interpreter->typed_tensor<int8_t>(0) + batch_size,
    std::ref(i8rng));

  for (auto _ : state) {
    if (interpreter->Invoke() != kTfLiteOk) {
      state.SkipWithError("failed to invoke TFLite interpreter");
      return;
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // Processed elements per second.
  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);

  // Each element is read once and written once, hence the factor of 2.
  const size_t bytes_per_iteration = 2 * batch_size * sizeof(int8_t);
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);

  interpreter.reset();
}

// Benchmarks TensorFlow Lite's unsigned 8-bit quantized LOGISTIC (sigmoid)
// kernel, mirroring xnnpack_sigmoid_qu8's quantization parameters so the two
// are directly comparable.
static void tflite_sigmoid_qu8(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto u8rng = std::bind(
    std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()),
    std::ref(rng));

  // Assemble a minimal flatbuffer model: one LOGISTIC op with quantized UINT8
  // input tensor 0 and output tensor 1, both 1-D of shape [batch_size].
  flatbuffers::FlatBufferBuilder builder;
  const flatbuffers::Offset<tflite::OperatorCode> operator_code =
      CreateOperatorCode(builder, tflite::BuiltinOperator_LOGISTIC);

  const std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
    tflite::CreateBuffer(builder, builder.CreateVector({})),
  }};

  const std::array<int32_t, 1> shape{{
    static_cast<int32_t>(batch_size)
  }};

  // Input: scale 1.0, zero point 128. Output: scale 1/256, zero point 0 —
  // same as the XNNPACK QU8 benchmark above.
  const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(shape.data(), shape.size()),
                         tflite::TensorType_UINT8, 0 /* buffer */, 0 /* name */,
                         tflite::CreateQuantizationParameters(builder,
                           0 /*min*/, 0 /*max*/,
                           builder.CreateVector<float>({1.0f /* scale */}),
                           builder.CreateVector<int64_t>({128 /* zero point */}))),
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(shape.data(), shape.size()),
                         tflite::TensorType_UINT8, 0 /* buffer */, 0 /* name */,
                         tflite::CreateQuantizationParameters(builder,
                           0 /*min*/, 0 /*max*/,
                           builder.CreateVector<float>({1.0f / 256.0f /* scale */}),
                           builder.CreateVector<int64_t>({0 /* zero point */}))),
  }};

  const std::array<int32_t, 1> op_inputs{{ 0 }};
  const std::array<int32_t, 1> op_outputs{{ 1 }};
  flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(
      builder,
      0 /* opcode_index */,
      builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
      builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));

  const std::array<int32_t, 1> graph_inputs{{ 0 }};
  const std::array<int32_t, 1> graph_outputs{{ 1 }};
  const flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
      builder,
      builder.CreateVector(tensors.data(), tensors.size()),
      builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
      builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
      builder.CreateVector(&op, 1));

  const flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
      TFLITE_SCHEMA_VERSION,
      builder.CreateVector(&operator_code, 1),
      builder.CreateVector(&subgraph, 1),
      builder.CreateString("Sigmoid model"),
      builder.CreateVector(buffers.data(), buffers.size()));

  builder.Finish(model_buffer);

  // Resolver without default delegates keeps XNNPACK from being measured
  // under the TFLite label; single thread for a fair comparison.
  const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
  tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
  tflite::InterpreterBuilder interpreterBuilder(model, resolver);
  std::unique_ptr<tflite::Interpreter> interpreter;
  if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
    state.SkipWithError("failed to create TFLite interpreter");
    return;
  }
  interpreter->SetNumThreads(1);

  if (interpreter->AllocateTensors() != kTfLiteOk) {
    state.SkipWithError("failed to allocate tensors");
    return;
  }

  // Fill the input tensor (index 0) with random uint8 values.
  std::generate(
    interpreter->typed_tensor<uint8_t>(0),
    interpreter->typed_tensor<uint8_t>(0) + batch_size,
    std::ref(u8rng));

  for (auto _ : state) {
    if (interpreter->Invoke() != kTfLiteOk) {
      state.SkipWithError("failed to invoke TFLite interpreter");
      return;
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // Processed elements per second.
  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);

  // Each element is read once and written once, hence the factor of 2.
  const size_t bytes_per_iteration = 2 * batch_size * sizeof(uint8_t);
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);

  interpreter.reset();
}
#endif  // BENCHMARK_TENSORFLOW_LITE

// --- Benchmark registration -------------------------------------------------
// Each benchmark sweeps batch sizes chosen by UnaryElementwiseParameters for
// its input/output element types and reports wall-clock (real) time.
#ifndef XNN_NO_F16_OPERATORS
  BENCHMARK(xnnpack_sigmoid_f16)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
    ->UseRealTime();
#endif  // XNN_NO_F16_OPERATORS
BENCHMARK(xnnpack_sigmoid_f32)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
  ->UseRealTime();
#ifndef XNN_NO_QS8_OPERATORS
  BENCHMARK(xnnpack_sigmoid_qs8)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();
#endif  // XNN_NO_QS8_OPERATORS
#ifndef XNN_NO_QU8_OPERATORS
  BENCHMARK(xnnpack_sigmoid_qu8)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
    ->UseRealTime();
#endif  // XNN_NO_QU8_OPERATORS

#ifdef BENCHMARK_TENSORFLOW_LITE
  BENCHMARK(tflite_sigmoid_f32)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK(tflite_sigmoid_qs8)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();
  BENCHMARK(tflite_sigmoid_qu8)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
    ->UseRealTime();
#endif  // BENCHMARK_TENSORFLOW_LITE

// Emit main() unless the embedder links its own benchmark driver.
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif
645