xref: /aosp_15_r20/external/gemmlowp/test/benchmark_meta_gemm.cc (revision 5f39d1b313f0528e11bae88b3029b54b9e1033e7)
1*5f39d1b3SJooyung Han // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2*5f39d1b3SJooyung Han //
3*5f39d1b3SJooyung Han // Licensed under the Apache License, Version 2.0 (the "License");
4*5f39d1b3SJooyung Han // you may not use this file except in compliance with the License.
5*5f39d1b3SJooyung Han // You may obtain a copy of the License at
6*5f39d1b3SJooyung Han //
7*5f39d1b3SJooyung Han //     http://www.apache.org/licenses/LICENSE-2.0
8*5f39d1b3SJooyung Han //
9*5f39d1b3SJooyung Han // Unless required by applicable law or agreed to in writing, software
10*5f39d1b3SJooyung Han // distributed under the License is distributed on an "AS IS" BASIS,
11*5f39d1b3SJooyung Han // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*5f39d1b3SJooyung Han // See the License for the specific language governing permissions and
13*5f39d1b3SJooyung Han // limitations under the License.
14*5f39d1b3SJooyung Han 
15*5f39d1b3SJooyung Han #include <unistd.h>
16*5f39d1b3SJooyung Han #ifdef __APPLE__
17*5f39d1b3SJooyung Han #include <sys/time.h>
18*5f39d1b3SJooyung Han #endif
19*5f39d1b3SJooyung Han 
20*5f39d1b3SJooyung Han #include <cstdint>
21*5f39d1b3SJooyung Han #include <cstdlib>
22*5f39d1b3SJooyung Han #include <ctime>
23*5f39d1b3SJooyung Han #include <iomanip>
24*5f39d1b3SJooyung Han #include <iostream>
25*5f39d1b3SJooyung Han #include <map>
26*5f39d1b3SJooyung Han #include <vector>
27*5f39d1b3SJooyung Han 
28*5f39d1b3SJooyung Han #include "../eight_bit_int_gemm/eight_bit_int_gemm.h"
29*5f39d1b3SJooyung Han #include "test.h"
30*5f39d1b3SJooyung Han 
31*5f39d1b3SJooyung Han #if defined(__arm__) && !defined(GEMMLOWP_NEON)
32*5f39d1b3SJooyung Han #warning "Building without NEON support on ARM, check your compiler setup!"
33*5f39d1b3SJooyung Han #endif
34*5f39d1b3SJooyung Han 
// Returns a timestamp in seconds, as a double, for measuring elapsed
// intervals (callers subtract two readings).
//
// On non-Apple platforms the monotonic clock is used, so a wall-clock
// adjustment (NTP step, manual change) between two readings cannot produce a
// negative or inflated interval. The absolute value is therefore not a
// calendar time. The Apple branch still reads the wall clock through
// gettimeofday().
double time() {
#ifdef __APPLE__
  timeval t;
  gettimeofday(&t, nullptr);
  return t.tv_sec + 1e-6 * t.tv_usec;
#else
  // Zero-initialised so a failing clock_gettime() yields 0 rather than
  // reading indeterminate fields.
  timespec t = {};
  clock_gettime(CLOCK_MONOTONIC, &t);
  return t.tv_sec + 1e-9 * t.tv_nsec;
#endif
}
46*5f39d1b3SJooyung Han 
// Lower bound, in bytes, on the total buffer footprint allocated per shape:
// Shape::init() creates MIN_WORKING_SET_SIZE / size + 1 working sets.
// (2 MiB; presumably chosen to exceed typical cache sizes - confirm.)
const std::int32_t MIN_WORKING_SET_SIZE = 2 << 20;

// Multiply-accumulate budget used by Shape::init() to derive how many times
// a shape is repeated per timing sample (MIN_OPS / (n * m * k) + 20).
const double MIN_OPS = 1.0e9;
49*5f39d1b3SJooyung Han 
// The three byte buffers used by one GEMM invocation.
//
// `lhs`, `rhs` and `result` remain public raw pointers so callers can pass
// them straight to the GEMM entry point. They are non-owning views into the
// private vectors below, which means:
//   * the memory is released when the WorkingSet is destroyed;
//   * copies are deep (a copy never aliases the source's buffers);
//   * moving (e.g. when an enclosing std::vector reallocates) keeps the
//     buffers where they are, so the pointers stay valid.
// All three pointers are nullptr until init() is called.
struct WorkingSet {
  WorkingSet() : lhs(nullptr), rhs(nullptr), result(nullptr) {}

  // Deep copy: duplicates the buffers and points the views at the duplicates.
  WorkingSet(const WorkingSet& other)
      : lhs(nullptr),
        rhs(nullptr),
        result(nullptr),
        lhs_storage(other.lhs_storage),
        rhs_storage(other.rhs_storage),
        result_storage(other.result_storage) {
    rebind();
  }

  // Move: steals the buffers and leaves `other` empty with null views.
  WorkingSet(WorkingSet&& other) noexcept
      : lhs(nullptr), rhs(nullptr), result(nullptr) {
    swap(other);
  }

  // Copy-and-swap; serves both copy and move assignment.
  WorkingSet& operator=(WorkingSet other) noexcept {
    swap(other);
    return *this;
  }

  // Exchanges buffers with `other` and refreshes the views on both sides.
  void swap(WorkingSet& other) noexcept {
    lhs_storage.swap(other.lhs_storage);
    rhs_storage.swap(other.rhs_storage);
    result_storage.swap(other.result_storage);
    rebind();
    other.rebind();
  }

  // (Re)allocates zero-filled buffers of n*k (lhs), k*m (rhs) and m*n
  // (result) bytes. Dimensions are expected to be non-negative. Sizes are
  // multiplied as std::size_t so large shapes do not overflow 32-bit
  // arithmetic. Calling init() again replaces the previous buffers.
  void init(std::int32_t n, std::int32_t m, std::int32_t k) {
    const std::size_t rows = static_cast<std::size_t>(n);
    const std::size_t cols = static_cast<std::size_t>(m);
    const std::size_t depth = static_cast<std::size_t>(k);
    const std::uint8_t zero = 0;
    lhs_storage.assign(rows * depth, zero);
    rhs_storage.assign(depth * cols, zero);
    result_storage.assign(cols * rows, zero);
    rebind();
  }

  std::uint8_t* lhs;
  std::uint8_t* rhs;
  std::uint8_t* result;

 private:
  // An empty buffer is exposed as nullptr, matching the default state.
  static std::uint8_t* view_of(std::vector<std::uint8_t>& buffer) {
    return buffer.empty() ? nullptr : buffer.data();
  }

  // Points the public views at the currently owned buffers.
  void rebind() {
    lhs = view_of(lhs_storage);
    rhs = view_of(rhs_storage);
    result = view_of(result_storage);
  }

  std::vector<std::uint8_t> lhs_storage;
  std::vector<std::uint8_t> rhs_storage;
  std::vector<std::uint8_t> result_storage;
};
63*5f39d1b3SJooyung Han 
64*5f39d1b3SJooyung Han struct Shape {
65*5f39d1b3SJooyung Han   std::int32_t n;
66*5f39d1b3SJooyung Han   std::int32_t m;
67*5f39d1b3SJooyung Han   std::int32_t k;
68*5f39d1b3SJooyung Han 
69*5f39d1b3SJooyung Han   std::int32_t repetitions;
70*5f39d1b3SJooyung Han   std::int32_t current_set;
71*5f39d1b3SJooyung Han   std::vector<WorkingSet> working_sets;
72*5f39d1b3SJooyung Han 
ShapeShape73*5f39d1b3SJooyung Han   Shape(std::int32_t n, std::int32_t m, std::int32_t k)
74*5f39d1b3SJooyung Han       : n(n), m(m), k(k), repetitions(1), current_set(0), working_sets() {}
75*5f39d1b3SJooyung Han 
initShape76*5f39d1b3SJooyung Han   void init() {
77*5f39d1b3SJooyung Han     const std::int32_t size = n * k + k * m + n * m;
78*5f39d1b3SJooyung Han     const std::int32_t count = MIN_WORKING_SET_SIZE / size + 1;
79*5f39d1b3SJooyung Han     const double ops = static_cast<double>(n) * static_cast<double>(m) *
80*5f39d1b3SJooyung Han                        static_cast<double>(k);
81*5f39d1b3SJooyung Han     for (int i = 0; i < count; ++i) {
82*5f39d1b3SJooyung Han       working_sets.push_back(WorkingSet());
83*5f39d1b3SJooyung Han       working_sets.back().init(n, m, k);
84*5f39d1b3SJooyung Han     }
85*5f39d1b3SJooyung Han     current_set = 0;
86*5f39d1b3SJooyung Han     repetitions = MIN_OPS / ops + 20;
87*5f39d1b3SJooyung Han   }
88*5f39d1b3SJooyung Han 
working_setShape89*5f39d1b3SJooyung Han   WorkingSet& working_set() { return working_sets[current_set]; }
90*5f39d1b3SJooyung Han 
next_working_setShape91*5f39d1b3SJooyung Han   void next_working_set() {
92*5f39d1b3SJooyung Han     current_set = (current_set + 1) % working_sets.size();
93*5f39d1b3SJooyung Han   }
94*5f39d1b3SJooyung Han };
95*5f39d1b3SJooyung Han 
run_gemm(std::int32_t n,std::int32_t m,std::int32_t k,std::uint8_t * lhs,std::uint8_t * rhs,std::uint8_t * result)96*5f39d1b3SJooyung Han double run_gemm(std::int32_t n, std::int32_t m, std::int32_t k,
97*5f39d1b3SJooyung Han                 std::uint8_t* lhs, std::uint8_t* rhs, std::uint8_t* result) {
98*5f39d1b3SJooyung Han   gemmlowp::eight_bit_int_gemm::EightBitIntGemm(
99*5f39d1b3SJooyung Han       true, false, false, m, n, k, rhs, -100, k, lhs, -100, k, result, 10000,
100*5f39d1b3SJooyung Han       10, 3, m, gemmlowp::eight_bit_int_gemm::BitDepthSetting::A8B8);
101*5f39d1b3SJooyung Han   return static_cast<double>(n * m * k * 2);
102*5f39d1b3SJooyung Han }
103*5f39d1b3SJooyung Han 
run_gemms(std::vector<Shape> * shapes)104*5f39d1b3SJooyung Han double run_gemms(std::vector<Shape>* shapes) {
105*5f39d1b3SJooyung Han   double ops = 0.0;
106*5f39d1b3SJooyung Han   for (auto& shape : *shapes) {
107*5f39d1b3SJooyung Han     ops += run_gemm(shape.n, shape.m, shape.k, shape.working_set().lhs,
108*5f39d1b3SJooyung Han                     shape.working_set().rhs, shape.working_set().result);
109*5f39d1b3SJooyung Han   }
110*5f39d1b3SJooyung Han   return ops;
111*5f39d1b3SJooyung Han }
112*5f39d1b3SJooyung Han 
// Prints latency statistics for the samples in `*times` to std::cout.
//
// `times` holds per-iteration latencies in seconds and is sorted in place.
// With `full` set, a multi-line report is printed (best, worst, mean, 25%
// trimmed mean, mean of the fastest 10%); otherwise only the mean, converted
// to milliseconds, is printed on one line.
//
// An empty vector prints "no samples" instead of reading front()/back() of
// an empty container. The "best" bucket always contains at least one sample,
// so fewer than ten samples no longer yield a 0/0 (nan) mean.
void print_summary(std::vector<double>* times, bool full) {
  if (times->empty()) {
    std::cout << "no samples" << std::endl;
    return;
  }

  std::sort(times->begin(), times->end());

  const std::size_t count = times->size();
  const float trim_ratio = 0.25;
  const float best_ratio = 0.1;
  // Number of samples dropped from each end for the trimmed mean.
  const std::size_t count_trimmed =
      static_cast<std::size_t>(count * trim_ratio);
  // Number of fastest samples averaged for the "best" mean, at least one.
  std::size_t count_best = static_cast<std::size_t>(count * best_ratio);
  if (count_best == 0) {
    count_best = 1;
  }

  double sum_times = 0;
  for (std::size_t i = 0; i < count; ++i) {
    sum_times += (*times)[i];
  }

  // count - 2 * count_trimmed is at least count / 2, so this range is never
  // empty for a non-empty input.
  double sum_times_trimmed = 0;
  std::size_t count_times_trimmed = 0;
  for (std::size_t i = count_trimmed; i < count - count_trimmed; ++i) {
    sum_times_trimmed += (*times)[i];
    ++count_times_trimmed;
  }

  double sum_times_best = 0;
  for (std::size_t i = 0; i < count_best; ++i) {
    sum_times_best += (*times)[i];
  }

  const double min_latency = times->front();
  const double max_latency = times->back();
  const double mean_latency = sum_times / count;
  const double trimmed_mean_latency = sum_times_trimmed / count_times_trimmed;
  const double best_mean_latency = sum_times_best / count_best;

  if (full) {
    std::cout << "Graph latency (over " << times->size()
              << " iterations):" << std::endl;
    std::cout << "  Best:             " << min_latency << "s" << std::endl;
    std::cout << "  Worst:            " << max_latency << "s" << std::endl;
    std::cout << "  Mean:             " << mean_latency << "s" << std::endl;
    std::cout << "  " << 100 * trim_ratio
              << "% trimmed mean: " << trimmed_mean_latency << "s" << std::endl;
    std::cout << "  Mean of " << 100 * best_ratio
              << "% best: " << best_mean_latency << "s" << std::endl;
  } else {
    std::cout << (mean_latency * 1000.0) << std::endl;
  }
}
158*5f39d1b3SJooyung Han 
time_all(std::vector<Shape> * shapes,std::int32_t repetitions,double max_time)159*5f39d1b3SJooyung Han void time_all(std::vector<Shape>* shapes, std::int32_t repetitions,
160*5f39d1b3SJooyung Han               double max_time) {
161*5f39d1b3SJooyung Han   std::vector<double> times;
162*5f39d1b3SJooyung Han   double ops = 0.0;
163*5f39d1b3SJooyung Han   double sum_time = 0.0;
164*5f39d1b3SJooyung Han 
165*5f39d1b3SJooyung Han   while (sum_time < max_time) {
166*5f39d1b3SJooyung Han     double start = time();
167*5f39d1b3SJooyung Han 
168*5f39d1b3SJooyung Han     for (int i = 0; i < repetitions; ++i) {
169*5f39d1b3SJooyung Han       ops += run_gemms(shapes);
170*5f39d1b3SJooyung Han     }
171*5f39d1b3SJooyung Han     double delta_time = (time() - start);
172*5f39d1b3SJooyung Han     times.push_back(delta_time / repetitions);
173*5f39d1b3SJooyung Han     sum_time += delta_time;
174*5f39d1b3SJooyung Han   }
175*5f39d1b3SJooyung Han 
176*5f39d1b3SJooyung Han   print_summary(&times, true);
177*5f39d1b3SJooyung Han }
178*5f39d1b3SJooyung Han 
time_one(Shape * shape,double max_time)179*5f39d1b3SJooyung Han void time_one(Shape* shape, double max_time) {
180*5f39d1b3SJooyung Han   std::vector<double> times;
181*5f39d1b3SJooyung Han   double ops = 0.0;
182*5f39d1b3SJooyung Han   double sum_time = 0.0;
183*5f39d1b3SJooyung Han 
184*5f39d1b3SJooyung Han   std::cout << std::setprecision(6) << std::fixed << shape->n << ", "
185*5f39d1b3SJooyung Han             << shape->m << ", " << shape->k << ", " << std::flush;
186*5f39d1b3SJooyung Han 
187*5f39d1b3SJooyung Han   while (sum_time < max_time) {
188*5f39d1b3SJooyung Han     double start = time();
189*5f39d1b3SJooyung Han 
190*5f39d1b3SJooyung Han     for (int i = 0; i < shape->repetitions; ++i) {
191*5f39d1b3SJooyung Han       ops += run_gemm(shape->n, shape->m, shape->k, shape->working_set().lhs,
192*5f39d1b3SJooyung Han                       shape->working_set().rhs, shape->working_set().result);
193*5f39d1b3SJooyung Han       shape->next_working_set();
194*5f39d1b3SJooyung Han     }
195*5f39d1b3SJooyung Han     double delta_time = (time() - start);
196*5f39d1b3SJooyung Han     times.push_back(delta_time / shape->repetitions);
197*5f39d1b3SJooyung Han     sum_time += delta_time;
198*5f39d1b3SJooyung Han   }
199*5f39d1b3SJooyung Han 
200*5f39d1b3SJooyung Han   print_summary(&times, false);
201*5f39d1b3SJooyung Han }
202*5f39d1b3SJooyung Han 
main()203*5f39d1b3SJooyung Han int main() {
204*5f39d1b3SJooyung Han   std::vector<Shape> googlenet_gemms;
205*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(12544, 64, 147));
206*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(3136, 64, 64));
207*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(3136, 192, 576));
208*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(784, 64, 192));
209*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(784, 96, 192));
210*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(784, 128, 864));
211*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(784, 16, 192));
212*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(784, 32, 400));
213*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(784, 32, 192));
214*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(784, 128, 256));
215*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(784, 128, 256));
216*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(784, 192, 1152));
217*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(784, 32, 256));
218*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(784, 96, 800));
219*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(784, 64, 256));
220*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(196, 192, 480));
221*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(196, 96, 480));
222*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(196, 204, 864));
223*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(196, 16, 480));
224*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(196, 48, 400));
225*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(196, 64, 480));
226*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(196, 160, 508));
227*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(196, 112, 508));
228*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(196, 224, 1008));
229*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(196, 24, 508));
230*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(196, 64, 600));
231*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(196, 64, 508));
232*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(196, 128, 512));
233*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(196, 128, 512));
234*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(196, 256, 1152));
235*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(196, 24, 512));
236*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(196, 64, 600));
237*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(196, 64, 512));
238*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(196, 112, 512));
239*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(196, 144, 512));
240*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(196, 288, 1296));
241*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(196, 32, 512));
242*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(196, 64, 800));
243*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(196, 64, 512));
244*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(196, 256, 528));
245*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(196, 160, 528));
246*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(196, 320, 1440));
247*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(196, 32, 528));
248*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(196, 128, 800));
249*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(196, 128, 528));
250*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(49, 256, 832));
251*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(49, 160, 832));
252*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(49, 320, 1440));
253*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(49, 48, 832));
254*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(49, 128, 1200));
255*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(49, 128, 832));
256*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(49, 384, 832));
257*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(49, 192, 832));
258*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(49, 384, 1728));
259*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(49, 48, 832));
260*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(49, 128, 1200));
261*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(49, 128, 832));
262*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(16, 128, 508));
263*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(1, 1024, 2048));
264*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(1, 1008, 1024));
265*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(16, 128, 528));
266*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(1, 1024, 2048));
267*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(1, 1008, 1024));
268*5f39d1b3SJooyung Han   googlenet_gemms.push_back(Shape(1, 1008, 1024));
269*5f39d1b3SJooyung Han 
270*5f39d1b3SJooyung Han   for (auto& shape : googlenet_gemms) {
271*5f39d1b3SJooyung Han     shape.init();
272*5f39d1b3SJooyung Han   }
273*5f39d1b3SJooyung Han 
274*5f39d1b3SJooyung Han   std::vector<Shape> small_gemms;
275*5f39d1b3SJooyung Han   small_gemms.push_back(Shape(29232, 16, 25));
276*5f39d1b3SJooyung Han   small_gemms.push_back(Shape(7308, 6, 400));
277*5f39d1b3SJooyung Han   small_gemms.push_back(Shape(203, 3002, 216));
278*5f39d1b3SJooyung Han 
279*5f39d1b3SJooyung Han   for (auto& shape : small_gemms) {
280*5f39d1b3SJooyung Han     shape.init();
281*5f39d1b3SJooyung Han   }
282*5f39d1b3SJooyung Han 
283*5f39d1b3SJooyung Han   std::vector<Shape> others;
284*5f39d1b3SJooyung Han   others.push_back(Shape(100, 100, 100));
285*5f39d1b3SJooyung Han   others.push_back(Shape(1000, 1000, 1000));
286*5f39d1b3SJooyung Han   others.push_back(Shape(2000, 1000, 1000));
287*5f39d1b3SJooyung Han 
288*5f39d1b3SJooyung Han   for (auto& shape : others) {
289*5f39d1b3SJooyung Han     shape.init();
290*5f39d1b3SJooyung Han   }
291*5f39d1b3SJooyung Han 
292*5f39d1b3SJooyung Han   std::vector<Shape> lstm;
293*5f39d1b3SJooyung Han   lstm.push_back(Shape(1, 500, 320));
294*5f39d1b3SJooyung Han   lstm.push_back(Shape(1, 100, 500));
295*5f39d1b3SJooyung Han   lstm.push_back(Shape(1, 500, 500));
296*5f39d1b3SJooyung Han   lstm.push_back(Shape(1, 500, 100));
297*5f39d1b3SJooyung Han   lstm.push_back(Shape(1, 2000, 100));
298*5f39d1b3SJooyung Han 
299*5f39d1b3SJooyung Han   for (auto& shape : lstm) {
300*5f39d1b3SJooyung Han     shape.init();
301*5f39d1b3SJooyung Han   }
302*5f39d1b3SJooyung Han 
303*5f39d1b3SJooyung Han   gemmlowp::eight_bit_int_gemm::SetMaxNumThreads(4);
304*5f39d1b3SJooyung Han 
305*5f39d1b3SJooyung Han   std::cout << "Warmup run." << std::endl;
306*5f39d1b3SJooyung Han   time_all(&googlenet_gemms, 10, 1.0);
307*5f39d1b3SJooyung Han   time_all(&small_gemms, 50, 1.0);
308*5f39d1b3SJooyung Han 
309*5f39d1b3SJooyung Han   std::cout << "Timing all." << std::endl;
310*5f39d1b3SJooyung Han   time_all(&googlenet_gemms, 10, 10.0);
311*5f39d1b3SJooyung Han   time_all(&small_gemms, 50, 10.0);
312*5f39d1b3SJooyung Han 
313*5f39d1b3SJooyung Han   std::cout << "Timing separate." << std::endl;
314*5f39d1b3SJooyung Han 
315*5f39d1b3SJooyung Han   for (auto& shape : googlenet_gemms) {
316*5f39d1b3SJooyung Han     time_one(&shape, 0.10);
317*5f39d1b3SJooyung Han   }
318*5f39d1b3SJooyung Han 
319*5f39d1b3SJooyung Han   for (auto& shape : small_gemms) {
320*5f39d1b3SJooyung Han     time_one(&shape, 0.10);
321*5f39d1b3SJooyung Han   }
322*5f39d1b3SJooyung Han 
323*5f39d1b3SJooyung Han   for (auto& shape : others) {
324*5f39d1b3SJooyung Han     time_one(&shape, 0.10);
325*5f39d1b3SJooyung Han   }
326*5f39d1b3SJooyung Han 
327*5f39d1b3SJooyung Han   for (auto& shape : lstm) {
328*5f39d1b3SJooyung Han     time_one(&shape, 0.10);
329*5f39d1b3SJooyung Han   }
330*5f39d1b3SJooyung Han 
331*5f39d1b3SJooyung Han   return 0;
332*5f39d1b3SJooyung Han }
333