// xref: /aosp_15_r20/external/gemmlowp/meta/legacy_multi_thread_gemm.h (revision 5f39d1b313f0528e11bae88b3029b54b9e1033e7)
// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14*5f39d1b3SJooyung Han 
15*5f39d1b3SJooyung Han #ifndef GEMMLOWP_META_MULTI_THREAD_GEMM_H_
16*5f39d1b3SJooyung Han #define GEMMLOWP_META_MULTI_THREAD_GEMM_H_
17*5f39d1b3SJooyung Han 
18*5f39d1b3SJooyung Han #include "../internal/common.h"
19*5f39d1b3SJooyung Han 
20*5f39d1b3SJooyung Han #ifdef GEMMLOWP_NEON
21*5f39d1b3SJooyung Han 
22*5f39d1b3SJooyung Han #include "legacy_multi_thread_common.h"
23*5f39d1b3SJooyung Han #include "legacy_multi_thread_gemv.h"
24*5f39d1b3SJooyung Han #include "legacy_operations_common.h"
25*5f39d1b3SJooyung Han #include "legacy_single_thread_gemm.h"
26*5f39d1b3SJooyung Han 
27*5f39d1b3SJooyung Han namespace gemmlowp {
28*5f39d1b3SJooyung Han namespace meta {
29*5f39d1b3SJooyung Han namespace internal {
30*5f39d1b3SJooyung Han 
// Threshold, in bytes, for the RHS footprint (n * k * sizeof(IN_TYPE)). Above
// it, CacheFriendlyMatrixMatrix splits the work into chunks along n. The
// per-thread scratch size reported by the Gemm*Operation classes is derived
// from the same constant.
const std::int32_t kMaxCacheFriendlySize = 256 * 1024;

// Runs `operation.ExecuteCacheFriendlyMatrixMatrix` over the whole (m, n, k)
// problem, splitting it along n when the RHS is larger than
// kMaxCacheFriendlySize.
//
// scratch        - scratch buffer, forwarded untouched to every invocation.
// lhs, rhs       - input operands. Each chunk advances rhs by
//                  optimal_n * k elements.
// m, n, k        - problem dimensions.
// result         - output buffer. Each chunk advances it by optimal_n
//                  elements; result_stride is forwarded unchanged.
// operation      - any type exposing a const
//                  ExecuteCacheFriendlyMatrixMatrix(scratch, lhs, rhs, m, n, k,
//                  result, result_stride) member.
//
// All chunks but the last cover exactly optimal_n columns; the last one takes
// whatever remains, so it may be larger than optimal_n.
template <typename IN_TYPE, typename OUT_TYPE, typename F>
void CacheFriendlyMatrixMatrix(std::uint8_t* scratch, const IN_TYPE* lhs,
                               const IN_TYPE* rhs, std::int32_t m,
                               std::int32_t n, std::int32_t k, OUT_TYPE* result,
                               std::int32_t result_stride, const F& operation) {
  // Size and offset computations are done in 64 bits: n * k (and the element
  // offsets derived from it) can exceed the int32 range for large matrices,
  // which would otherwise wrap and wrongly select the unsplit path.
  const std::int64_t rhs_size = static_cast<std::int64_t>(n) * k *
                                static_cast<std::int64_t>(sizeof(IN_TYPE));
  if (rhs_size <= kMaxCacheFriendlySize) {
    operation.ExecuteCacheFriendlyMatrixMatrix(scratch, lhs, rhs, m, n, k,
                                               result, result_stride);
    return;
  }

  // Largest multiple of 4 columns whose k-deep slice stays under the
  // threshold, but never less than a single column.
  const std::int64_t columns_in_budget =
      4 * (kMaxCacheFriendlySize / (static_cast<std::int64_t>(k) * 4));
  const std::int32_t optimal_n = std::max<std::int32_t>(
      1, static_cast<std::int32_t>(columns_in_budget));
  const std::int32_t chunks_count_less_one = n / optimal_n - 1;
  const std::int64_t chunk_size = static_cast<std::int64_t>(optimal_n) * k;

  for (std::int32_t i = 0; i < chunks_count_less_one; ++i) {
    operation.ExecuteCacheFriendlyMatrixMatrix(
        scratch, lhs, rhs + i * chunk_size, m, optimal_n, k,
        result + i * optimal_n, result_stride);
  }

  // The final invocation absorbs the remainder of n.
  const std::int32_t n_left = n - chunks_count_less_one * optimal_n;
  operation.ExecuteCacheFriendlyMatrixMatrix(
      scratch, lhs, rhs + chunks_count_less_one * chunk_size, m, n_left, k,
      result + chunks_count_less_one * optimal_n, result_stride);
}
58*5f39d1b3SJooyung Han 
59*5f39d1b3SJooyung Han class GemmQuantized8BitOperation : public Quantized8BitOperation {
60*5f39d1b3SJooyung Han  public:
GemmQuantized8BitOperation(std::int32_t lhs_offset,std::int32_t rhs_offset,std::int32_t sum_offset,std::int32_t multiplier,std::int32_t shift)61*5f39d1b3SJooyung Han   GemmQuantized8BitOperation(std::int32_t lhs_offset, std::int32_t rhs_offset,
62*5f39d1b3SJooyung Han                              std::int32_t sum_offset, std::int32_t multiplier,
63*5f39d1b3SJooyung Han                              std::int32_t shift)
64*5f39d1b3SJooyung Han       : Quantized8BitOperation(lhs_offset, rhs_offset, sum_offset, multiplier,
65*5f39d1b3SJooyung Han                                shift) {}
66*5f39d1b3SJooyung Han 
ExecuteMatrixMatrix(std::uint8_t * scratch,const std::uint8_t * lhs,const std::uint8_t * rhs,std::int32_t m,std::int32_t n,std::int32_t k,std::uint8_t * result,std::int32_t result_stride)67*5f39d1b3SJooyung Han   void ExecuteMatrixMatrix(std::uint8_t* scratch, const std::uint8_t* lhs,
68*5f39d1b3SJooyung Han                            const std::uint8_t* rhs, std::int32_t m,
69*5f39d1b3SJooyung Han                            std::int32_t n, std::int32_t k, std::uint8_t* result,
70*5f39d1b3SJooyung Han                            std::int32_t result_stride) const {
71*5f39d1b3SJooyung Han     CacheFriendlyMatrixMatrix(scratch, lhs, rhs, m, n, k, result, result_stride,
72*5f39d1b3SJooyung Han                               *this);
73*5f39d1b3SJooyung Han   }
74*5f39d1b3SJooyung Han 
ExecuteCacheFriendlyMatrixMatrix(std::uint8_t * scratch,const std::uint8_t * lhs,const std::uint8_t * rhs,std::int32_t m,std::int32_t n,std::int32_t k,std::uint8_t * result,std::int32_t result_stride)75*5f39d1b3SJooyung Han   void ExecuteCacheFriendlyMatrixMatrix(std::uint8_t* scratch,
76*5f39d1b3SJooyung Han                                         const std::uint8_t* lhs,
77*5f39d1b3SJooyung Han                                         const std::uint8_t* rhs, std::int32_t m,
78*5f39d1b3SJooyung Han                                         std::int32_t n, std::int32_t k,
79*5f39d1b3SJooyung Han                                         std::uint8_t* result,
80*5f39d1b3SJooyung Han                                         std::int32_t result_stride) const {
81*5f39d1b3SJooyung Han     gemm_q8_strided(scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
82*5f39d1b3SJooyung Han                     sum_offset, multiplier, shift, result, result_stride);
83*5f39d1b3SJooyung Han   }
84*5f39d1b3SJooyung Han 
ScratchPerThread(std::int32_t m,std::int32_t n,std::int32_t k)85*5f39d1b3SJooyung Han   static std::int32_t ScratchPerThread(std::int32_t m, std::int32_t n,
86*5f39d1b3SJooyung Han                                        std::int32_t k) {
87*5f39d1b3SJooyung Han     return 4 * kMaxCacheFriendlySize;
88*5f39d1b3SJooyung Han   }
89*5f39d1b3SJooyung Han };
90*5f39d1b3SJooyung Han 
91*5f39d1b3SJooyung Han class GemmFloatOperation : public FloatOperation {
92*5f39d1b3SJooyung Han  public:
GemmFloatOperation(std::int32_t lhs_offset,std::int32_t rhs_offset,float result_offset)93*5f39d1b3SJooyung Han   GemmFloatOperation(std::int32_t lhs_offset, std::int32_t rhs_offset,
94*5f39d1b3SJooyung Han                      float result_offset)
95*5f39d1b3SJooyung Han       : FloatOperation(lhs_offset, rhs_offset, result_offset) {}
96*5f39d1b3SJooyung Han 
ExecuteMatrixMatrix(std::uint8_t * scratch,const std::uint8_t * lhs,const std::uint8_t * rhs,std::int32_t m,std::int32_t n,std::int32_t k,float * result,std::int32_t result_stride)97*5f39d1b3SJooyung Han   void ExecuteMatrixMatrix(std::uint8_t* scratch, const std::uint8_t* lhs,
98*5f39d1b3SJooyung Han                            const std::uint8_t* rhs, std::int32_t m,
99*5f39d1b3SJooyung Han                            std::int32_t n, std::int32_t k, float* result,
100*5f39d1b3SJooyung Han                            std::int32_t result_stride) const {
101*5f39d1b3SJooyung Han     CacheFriendlyMatrixMatrix(scratch, lhs, rhs, m, n, k, result, result_stride,
102*5f39d1b3SJooyung Han                               *this);
103*5f39d1b3SJooyung Han   }
104*5f39d1b3SJooyung Han 
ExecuteCacheFriendlyMatrixMatrix(std::uint8_t * scratch,const std::uint8_t * lhs,const std::uint8_t * rhs,std::int32_t m,std::int32_t n,std::int32_t k,float * result,std::int32_t result_stride)105*5f39d1b3SJooyung Han   void ExecuteCacheFriendlyMatrixMatrix(std::uint8_t* scratch,
106*5f39d1b3SJooyung Han                                         const std::uint8_t* lhs,
107*5f39d1b3SJooyung Han                                         const std::uint8_t* rhs, std::int32_t m,
108*5f39d1b3SJooyung Han                                         std::int32_t n, std::int32_t k,
109*5f39d1b3SJooyung Han                                         float* result,
110*5f39d1b3SJooyung Han                                         std::int32_t result_stride) const {
111*5f39d1b3SJooyung Han     gemm_f_strided(scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
112*5f39d1b3SJooyung Han                    result_offset, result, result_stride);
113*5f39d1b3SJooyung Han   }
114*5f39d1b3SJooyung Han 
ScratchPerThread(std::int32_t m,std::int32_t n,std::int32_t k)115*5f39d1b3SJooyung Han   static std::int32_t ScratchPerThread(std::int32_t m, std::int32_t n,
116*5f39d1b3SJooyung Han                                        std::int32_t k) {
117*5f39d1b3SJooyung Han     return 4 * kMaxCacheFriendlySize;
118*5f39d1b3SJooyung Han   }
119*5f39d1b3SJooyung Han };
120*5f39d1b3SJooyung Han 
121*5f39d1b3SJooyung Han class GemmInt32Operation : public Int32Operation {
122*5f39d1b3SJooyung Han  public:
GemmInt32Operation(std::int32_t lhs_offset,std::int32_t rhs_offset)123*5f39d1b3SJooyung Han   GemmInt32Operation(std::int32_t lhs_offset, std::int32_t rhs_offset)
124*5f39d1b3SJooyung Han       : Int32Operation(lhs_offset, rhs_offset) {}
125*5f39d1b3SJooyung Han 
ExecuteMatrixMatrix(std::uint8_t * scratch,const std::uint8_t * lhs,const std::uint8_t * rhs,std::int32_t m,std::int32_t n,std::int32_t k,std::int32_t * result,std::int32_t result_stride)126*5f39d1b3SJooyung Han   void ExecuteMatrixMatrix(std::uint8_t* scratch, const std::uint8_t* lhs,
127*5f39d1b3SJooyung Han                            const std::uint8_t* rhs, std::int32_t m,
128*5f39d1b3SJooyung Han                            std::int32_t n, std::int32_t k, std::int32_t* result,
129*5f39d1b3SJooyung Han                            std::int32_t result_stride) const {
130*5f39d1b3SJooyung Han     CacheFriendlyMatrixMatrix(scratch, lhs, rhs, m, n, k, result, result_stride,
131*5f39d1b3SJooyung Han                               *this);
132*5f39d1b3SJooyung Han   }
133*5f39d1b3SJooyung Han 
ExecuteCacheFriendlyMatrixMatrix(std::uint8_t * scratch,const std::uint8_t * lhs,const std::uint8_t * rhs,std::int32_t m,std::int32_t n,std::int32_t k,std::int32_t * result,std::int32_t result_stride)134*5f39d1b3SJooyung Han   void ExecuteCacheFriendlyMatrixMatrix(std::uint8_t* scratch,
135*5f39d1b3SJooyung Han                                         const std::uint8_t* lhs,
136*5f39d1b3SJooyung Han                                         const std::uint8_t* rhs, std::int32_t m,
137*5f39d1b3SJooyung Han                                         std::int32_t n, std::int32_t k,
138*5f39d1b3SJooyung Han                                         std::int32_t* result,
139*5f39d1b3SJooyung Han                                         std::int32_t result_stride) const {
140*5f39d1b3SJooyung Han     gemm_i32_strided(scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset, result,
141*5f39d1b3SJooyung Han                      result_stride);
142*5f39d1b3SJooyung Han   }
143*5f39d1b3SJooyung Han 
ScratchPerThread(std::int32_t m,std::int32_t n,std::int32_t k)144*5f39d1b3SJooyung Han   static std::int32_t ScratchPerThread(std::int32_t m, std::int32_t n,
145*5f39d1b3SJooyung Han                                        std::int32_t k) {
146*5f39d1b3SJooyung Han     return 4 * kMaxCacheFriendlySize;
147*5f39d1b3SJooyung Han   }
148*5f39d1b3SJooyung Han };
149*5f39d1b3SJooyung Han 
150*5f39d1b3SJooyung Han }  // namespace internal
151*5f39d1b3SJooyung Han 
gemm_q8_scratch(std::int32_t m,std::int32_t n,std::int32_t k,std::int32_t max_threads)152*5f39d1b3SJooyung Han std::int32_t gemm_q8_scratch(std::int32_t m, std::int32_t n, std::int32_t k,
153*5f39d1b3SJooyung Han                              std::int32_t max_threads) {
154*5f39d1b3SJooyung Han   return internal::ResolveMaxThreads(max_threads) *
155*5f39d1b3SJooyung Han          internal::GemmQuantized8BitOperation::ScratchPerThread(m, n, k);
156*5f39d1b3SJooyung Han }
157*5f39d1b3SJooyung Han 
multi_thread_gemm_q8(gemmlowp::WorkersPool * pool,std::int32_t max_threads,std::uint8_t * scratch,const std::uint8_t * lhs,const std::uint8_t * rhs,std::int32_t m,std::int32_t n,std::int32_t k,std::int32_t lhs_offset,std::int32_t rhs_offset,std::int32_t sum_offset,std::int32_t multiplier,std::int32_t shift,std::uint8_t * result)158*5f39d1b3SJooyung Han void multi_thread_gemm_q8(gemmlowp::WorkersPool* pool, std::int32_t max_threads,
159*5f39d1b3SJooyung Han                           std::uint8_t* scratch, const std::uint8_t* lhs,
160*5f39d1b3SJooyung Han                           const std::uint8_t* rhs, std::int32_t m,
161*5f39d1b3SJooyung Han                           std::int32_t n, std::int32_t k,
162*5f39d1b3SJooyung Han                           std::int32_t lhs_offset, std::int32_t rhs_offset,
163*5f39d1b3SJooyung Han                           std::int32_t sum_offset, std::int32_t multiplier,
164*5f39d1b3SJooyung Han                           std::int32_t shift, std::uint8_t* result) {
165*5f39d1b3SJooyung Han   if (m == 1) {
166*5f39d1b3SJooyung Han     multi_thread_gemv_q8(pool, max_threads, scratch, lhs, rhs, n, k, lhs_offset,
167*5f39d1b3SJooyung Han                          rhs_offset, sum_offset, multiplier, shift, result);
168*5f39d1b3SJooyung Han     return;
169*5f39d1b3SJooyung Han   } else if (n == 1) {
170*5f39d1b3SJooyung Han     multi_thread_gemv_q8(pool, max_threads, scratch, rhs, lhs, m, k, rhs_offset,
171*5f39d1b3SJooyung Han                          lhs_offset, sum_offset, multiplier, shift, result);
172*5f39d1b3SJooyung Han     return;
173*5f39d1b3SJooyung Han   }
174*5f39d1b3SJooyung Han 
175*5f39d1b3SJooyung Han   max_threads = internal::ResolveMaxThreads(max_threads);
176*5f39d1b3SJooyung Han   internal::GemmQuantized8BitOperation operation(lhs_offset, rhs_offset,
177*5f39d1b3SJooyung Han                                                  sum_offset, multiplier, shift);
178*5f39d1b3SJooyung Han   if (max_threads == 1) {
179*5f39d1b3SJooyung Han     internal::CacheFriendlyMatrixMatrix(scratch, lhs, rhs, m, n, k, result, n,
180*5f39d1b3SJooyung Han                                         operation);
181*5f39d1b3SJooyung Han   } else {
182*5f39d1b3SJooyung Han     internal::MultiThreadedMatrixMatrix(pool, max_threads, scratch, lhs, rhs, m,
183*5f39d1b3SJooyung Han                                         n, k, result, n, operation);
184*5f39d1b3SJooyung Han   }
185*5f39d1b3SJooyung Han }
186*5f39d1b3SJooyung Han 
gemm_f_scratch(std::int32_t m,std::int32_t n,std::int32_t k,std::int32_t max_threads)187*5f39d1b3SJooyung Han std::int32_t gemm_f_scratch(std::int32_t m, std::int32_t n, std::int32_t k,
188*5f39d1b3SJooyung Han                             std::int32_t max_threads) {
189*5f39d1b3SJooyung Han   return internal::ResolveMaxThreads(max_threads) *
190*5f39d1b3SJooyung Han          internal::GemmFloatOperation::ScratchPerThread(m, n, k);
191*5f39d1b3SJooyung Han }
192*5f39d1b3SJooyung Han 
multi_thread_gemm_f(gemmlowp::WorkersPool * pool,std::int32_t max_threads,std::uint8_t * scratch,const std::uint8_t * lhs,const std::uint8_t * rhs,std::int32_t m,std::int32_t n,std::int32_t k,std::int32_t lhs_offset,std::int32_t rhs_offset,float result_offset,float * result)193*5f39d1b3SJooyung Han void multi_thread_gemm_f(gemmlowp::WorkersPool* pool, std::int32_t max_threads,
194*5f39d1b3SJooyung Han                          std::uint8_t* scratch, const std::uint8_t* lhs,
195*5f39d1b3SJooyung Han                          const std::uint8_t* rhs, std::int32_t m,
196*5f39d1b3SJooyung Han                          std::int32_t n, std::int32_t k,
197*5f39d1b3SJooyung Han                          std::int32_t lhs_offset, std::int32_t rhs_offset,
198*5f39d1b3SJooyung Han                          float result_offset, float* result) {
199*5f39d1b3SJooyung Han   if (m == 1) {
200*5f39d1b3SJooyung Han     multi_thread_gemv_f(pool, max_threads, scratch, lhs, rhs, n, k, lhs_offset,
201*5f39d1b3SJooyung Han                         rhs_offset, result_offset, result);
202*5f39d1b3SJooyung Han     return;
203*5f39d1b3SJooyung Han   } else if (n == 1) {
204*5f39d1b3SJooyung Han     multi_thread_gemv_f(pool, max_threads, scratch, rhs, lhs, m, k, rhs_offset,
205*5f39d1b3SJooyung Han                         lhs_offset, result_offset, result);
206*5f39d1b3SJooyung Han     return;
207*5f39d1b3SJooyung Han   }
208*5f39d1b3SJooyung Han 
209*5f39d1b3SJooyung Han   max_threads = internal::ResolveMaxThreads(max_threads);
210*5f39d1b3SJooyung Han   internal::GemmFloatOperation operation(lhs_offset, rhs_offset, result_offset);
211*5f39d1b3SJooyung Han   if (max_threads == 1) {
212*5f39d1b3SJooyung Han     internal::CacheFriendlyMatrixMatrix(scratch, lhs, rhs, m, n, k, result, n,
213*5f39d1b3SJooyung Han                                         operation);
214*5f39d1b3SJooyung Han   } else {
215*5f39d1b3SJooyung Han     internal::MultiThreadedMatrixMatrix(pool, max_threads, scratch, lhs, rhs, m,
216*5f39d1b3SJooyung Han                                         n, k, result, n, operation);
217*5f39d1b3SJooyung Han   }
218*5f39d1b3SJooyung Han }
219*5f39d1b3SJooyung Han 
gemm_i32_scratch(std::int32_t m,std::int32_t n,std::int32_t k,std::int32_t max_threads)220*5f39d1b3SJooyung Han std::int32_t gemm_i32_scratch(std::int32_t m, std::int32_t n, std::int32_t k,
221*5f39d1b3SJooyung Han                               std::int32_t max_threads) {
222*5f39d1b3SJooyung Han   return internal::ResolveMaxThreads(max_threads) *
223*5f39d1b3SJooyung Han          internal::GemmInt32Operation::ScratchPerThread(m, n, k);
224*5f39d1b3SJooyung Han }
225*5f39d1b3SJooyung Han 
multi_thread_gemm_i32(gemmlowp::WorkersPool * pool,std::int32_t max_threads,std::uint8_t * scratch,const std::uint8_t * lhs,const std::uint8_t * rhs,std::int32_t m,std::int32_t n,std::int32_t k,std::int32_t lhs_offset,std::int32_t rhs_offset,std::int32_t * result)226*5f39d1b3SJooyung Han void multi_thread_gemm_i32(gemmlowp::WorkersPool* pool,
227*5f39d1b3SJooyung Han                            std::int32_t max_threads, std::uint8_t* scratch,
228*5f39d1b3SJooyung Han                            const std::uint8_t* lhs, const std::uint8_t* rhs,
229*5f39d1b3SJooyung Han                            std::int32_t m, std::int32_t n, std::int32_t k,
230*5f39d1b3SJooyung Han                            std::int32_t lhs_offset, std::int32_t rhs_offset,
231*5f39d1b3SJooyung Han                            std::int32_t* result) {
232*5f39d1b3SJooyung Han   if (m == 1) {
233*5f39d1b3SJooyung Han     multi_thread_gemv_i32(pool, max_threads, scratch, lhs, rhs, n, k,
234*5f39d1b3SJooyung Han                           lhs_offset, rhs_offset, result);
235*5f39d1b3SJooyung Han     return;
236*5f39d1b3SJooyung Han   } else if (n == 1) {
237*5f39d1b3SJooyung Han     multi_thread_gemv_i32(pool, max_threads, scratch, rhs, lhs, m, k,
238*5f39d1b3SJooyung Han                           rhs_offset, lhs_offset, result);
239*5f39d1b3SJooyung Han     return;
240*5f39d1b3SJooyung Han   }
241*5f39d1b3SJooyung Han 
242*5f39d1b3SJooyung Han   max_threads = internal::ResolveMaxThreads(max_threads);
243*5f39d1b3SJooyung Han   internal::GemmInt32Operation operation(lhs_offset, rhs_offset);
244*5f39d1b3SJooyung Han   if (max_threads == 1) {
245*5f39d1b3SJooyung Han     internal::CacheFriendlyMatrixMatrix(scratch, lhs, rhs, m, n, k, result, n,
246*5f39d1b3SJooyung Han                                         operation);
247*5f39d1b3SJooyung Han   } else {
248*5f39d1b3SJooyung Han     internal::MultiThreadedMatrixMatrix(pool, max_threads, scratch, lhs, rhs, m,
249*5f39d1b3SJooyung Han                                         n, k, result, n, operation);
250*5f39d1b3SJooyung Han   }
251*5f39d1b3SJooyung Han }
252*5f39d1b3SJooyung Han 
253*5f39d1b3SJooyung Han }  // namespace meta
254*5f39d1b3SJooyung Han }  // namespace gemmlowp
255*5f39d1b3SJooyung Han 
256*5f39d1b3SJooyung Han #else
257*5f39d1b3SJooyung Han #warning "Meta gemm fast-path requires GEMMLOWP_NEON_(32|64)!"
258*5f39d1b3SJooyung Han #endif
259*5f39d1b3SJooyung Han 
260*5f39d1b3SJooyung Han #endif  // GEMMLOWP_META_MULTI_THREAD_GEMM_H_
261