xref: /aosp_15_r20/external/gemmlowp/internal/kernel_neon.h (revision 5f39d1b313f0528e11bae88b3029b54b9e1033e7)
1*5f39d1b3SJooyung Han // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2*5f39d1b3SJooyung Han //
3*5f39d1b3SJooyung Han // Licensed under the Apache License, Version 2.0 (the "License");
4*5f39d1b3SJooyung Han // you may not use this file except in compliance with the License.
5*5f39d1b3SJooyung Han // You may obtain a copy of the License at
6*5f39d1b3SJooyung Han //
7*5f39d1b3SJooyung Han //     http://www.apache.org/licenses/LICENSE-2.0
8*5f39d1b3SJooyung Han //
9*5f39d1b3SJooyung Han // Unless required by applicable law or agreed to in writing, software
10*5f39d1b3SJooyung Han // distributed under the License is distributed on an "AS IS" BASIS,
11*5f39d1b3SJooyung Han // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*5f39d1b3SJooyung Han // See the License for the specific language governing permissions and
13*5f39d1b3SJooyung Han // limitations under the License.
14*5f39d1b3SJooyung Han 
15*5f39d1b3SJooyung Han // kernel_neon.h: a collection of NEON optimized kernels.
16*5f39d1b3SJooyung Han // Check in kernel_default.h which one(s) are actually used by default.
17*5f39d1b3SJooyung Han // Others are mere experiments; they are still covered by tests
18*5f39d1b3SJooyung Han // in case they might be useful some day.
19*5f39d1b3SJooyung Han 
20*5f39d1b3SJooyung Han #ifndef GEMMLOWP_INTERNAL_KERNEL_NEON_H_
21*5f39d1b3SJooyung Han #define GEMMLOWP_INTERNAL_KERNEL_NEON_H_
22*5f39d1b3SJooyung Han 
23*5f39d1b3SJooyung Han #include "kernel.h"
24*5f39d1b3SJooyung Han 
25*5f39d1b3SJooyung Han #include <arm_neon.h>
26*5f39d1b3SJooyung Han #include <cassert>
27*5f39d1b3SJooyung Han 
28*5f39d1b3SJooyung Han namespace gemmlowp {
29*5f39d1b3SJooyung Han 
30*5f39d1b3SJooyung Han // The kernels here are specifically arm 32bit assembly, not arm 64bit.
31*5f39d1b3SJooyung Han #ifdef GEMMLOWP_NEON_32
32*5f39d1b3SJooyung Han 
33*5f39d1b3SJooyung Han // Our main GEMM kernel.
34*5f39d1b3SJooyung Han struct NEON_32_Kernel12x4Depth2 : KernelBase {
35*5f39d1b3SJooyung Han   typedef KernelFormat<KernelSideFormat<CellFormat<4, 2>, 3>,
36*5f39d1b3SJooyung Han                        KernelSideFormat<CellFormat<4, 2>, 1> >
37*5f39d1b3SJooyung Han       Format;
38*5f39d1b3SJooyung Han 
NameNEON_32_Kernel12x4Depth239*5f39d1b3SJooyung Han   const char* Name() const override { return "NEON, 12x4, depth 2"; }
40*5f39d1b3SJooyung Han 
41*5f39d1b3SJooyung Han   // TODO(benoitjacob): reorder function arguments so dst comes last
RunNEON_32_Kernel12x4Depth242*5f39d1b3SJooyung Han   void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride,
43*5f39d1b3SJooyung Han            std::size_t dst_col_stride, const std::uint8_t* lhs_ptr,
44*5f39d1b3SJooyung Han            const std::uint8_t* rhs_ptr, std::size_t start_depth,
45*5f39d1b3SJooyung Han            std::size_t run_depth) const override {
46*5f39d1b3SJooyung Han     ScopedProfilingLabel label("optimized kernel (NEON 12x4)");
47*5f39d1b3SJooyung Han 
48*5f39d1b3SJooyung Han // For iOS assembler, the %= style of local labels cause compilation errors,
49*5f39d1b3SJooyung Han //  so use numerical ones instead. See
50*5f39d1b3SJooyung Han // http://stackoverflow.com/questions/3898435/labels-in-gcc-inline-assembly
51*5f39d1b3SJooyung Han // If you add any labels, remember to undef them at the end.
52*5f39d1b3SJooyung Han #define GEMMLOWP_LABEL_CLEAR_ACCUMULATORS "1"
53*5f39d1b3SJooyung Han #define GEMMLOWP_LABEL_BEFORE_LOOP "2"
54*5f39d1b3SJooyung Han #define GEMMLOWP_LABEL_LOOP "3"
55*5f39d1b3SJooyung Han #define GEMMLOWP_LABEL_AFTER_LOOP "4"
56*5f39d1b3SJooyung Han 
57*5f39d1b3SJooyung Han     assert(dst_row_stride == 1);
58*5f39d1b3SJooyung Han     (void)dst_row_stride;
59*5f39d1b3SJooyung Han     asm volatile(
60*5f39d1b3SJooyung Han         // Overview of register layout:
61*5f39d1b3SJooyung Han         //
62*5f39d1b3SJooyung Han         // A 2x4 cell of Rhs is stored in 16bit in d0--d1 (q0).
63*5f39d1b3SJooyung Han         // A 12x2 block of 3 4x2 cells Lhs is stored in 16bit in d2--d7
64*5f39d1b3SJooyung Han         // (q1--q3).
65*5f39d1b3SJooyung Han         // A 12x4 block of accumulators is stored in 32bit in q4--q15.
66*5f39d1b3SJooyung Han         //
67*5f39d1b3SJooyung Han         //                   +-----+-----+-----+-----+
68*5f39d1b3SJooyung Han         //                   |d0[0]|d0[1]|d0[2]|d0[3]|
69*5f39d1b3SJooyung Han         //              Rhs  +-----+-----+-----+-----+
70*5f39d1b3SJooyung Han         //                   |d1[0]|d1[1]|d1[2]|d1[3]|
71*5f39d1b3SJooyung Han         //                   +-----+-----+-----+-----+
72*5f39d1b3SJooyung Han         //
73*5f39d1b3SJooyung Han         //                   |     |     |     |     |
74*5f39d1b3SJooyung Han         //
75*5f39d1b3SJooyung Han         //    Lhs            |     |     |     |     |
76*5f39d1b3SJooyung Han         //
77*5f39d1b3SJooyung Han         //  +--+--+ - - - -  +-----+-----+-----+-----+
78*5f39d1b3SJooyung Han         //  |d2|d3|          | q4  | q5  | q6  | q7  |
79*5f39d1b3SJooyung Han         //  |d2|d3|          | q4  | q5  | q6  | q7  |
80*5f39d1b3SJooyung Han         //  |d2|d3|          | q4  | q5  | q6  | q7  |
81*5f39d1b3SJooyung Han         //  |d2|d3|          | q4  | q5  | q6  | q7  |
82*5f39d1b3SJooyung Han         //  +--+--+ - - - -  +-----+-----+-----+-----+
83*5f39d1b3SJooyung Han         //  |d4|d5|          | q8  | q9  | q10 | q11 |
84*5f39d1b3SJooyung Han         //  |d4|d5|          | q8  | q9  | q10 | q11 |
85*5f39d1b3SJooyung Han         //  |d4|d5|          | q8  | q9  | q10 | q11 |
86*5f39d1b3SJooyung Han         //  |d4|d5|          | q8  | q9  | q10 | q11 |
87*5f39d1b3SJooyung Han         //  +--+--+ - - - -  +-----+-----+-----+-----+
88*5f39d1b3SJooyung Han         //  |d6|d7|          | q12 | q13 | q14 | q15 |
89*5f39d1b3SJooyung Han         //  |d6|d7|          | q12 | q13 | q14 | q15 |
90*5f39d1b3SJooyung Han         //  |d6|d7|          | q12 | q13 | q14 | q15 |
91*5f39d1b3SJooyung Han         //  |d6|d7|          | q12 | q13 | q14 | q15 |
92*5f39d1b3SJooyung Han         //  +--+--+ - - - -  +-----+-----+-----+-----+
93*5f39d1b3SJooyung Han         //
94*5f39d1b3SJooyung Han         //                            Accumulator
95*5f39d1b3SJooyung Han 
96*5f39d1b3SJooyung Han         // Load 1 Rhs cell of size 2x4
97*5f39d1b3SJooyung Han         "vld1.8 {d0}, [%[rhs_ptr]]!\n"
98*5f39d1b3SJooyung Han         // Load 3 Lhs cells of size 4x2 each
99*5f39d1b3SJooyung Han         "vld1.8 {d2}, [%[lhs_ptr]]!\n"
100*5f39d1b3SJooyung Han         "vld1.8 {d4}, [%[lhs_ptr]]!\n"
101*5f39d1b3SJooyung Han         "vld1.8 {d6}, [%[lhs_ptr]]!\n"
102*5f39d1b3SJooyung Han 
103*5f39d1b3SJooyung Han         // Check if start_depth==0 to decide whether we will clear
104*5f39d1b3SJooyung Han         // accumulators or load existing accumulators.
105*5f39d1b3SJooyung Han         "cmp %[start_depth], #0\n"
106*5f39d1b3SJooyung Han 
107*5f39d1b3SJooyung Han         // Multiply dst_col_stride by 4 == sizeof(int32) to use
108*5f39d1b3SJooyung Han         // it as a byte offset below.
109*5f39d1b3SJooyung Han         "lsl %[dst_col_stride], #2\n"
110*5f39d1b3SJooyung Han 
111*5f39d1b3SJooyung Han         "beq " GEMMLOWP_LABEL_CLEAR_ACCUMULATORS
112*5f39d1b3SJooyung Han         "f\n"
113*5f39d1b3SJooyung Han 
114*5f39d1b3SJooyung Han         // Load accumulators (start_depth != 0)
115*5f39d1b3SJooyung Han         "mov r1, %[dst_ptr]\n"
116*5f39d1b3SJooyung Han         "subs %[run_depth], #2\n"
117*5f39d1b3SJooyung Han         "mov r0, r1\n"
118*5f39d1b3SJooyung Han         "vld1.32 {d8, d9},   [r0]!\n"
119*5f39d1b3SJooyung Han         "add r1, %[dst_col_stride]\n"
120*5f39d1b3SJooyung Han         "vld1.32 {d16, d17}, [r0]!\n"
121*5f39d1b3SJooyung Han         "vld1.32 {d24, d25}, [r0]\n"
122*5f39d1b3SJooyung Han         "mov r0, r1\n"
123*5f39d1b3SJooyung Han         "vld1.32 {d10, d11}, [r0]!\n"
124*5f39d1b3SJooyung Han         "add r1, %[dst_col_stride]\n"
125*5f39d1b3SJooyung Han         "vld1.32 {d18, d19}, [r0]!\n"
126*5f39d1b3SJooyung Han         "vld1.32 {d26, d27}, [r0]\n"
127*5f39d1b3SJooyung Han         "mov r0, r1\n"
128*5f39d1b3SJooyung Han         "vld1.32 {d12, d13}, [r0]!\n"
129*5f39d1b3SJooyung Han         "add r1, %[dst_col_stride]\n"
130*5f39d1b3SJooyung Han         "vld1.32 {d20, d21}, [r0]!\n"
131*5f39d1b3SJooyung Han         "vld1.32 {d28, d29}, [r0]\n"
132*5f39d1b3SJooyung Han         "mov r0, r1\n"
133*5f39d1b3SJooyung Han         "vld1.32 {d14, d15}, [r0]!\n"
134*5f39d1b3SJooyung Han         "vld1.32 {d22, d23}, [r0]!\n"
135*5f39d1b3SJooyung Han         "vld1.32 {d30, d31}, [r0]\n"
136*5f39d1b3SJooyung Han 
137*5f39d1b3SJooyung Han         "b " GEMMLOWP_LABEL_BEFORE_LOOP "f\n"
138*5f39d1b3SJooyung Han 
139*5f39d1b3SJooyung Han         GEMMLOWP_LABEL_CLEAR_ACCUMULATORS
140*5f39d1b3SJooyung Han         ":\n"
141*5f39d1b3SJooyung Han 
142*5f39d1b3SJooyung Han         // Clear accumulators (start_depth == 0)
143*5f39d1b3SJooyung Han         "vmov.s32 q4, #0\n"
144*5f39d1b3SJooyung Han         "subs %[run_depth], #2\n"
145*5f39d1b3SJooyung Han         "vmov.s32 q8, q4\n"
146*5f39d1b3SJooyung Han         "vmov.s32 q12, q4\n"
147*5f39d1b3SJooyung Han         "vmov.s32 q5, q4\n"
148*5f39d1b3SJooyung Han         "vmov.s32 q9, q4\n"
149*5f39d1b3SJooyung Han         "vmov.s32 q13, q4\n"
150*5f39d1b3SJooyung Han         "vmov.s32 q6, q4\n"
151*5f39d1b3SJooyung Han         "vmov.s32 q10, q4\n"
152*5f39d1b3SJooyung Han         "vmov.s32 q14, q4\n"
153*5f39d1b3SJooyung Han         "vmov.s32 q7, q4\n"
154*5f39d1b3SJooyung Han         "vmov.s32 q11, q4\n"
155*5f39d1b3SJooyung Han         "vmov.s32 q15, q4\n"
156*5f39d1b3SJooyung Han 
157*5f39d1b3SJooyung Han         GEMMLOWP_LABEL_BEFORE_LOOP
158*5f39d1b3SJooyung Han         ":\n"
159*5f39d1b3SJooyung Han 
160*5f39d1b3SJooyung Han         // If there are only two levels of depth, skip the loop.
161*5f39d1b3SJooyung Han         "beq " GEMMLOWP_LABEL_AFTER_LOOP "f\n"
162*5f39d1b3SJooyung Han 
163*5f39d1b3SJooyung Han         GEMMLOWP_LABEL_LOOP
164*5f39d1b3SJooyung Han         ":\n"
165*5f39d1b3SJooyung Han         // Expand Lhs/Rhs cells to 16 bit.
166*5f39d1b3SJooyung Han         // Note: moving theses vmovls further down to allow for
167*5f39d1b3SJooyung Han         // longer data pipelining helps a little on A57 but is
168*5f39d1b3SJooyung Han         // harmful on A53 --- It looks as if A53 doesn't like
169*5f39d1b3SJooyung Han         // interleaving vmovl's into the vmlal's.
170*5f39d1b3SJooyung Han         "vmovl.u8 q0, d0\n"
171*5f39d1b3SJooyung Han         "vmovl.u8 q1, d2\n"
172*5f39d1b3SJooyung Han         "vmovl.u8 q2, d4\n"
173*5f39d1b3SJooyung Han         "vmovl.u8 q3, d6\n"
174*5f39d1b3SJooyung Han 
175*5f39d1b3SJooyung Han         // Multiply-accumulate, level of depth 0
176*5f39d1b3SJooyung Han         "vmlal.u16 q4, d2, d0[0]\n"
177*5f39d1b3SJooyung Han         "vmlal.u16 q5, d2, d0[1]\n"
178*5f39d1b3SJooyung Han         "vmlal.u16 q6, d2, d0[2]\n"
179*5f39d1b3SJooyung Han         "vmlal.u16 q7, d2, d0[3]\n"
180*5f39d1b3SJooyung Han         "vldr d2, [%[lhs_ptr]]\n"
181*5f39d1b3SJooyung Han         "vmlal.u16 q8, d4, d0[0]\n"
182*5f39d1b3SJooyung Han         "vmlal.u16 q9, d4, d0[1]\n"
183*5f39d1b3SJooyung Han         "vmlal.u16 q10, d4, d0[2]\n"
184*5f39d1b3SJooyung Han         "vmlal.u16 q11, d4, d0[3]\n"
185*5f39d1b3SJooyung Han         "vldr d4, [%[lhs_ptr], #8]\n"
186*5f39d1b3SJooyung Han         "vmlal.u16 q12, d6, d0[0]\n"
187*5f39d1b3SJooyung Han         "vmlal.u16 q13, d6, d0[1]\n"
188*5f39d1b3SJooyung Han         "vmlal.u16 q14, d6, d0[2]\n"
189*5f39d1b3SJooyung Han         "vmlal.u16 q15, d6, d0[3]\n"
190*5f39d1b3SJooyung Han         "vldr d6, [%[lhs_ptr], #16]\n"
191*5f39d1b3SJooyung Han         "vldr d0, [%[rhs_ptr]]\n"
192*5f39d1b3SJooyung Han 
193*5f39d1b3SJooyung Han         // Multiply-accumulate, level of depth 1
194*5f39d1b3SJooyung Han         "vmlal.u16 q4, d3, d1[0]\n"
195*5f39d1b3SJooyung Han         "vmlal.u16 q5, d3, d1[1]\n"
196*5f39d1b3SJooyung Han         "add %[lhs_ptr], #24\n"
197*5f39d1b3SJooyung Han         "vmlal.u16 q6, d3, d1[2]\n"
198*5f39d1b3SJooyung Han         "vmlal.u16 q7, d3, d1[3]\n"
199*5f39d1b3SJooyung Han         "add %[rhs_ptr], #8\n"
200*5f39d1b3SJooyung Han         "vmlal.u16 q8, d5, d1[0]\n"
201*5f39d1b3SJooyung Han         "vmlal.u16 q9, d5, d1[1]\n"
202*5f39d1b3SJooyung Han         "subs %[run_depth], #2\n"
203*5f39d1b3SJooyung Han         "vmlal.u16 q10, d5, d1[2]\n"
204*5f39d1b3SJooyung Han         "vmlal.u16 q11, d5, d1[3]\n"
205*5f39d1b3SJooyung Han         "vmlal.u16 q12, d7, d1[0]\n"
206*5f39d1b3SJooyung Han         "vmlal.u16 q13, d7, d1[1]\n"
207*5f39d1b3SJooyung Han         "vmlal.u16 q14, d7, d1[2]\n"
208*5f39d1b3SJooyung Han         "vmlal.u16 q15, d7, d1[3]\n"
209*5f39d1b3SJooyung Han 
210*5f39d1b3SJooyung Han         "bne " GEMMLOWP_LABEL_LOOP "b\n"
211*5f39d1b3SJooyung Han 
212*5f39d1b3SJooyung Han         GEMMLOWP_LABEL_AFTER_LOOP
213*5f39d1b3SJooyung Han         ":\n"
214*5f39d1b3SJooyung Han 
215*5f39d1b3SJooyung Han         // Do remaining arithmetic for the last 2 levels of depth.
216*5f39d1b3SJooyung Han 
217*5f39d1b3SJooyung Han         // Expand Lhs/Rhs cells to 16 bit.
218*5f39d1b3SJooyung Han         "vmovl.u8 q0, d0\n"
219*5f39d1b3SJooyung Han         "vmovl.u8 q1, d2\n"
220*5f39d1b3SJooyung Han         "vmovl.u8 q2, d4\n"
221*5f39d1b3SJooyung Han         "vmovl.u8 q3, d6\n"
222*5f39d1b3SJooyung Han 
223*5f39d1b3SJooyung Han         // Multiply-accumulate, level of depth 0
224*5f39d1b3SJooyung Han         "vmlal.u16 q4, d2, d0[0]\n"
225*5f39d1b3SJooyung Han         "vmlal.u16 q5, d2, d0[1]\n"
226*5f39d1b3SJooyung Han         "vmlal.u16 q6, d2, d0[2]\n"
227*5f39d1b3SJooyung Han         "vmlal.u16 q7, d2, d0[3]\n"
228*5f39d1b3SJooyung Han         "vmlal.u16 q8, d4, d0[0]\n"
229*5f39d1b3SJooyung Han         "vmlal.u16 q9, d4, d0[1]\n"
230*5f39d1b3SJooyung Han         "vmlal.u16 q10, d4, d0[2]\n"
231*5f39d1b3SJooyung Han         "vmlal.u16 q11, d4, d0[3]\n"
232*5f39d1b3SJooyung Han         "vmlal.u16 q12, d6, d0[0]\n"
233*5f39d1b3SJooyung Han         "vmlal.u16 q13, d6, d0[1]\n"
234*5f39d1b3SJooyung Han         "vmlal.u16 q14, d6, d0[2]\n"
235*5f39d1b3SJooyung Han         "vmlal.u16 q15, d6, d0[3]\n"
236*5f39d1b3SJooyung Han 
237*5f39d1b3SJooyung Han         // Multiply-accumulate, level of depth 1
238*5f39d1b3SJooyung Han         "vmlal.u16 q4, d3, d1[0]\n"
239*5f39d1b3SJooyung Han         "vmlal.u16 q5, d3, d1[1]\n"
240*5f39d1b3SJooyung Han         "vmlal.u16 q6, d3, d1[2]\n"
241*5f39d1b3SJooyung Han         "vmlal.u16 q7, d3, d1[3]\n"
242*5f39d1b3SJooyung Han         "vmlal.u16 q8, d5, d1[0]\n"
243*5f39d1b3SJooyung Han         "vmlal.u16 q9, d5, d1[1]\n"
244*5f39d1b3SJooyung Han         "vmlal.u16 q10, d5, d1[2]\n"
245*5f39d1b3SJooyung Han         "vmlal.u16 q11, d5, d1[3]\n"
246*5f39d1b3SJooyung Han         "vmlal.u16 q12, d7, d1[0]\n"
247*5f39d1b3SJooyung Han         "vmlal.u16 q13, d7, d1[1]\n"
248*5f39d1b3SJooyung Han         "vmlal.u16 q14, d7, d1[2]\n"
249*5f39d1b3SJooyung Han         "vmlal.u16 q15, d7, d1[3]\n"
250*5f39d1b3SJooyung Han 
251*5f39d1b3SJooyung Han         // Store accumulators
252*5f39d1b3SJooyung Han         "mov r1, %[dst_ptr]\n"
253*5f39d1b3SJooyung Han         "mov r0, r1\n"
254*5f39d1b3SJooyung Han         "vst1.32 {d8, d9},   [r0]!\n"
255*5f39d1b3SJooyung Han         "add r1, %[dst_col_stride]\n"
256*5f39d1b3SJooyung Han         "vst1.32 {d16, d17}, [r0]!\n"
257*5f39d1b3SJooyung Han         "vst1.32 {d24, d25}, [r0]\n"
258*5f39d1b3SJooyung Han         "mov r0, r1\n"
259*5f39d1b3SJooyung Han         "vst1.32 {d10, d11}, [r0]!\n"
260*5f39d1b3SJooyung Han         "add r1, %[dst_col_stride]\n"
261*5f39d1b3SJooyung Han         "vst1.32 {d18, d19}, [r0]!\n"
262*5f39d1b3SJooyung Han         "vst1.32 {d26, d27}, [r0]\n"
263*5f39d1b3SJooyung Han         "mov r0, r1\n"
264*5f39d1b3SJooyung Han         "vst1.32 {d12, d13}, [r0]!\n"
265*5f39d1b3SJooyung Han         "add r1, %[dst_col_stride]\n"
266*5f39d1b3SJooyung Han         "vst1.32 {d20, d21}, [r0]!\n"
267*5f39d1b3SJooyung Han         "vst1.32 {d28, d29}, [r0]\n"
268*5f39d1b3SJooyung Han         "mov r0, r1\n"
269*5f39d1b3SJooyung Han         "vst1.32 {d14, d15}, [r0]!\n"
270*5f39d1b3SJooyung Han         "vst1.32 {d22, d23}, [r0]!\n"
271*5f39d1b3SJooyung Han         "vst1.32 {d30, d31}, [r0]\n"
272*5f39d1b3SJooyung Han         :  // outputs
273*5f39d1b3SJooyung Han         [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
274*5f39d1b3SJooyung Han         [dst_ptr] "+r"(dst_ptr),
275*5f39d1b3SJooyung Han         [run_depth] "+r"(run_depth)
276*5f39d1b3SJooyung Han         :  // inputs
277*5f39d1b3SJooyung Han         [start_depth] "r"(start_depth),
278*5f39d1b3SJooyung Han         [dst_col_stride] "r"(dst_col_stride)
279*5f39d1b3SJooyung Han         :  // clobbers
280*5f39d1b3SJooyung Han         "cc", "memory", "r0", "r1",
281*5f39d1b3SJooyung Han         // note: someone on internet says that quad registers are
282*5f39d1b3SJooyung Han         // unsupported in the clobber list!
283*5f39d1b3SJooyung Han         "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
284*5f39d1b3SJooyung Han         "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
285*5f39d1b3SJooyung Han         "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30",
286*5f39d1b3SJooyung Han         "d31");
287*5f39d1b3SJooyung Han #undef GEMMLOWP_LABEL_CLEAR_ACCUMULATORS
288*5f39d1b3SJooyung Han #undef GEMMLOWP_LABEL_BEFORE_LOOP
289*5f39d1b3SJooyung Han #undef GEMMLOWP_LABEL_LOOP
290*5f39d1b3SJooyung Han #undef GEMMLOWP_LABEL_AFTER_LOOP
291*5f39d1b3SJooyung Han   }
292*5f39d1b3SJooyung Han };
293*5f39d1b3SJooyung Han 
294*5f39d1b3SJooyung Han struct NEON_32_Kernel12x4Depth2Assuming12BitProducts : KernelBase {
295*5f39d1b3SJooyung Han   typedef KernelFormat<
296*5f39d1b3SJooyung Han       KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 3>,
297*5f39d1b3SJooyung Han       KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 1> >
298*5f39d1b3SJooyung Han       Format;
299*5f39d1b3SJooyung Han 
NameNEON_32_Kernel12x4Depth2Assuming12BitProducts300*5f39d1b3SJooyung Han   const char* Name() const override {
301*5f39d1b3SJooyung Han     return "NEON, 12x4, depth 2, assuming 12-bit products";
302*5f39d1b3SJooyung Han   }
303*5f39d1b3SJooyung Han 
304*5f39d1b3SJooyung Han   // TODO(benoitjacob): reorder function arguments so dst comes last
RunNEON_32_Kernel12x4Depth2Assuming12BitProducts305*5f39d1b3SJooyung Han   void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride,
306*5f39d1b3SJooyung Han            std::size_t dst_col_stride, const std::uint8_t* lhs_ptr,
307*5f39d1b3SJooyung Han            const std::uint8_t* rhs_ptr, std::size_t start_depth,
308*5f39d1b3SJooyung Han            std::size_t run_depth) const override {
309*5f39d1b3SJooyung Han     ScopedProfilingLabel label(
310*5f39d1b3SJooyung Han         "optimized kernel (NEON 12x4, assuming 12-bit products)");
311*5f39d1b3SJooyung Han     assert(dst_row_stride == 1);
312*5f39d1b3SJooyung Han     (void)dst_row_stride;
313*5f39d1b3SJooyung Han 
314*5f39d1b3SJooyung Han // See comments above for why we need local numerical labels in our asm.
315*5f39d1b3SJooyung Han #define GEMMLOWP_LOOP_NEON_32_KERNEL_12X4_DEPTH2_ASSUMING_12BIT_PRODUCTS "1"
316*5f39d1b3SJooyung Han #define GEMMLOWP_LOAD_GLOBAL_ACCUMULATORS_NEON_32_KERNEL_12X4_DEPTH2_12BIT "2"
317*5f39d1b3SJooyung Han #define GEMMLOWP_LABEL_32 "3"
318*5f39d1b3SJooyung Han #define GEMMLOWP_LABEL_24 "4"
319*5f39d1b3SJooyung Han #define GEMMLOWP_LABEL_16 "5"
320*5f39d1b3SJooyung Han #define GEMMLOWP_LABEL_8 "6"
321*5f39d1b3SJooyung Han #define GEMMLOWP_LABEL_2 "7"
322*5f39d1b3SJooyung Han 
323*5f39d1b3SJooyung Han     // This kernel is special in that it uses local 16-bit accumulators.
324*5f39d1b3SJooyung Han     // Because it assumes that each product fits in 12 bits, it can accumulate
325*5f39d1b3SJooyung Han     // 16 products into a local 16-bit accumulator without risking overflow.
326*5f39d1b3SJooyung Han     // At that point, it must accumulate these local 16-bit accumulators back
327*5f39d1b3SJooyung Han     // into global 32-bit accumulators, which have to be stored in memory for
328*5f39d1b3SJooyung Han     // lack of register space.
329*5f39d1b3SJooyung Han     // This 12x4 block of global accumulators is laid out as 3 cells of size 4x4
330*5f39d1b3SJooyung Han     // stored in diagonal-major order like this for the first 4x4 cell:
331*5f39d1b3SJooyung Han     //
332*5f39d1b3SJooyung Han     //   0   4   8  12
333*5f39d1b3SJooyung Han     //  13   1   5   9
334*5f39d1b3SJooyung Han     //  10  14   2   6
335*5f39d1b3SJooyung Han     //   7  11  15   3
336*5f39d1b3SJooyung Han     //
337*5f39d1b3SJooyung Han     // and likewise for the 2nd  cell (16--31) and 3rd cell (32--47)
338*5f39d1b3SJooyung Han     std::int32_t global_accumulators[3 * 4 * 4];
339*5f39d1b3SJooyung Han     asm volatile(
340*5f39d1b3SJooyung Han         // Compute stride between consecutive columns, in bytes
341*5f39d1b3SJooyung Han         "mov r0, #4\n"  // multiply by 4 = sizeof(int32)
342*5f39d1b3SJooyung Han         "mul %[dst_col_stride], r0\n"
343*5f39d1b3SJooyung Han 
344*5f39d1b3SJooyung Han         "cmp %[start_depth], #0\n"
345*5f39d1b3SJooyung Han         "bne"
346*5f39d1b3SJooyung Han         " " GEMMLOWP_LOAD_GLOBAL_ACCUMULATORS_NEON_32_KERNEL_12X4_DEPTH2_12BIT
347*5f39d1b3SJooyung Han         "f\n"
348*5f39d1b3SJooyung Han 
349*5f39d1b3SJooyung Han         // If start_depth==0, we need to clear our global accumulators
350*5f39d1b3SJooyung Han         "mov r0, %[global_accumulators]\n"
351*5f39d1b3SJooyung Han         "vmov.s32 q8, #0\n"
352*5f39d1b3SJooyung Han         "vmov.s32 q9, q8\n"
353*5f39d1b3SJooyung Han         "vst1.32 {d16,d17,d18,d19}, [r0]!\n"
354*5f39d1b3SJooyung Han         "vst1.32 {d16,d17,d18,d19}, [r0]!\n"
355*5f39d1b3SJooyung Han         "vst1.32 {d16,d17,d18,d19}, [r0]!\n"
356*5f39d1b3SJooyung Han         "vst1.32 {d16,d17,d18,d19}, [r0]!\n"
357*5f39d1b3SJooyung Han         "vst1.32 {d16,d17,d18,d19}, [r0]!\n"
358*5f39d1b3SJooyung Han         "vst1.32 {d16,d17,d18,d19}, [r0]!\n"
359*5f39d1b3SJooyung Han         "b " GEMMLOWP_LOOP_NEON_32_KERNEL_12X4_DEPTH2_ASSUMING_12BIT_PRODUCTS
360*5f39d1b3SJooyung Han         "f\n"
361*5f39d1b3SJooyung Han 
362*5f39d1b3SJooyung Han         // If start_depth!=0, we need to load our existing global accumulators
363*5f39d1b3SJooyung Han         GEMMLOWP_LOAD_GLOBAL_ACCUMULATORS_NEON_32_KERNEL_12X4_DEPTH2_12BIT
364*5f39d1b3SJooyung Han         ":\n"
365*5f39d1b3SJooyung Han         // Load global accumulators from destination matrix, column-major
366*5f39d1b3SJooyung Han         "mov r1, %[dst_ptr]\n"
367*5f39d1b3SJooyung Han         "mov r0, %[dst_col_stride]\n"
368*5f39d1b3SJooyung Han         "sub r0, #32\n"
369*5f39d1b3SJooyung Han         "vld1.32 {d0,d1}, [r1]!\n"
370*5f39d1b3SJooyung Han         "vld1.32 {d8,d9}, [r1]!\n"
371*5f39d1b3SJooyung Han         "vld1.32 {d16,d17}, [r1], r0\n"
372*5f39d1b3SJooyung Han         "vld1.32 {d2,d3}, [r1]!\n"
373*5f39d1b3SJooyung Han         "vld1.32 {d10,d11}, [r1]!\n"
374*5f39d1b3SJooyung Han         "vld1.32 {d18,d19}, [r1], r0\n"
375*5f39d1b3SJooyung Han         "vld1.32 {d4,d5}, [r1]!\n"
376*5f39d1b3SJooyung Han         "vld1.32 {d12,d13}, [r1]!\n"
377*5f39d1b3SJooyung Han         "vld1.32 {d20,d21}, [r1], r0\n"
378*5f39d1b3SJooyung Han         "vld1.32 {d6,d7}, [r1]!\n"
379*5f39d1b3SJooyung Han         "vld1.32 {d14,d15}, [r1]!\n"
380*5f39d1b3SJooyung Han         "vld1.32 {d22,d23}, [r1], r0\n"
381*5f39d1b3SJooyung Han         // Now we need to convert the global accumulator registers to
382*5f39d1b3SJooyung Han         // 4x4-block-wise diagonal-major order. What we effectively want to do
383*5f39d1b3SJooyung Han         // is to rotate the rows, however the accumulators are stored in
384*5f39d1b3SJooyung Han         // column-major order in registers. So we achieve this by
385*5f39d1b3SJooyung Han         // transposing, rotating the registers, and transposing again each
386*5f39d1b3SJooyung Han         // 4x4 block.
387*5f39d1b3SJooyung Han         //
388*5f39d1b3SJooyung Han         // Transpose 3 4x4 blocks separately
389*5f39d1b3SJooyung Han         "vtrn.32 q0, q1\n"
390*5f39d1b3SJooyung Han         "vtrn.32 q2, q3\n"
391*5f39d1b3SJooyung Han         "vswp d1, d4\n"
392*5f39d1b3SJooyung Han         "vswp d3, d6\n"
393*5f39d1b3SJooyung Han         "vtrn.32 q4, q5\n"
394*5f39d1b3SJooyung Han         "vtrn.32 q6, q7\n"
395*5f39d1b3SJooyung Han         "vswp d9, d12\n"
396*5f39d1b3SJooyung Han         "vswp d11, d14\n"
397*5f39d1b3SJooyung Han         "vtrn.32 q8, q9\n"
398*5f39d1b3SJooyung Han         "vtrn.32 q10, q11\n"
399*5f39d1b3SJooyung Han         "vswp d17, d20\n"
400*5f39d1b3SJooyung Han         "vswp d19, d22\n"
401*5f39d1b3SJooyung Han         // Rotate the registers
402*5f39d1b3SJooyung Han         "vext.32 q1, q1, q1, #1\n"
403*5f39d1b3SJooyung Han         "vext.32 q2, q2, q2, #2\n"
404*5f39d1b3SJooyung Han         "vext.32 q3, q3, q3, #3\n"
405*5f39d1b3SJooyung Han         "vext.32 q5, q5, q5, #1\n"
406*5f39d1b3SJooyung Han         "vext.32 q6, q6, q6, #2\n"
407*5f39d1b3SJooyung Han         "vext.32 q7, q7, q7, #3\n"
408*5f39d1b3SJooyung Han         "vext.32 q9, q9, q9, #1\n"
409*5f39d1b3SJooyung Han         "vext.32 q10, q10, q10, #2\n"
410*5f39d1b3SJooyung Han         "vext.32 q11, q11, q11, #3\n"
411*5f39d1b3SJooyung Han         // Transpose again and store into our global accumulators
412*5f39d1b3SJooyung Han         // buffer. These two operations are done at once using vst4.
413*5f39d1b3SJooyung Han         "mov r0, %[global_accumulators]\n"
414*5f39d1b3SJooyung Han         "vst4.32 {d0,d2,d4,d6}, [r0]!\n"
415*5f39d1b3SJooyung Han         "vst4.32 {d1,d3,d5,d7}, [r0]!\n"
416*5f39d1b3SJooyung Han         "vst4.32 {d8,d10,d12,d14}, [r0]!\n"
417*5f39d1b3SJooyung Han         "vst4.32 {d9,d11,d13,d15}, [r0]!\n"
418*5f39d1b3SJooyung Han         "vst4.32 {d16,d18,d20,d22}, [r0]!\n"
419*5f39d1b3SJooyung Han         "vst4.32 {d17,d19,d21,d23}, [r0]!\n"
420*5f39d1b3SJooyung Han 
421*5f39d1b3SJooyung Han         /* Main loop */
422*5f39d1b3SJooyung Han 
423*5f39d1b3SJooyung Han         GEMMLOWP_LOOP_NEON_32_KERNEL_12X4_DEPTH2_ASSUMING_12BIT_PRODUCTS
424*5f39d1b3SJooyung Han         ":\n"
425*5f39d1b3SJooyung Han 
426*5f39d1b3SJooyung Han     // Overview of register layout:
427*5f39d1b3SJooyung Han     //
428*5f39d1b3SJooyung Han     // Registers q4--q16 are the local 16-bit accumulators.
429*5f39d1b3SJooyung Han     // However, each entry in the result matrix is represented
430*5f39d1b3SJooyung Han     // by *two* local 16-bit accumulators: one for even levels
431*5f39d1b3SJooyung Han     // of depth and one for odd levels of depth. These correspond
432*5f39d1b3SJooyung Han     // to the scalars at even and odd indices within each q-register.
433*5f39d1b3SJooyung Han     // Thus we effectively use 32 bits of register space for each
434*5f39d1b3SJooyung Han     // entry in the result matrix. The accumulators register layout
435*5f39d1b3SJooyung Han     // is the same as was described above for the global 32-bit
436*5f39d1b3SJooyung Han     // accumulators (3 cells of size 4x4 in diagonal-major order)
437*5f39d1b3SJooyung Han     // with the only difference that instead of 32bit values we have
438*5f39d1b3SJooyung Han     // pairs of 16bit values.
439*5f39d1b3SJooyung Han     //
440*5f39d1b3SJooyung Han     // A 2x4 cell of Rhs is stored in 8bit in d0.
441*5f39d1b3SJooyung Han     // A 12x2 block of 3 4x2 cells Lhs is stored in 8bit in d1--d3.
442*5f39d1b3SJooyung Han     //
443*5f39d1b3SJooyung Han     //                      +--------+--------+--------+--------+
444*5f39d1b3SJooyung Han     //                      |d0[0]   |d0[2]   |d0[4]   |d0[6]   |
445*5f39d1b3SJooyung Han     //                 Rhs  +--------+--------+--------+--------+
446*5f39d1b3SJooyung Han     //                      |d0[1]   |d0[3]   |d0[5]   |d0[7]   |
447*5f39d1b3SJooyung Han     //                      +--------+--------+--------+--------+
448*5f39d1b3SJooyung Han     //
449*5f39d1b3SJooyung Han     //                      |        |        |        |        |
450*5f39d1b3SJooyung Han     //
451*5f39d1b3SJooyung Han     //    Lhs               |        |        |        |        |
452*5f39d1b3SJooyung Han     //
453*5f39d1b3SJooyung Han     //  +-----+-----+ - - - +--------+--------+--------+--------+
454*5f39d1b3SJooyung Han     //  |d1[0]|d1[1]|       |q4[0,1] |q5[0,1] |q6[0,1] |q7[0,1] |
455*5f39d1b3SJooyung Han     //  |d1[2]|d1[3]|       |q7[2,3] |q4[2,3] |q5[2,3] |q6[2,3] |
456*5f39d1b3SJooyung Han     //  |d1[4]|d1[5]|       |q6[4,5] |q7[4,5] |q4[4,5] |q5[4,5] |
457*5f39d1b3SJooyung Han     //  |d1[6]|d1[7]|       |q5[6,7] |q6[6,7] |q7[6,7] |q4[6,7] |
458*5f39d1b3SJooyung Han     //  +-----+-----+ - - - +--------+--------+--------+--------+
459*5f39d1b3SJooyung Han     //  |d2[0]|d2[1]|       |q8[0,1] |q8[0,1] |q8[0,1] |q8[0,1] |
460*5f39d1b3SJooyung Han     //  |d2[2]|d2[3]|       |q9[2,3] |q9[2,3] |q9[2,3] |q9[2,3] |
461*5f39d1b3SJooyung Han     //  |d2[4]|d2[5]|       |q10[4,5]|q10[4,5]|q10[4,5]|q10[4,5]|
462*5f39d1b3SJooyung Han     //  |d2[6]|d2[7]|       |q11[6,7]|q11[6,7]|q11[6,7]|q11[6,7]|
463*5f39d1b3SJooyung Han     //  +-----+-----+ - - - +--------+--------+--------+--------+
464*5f39d1b3SJooyung Han     //  |d3[0]|d3[1]|       |q12[0,1]|q12[0,1]|q12[0,1]|q12[0,1]|
465*5f39d1b3SJooyung Han     //  |d3[2]|d3[3]|       |q13[2,3]|q13[2,3]|q13[2,3]|q13[2,3]|
466*5f39d1b3SJooyung Han     //  |d3[4]|d3[5]|       |q14[4,5]|q14[4,5]|q14[4,5]|q14[4,5]|
467*5f39d1b3SJooyung Han     //  |d3[6]|d3[7]|       |q15[6,7]|q15[6,7]|q15[6,7]|q15[6,7]|
468*5f39d1b3SJooyung Han     //  +-----+-----+ - - - +--------+--------+--------+--------+
469*5f39d1b3SJooyung Han     //
470*5f39d1b3SJooyung Han     //                            Local 16-bit accumulators
471*5f39d1b3SJooyung Han     //                         Note: 2 scalars per matrix entry
472*5f39d1b3SJooyung Han 
473*5f39d1b3SJooyung Han #define GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH \
474*5f39d1b3SJooyung Han   /* Load 3 Lhs cells of size 4x2 */          \
475*5f39d1b3SJooyung Han   "vld1.8 {d1,d2,d3}, [%[lhs_ptr]:64]!\n"     \
476*5f39d1b3SJooyung Han                                               \
477*5f39d1b3SJooyung Han   /* Load 1 Rhs cell of size 2x4 */           \
478*5f39d1b3SJooyung Han   "vld1.8 {d0}, [%[rhs_ptr]:64]!\n"           \
479*5f39d1b3SJooyung Han                                               \
480*5f39d1b3SJooyung Han   /* Multiply-accumulate */                   \
481*5f39d1b3SJooyung Han   "vmlal.u8 q4, d1, d0\n"                     \
482*5f39d1b3SJooyung Han   "vmlal.u8 q8, d2, d0\n"                     \
483*5f39d1b3SJooyung Han   "vmlal.u8 q12, d3, d0\n"                    \
484*5f39d1b3SJooyung Han   "vext.8 d0, d0, d0, #2\n"                   \
485*5f39d1b3SJooyung Han   "vmlal.u8 q5, d1, d0\n"                     \
486*5f39d1b3SJooyung Han   "vmlal.u8 q9, d2, d0\n"                     \
487*5f39d1b3SJooyung Han   "vmlal.u8 q13, d3, d0\n"                    \
488*5f39d1b3SJooyung Han   "vext.8 d0, d0, d0, #2\n"                   \
489*5f39d1b3SJooyung Han   "vmlal.u8 q6, d1, d0\n"                     \
490*5f39d1b3SJooyung Han   "vmlal.u8 q10, d2, d0\n"                    \
491*5f39d1b3SJooyung Han   "vmlal.u8 q14, d3, d0\n"                    \
492*5f39d1b3SJooyung Han   "vext.8 d0, d0, d0, #2\n"                   \
493*5f39d1b3SJooyung Han   "vmlal.u8 q7, d1, d0\n"                     \
494*5f39d1b3SJooyung Han   "vmlal.u8 q11, d2, d0\n"                    \
495*5f39d1b3SJooyung Han   "vmlal.u8 q15, d3, d0\n"                    \
496*5f39d1b3SJooyung Han                                               \
497*5f39d1b3SJooyung Han   "sub %[run_depth], #2\n"
498*5f39d1b3SJooyung Han 
499*5f39d1b3SJooyung Han #define GEMMLOWP_ACCUMULATE_8_LEVELS_OF_DEPTH \
500*5f39d1b3SJooyung Han   GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH       \
501*5f39d1b3SJooyung Han   GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH       \
502*5f39d1b3SJooyung Han   GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH       \
503*5f39d1b3SJooyung Han   GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH
504*5f39d1b3SJooyung Han 
505*5f39d1b3SJooyung Han         // Clear local 16-bit accumulators
506*5f39d1b3SJooyung Han         "vmov.s32 q4, #0\n"
507*5f39d1b3SJooyung Han         "vmov.s32 q5, q4\n"
508*5f39d1b3SJooyung Han         "vmov.s32 q6, q4\n"
509*5f39d1b3SJooyung Han         "vmov.s32 q7, q4\n"
510*5f39d1b3SJooyung Han         "vmov.s32 q8, q4\n"
511*5f39d1b3SJooyung Han         "vmov.s32 q9, q4\n"
512*5f39d1b3SJooyung Han         "vmov.s32 q10, q4\n"
513*5f39d1b3SJooyung Han         "vmov.s32 q11, q4\n"
514*5f39d1b3SJooyung Han         "vmov.s32 q12, q4\n"
515*5f39d1b3SJooyung Han         "vmov.s32 q13, q4\n"
516*5f39d1b3SJooyung Han         "vmov.s32 q14, q4\n"
517*5f39d1b3SJooyung Han         "vmov.s32 q15, q4\n"
518*5f39d1b3SJooyung Han 
519*5f39d1b3SJooyung Han         // Select a suitable number of depth levels
520*5f39d1b3SJooyung Han         // to process at this iteration. TODO (benoitjacob) I guess that
521*5f39d1b3SJooyung Han         // someone who really knows asm should make this a jump table.
522*5f39d1b3SJooyung Han         "cmp %[run_depth], #32\n"
523*5f39d1b3SJooyung Han         "bge " GEMMLOWP_LABEL_32
524*5f39d1b3SJooyung Han         "f\n"
525*5f39d1b3SJooyung Han         "cmp %[run_depth], #24\n"
526*5f39d1b3SJooyung Han         "bge " GEMMLOWP_LABEL_24
527*5f39d1b3SJooyung Han         "f\n"
528*5f39d1b3SJooyung Han         "cmp %[run_depth], #16\n"
529*5f39d1b3SJooyung Han         "bge " GEMMLOWP_LABEL_16
530*5f39d1b3SJooyung Han         "f\n"
531*5f39d1b3SJooyung Han         "cmp %[run_depth], #8\n"
532*5f39d1b3SJooyung Han         "bge " GEMMLOWP_LABEL_8
533*5f39d1b3SJooyung Han         "f\n"
534*5f39d1b3SJooyung Han         "b " GEMMLOWP_LABEL_2 "f\n"
535*5f39d1b3SJooyung Han 
536*5f39d1b3SJooyung Han         GEMMLOWP_LABEL_32
537*5f39d1b3SJooyung Han         ":\n" GEMMLOWP_ACCUMULATE_8_LEVELS_OF_DEPTH GEMMLOWP_LABEL_24
538*5f39d1b3SJooyung Han         ":\n" GEMMLOWP_ACCUMULATE_8_LEVELS_OF_DEPTH GEMMLOWP_LABEL_16
539*5f39d1b3SJooyung Han         ":\n" GEMMLOWP_ACCUMULATE_8_LEVELS_OF_DEPTH GEMMLOWP_LABEL_8
540*5f39d1b3SJooyung Han         ":\n" GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH
541*5f39d1b3SJooyung Han             GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH
542*5f39d1b3SJooyung Han                 GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH GEMMLOWP_LABEL_2
543*5f39d1b3SJooyung Han         ":\n" GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH
544*5f39d1b3SJooyung Han 
545*5f39d1b3SJooyung Han         // Accumulate the local accumulators into the global accumulators.
546*5f39d1b3SJooyung Han         // This is about summing adjacent pairs of 16-bit scalars into
547*5f39d1b3SJooyung Han         // single 32-bit scalars, so we use pairwise long addition (vpadal).
548*5f39d1b3SJooyung Han         "mov r0, %[global_accumulators]\n"
549*5f39d1b3SJooyung Han         "mov r1, %[global_accumulators]\n"
550*5f39d1b3SJooyung Han         "vld1.32 {d0,d1,d2,d3}, [r0]!\n"
551*5f39d1b3SJooyung Han         "vld1.32 {d4,d5,d6,d7}, [r0]!\n"
552*5f39d1b3SJooyung Han         "vpadal.u16 q0, q4\n"
553*5f39d1b3SJooyung Han         "vpadal.u16 q1, q5\n"
554*5f39d1b3SJooyung Han         "vpadal.u16 q2, q6\n"
555*5f39d1b3SJooyung Han         "vpadal.u16 q3, q7\n"
556*5f39d1b3SJooyung Han         "vst1.32 {d0,d1,d2,d3}, [r1]!\n"
557*5f39d1b3SJooyung Han         "vst1.32 {d4,d5,d6,d7}, [r1]!\n"
558*5f39d1b3SJooyung Han         "vld1.32 {d0,d1,d2,d3}, [r0]!\n"
559*5f39d1b3SJooyung Han         "vld1.32 {d4,d5,d6,d7}, [r0]!\n"
560*5f39d1b3SJooyung Han         "vpadal.u16 q0, q8\n"
561*5f39d1b3SJooyung Han         "vpadal.u16 q1, q9\n"
562*5f39d1b3SJooyung Han         "vpadal.u16 q2, q10\n"
563*5f39d1b3SJooyung Han         "vpadal.u16 q3, q11\n"
564*5f39d1b3SJooyung Han         "vst1.32 {d0,d1,d2,d3}, [r1]!\n"
565*5f39d1b3SJooyung Han         "vst1.32 {d4,d5,d6,d7}, [r1]!\n"
566*5f39d1b3SJooyung Han         "vld1.32 {d0,d1,d2,d3}, [r0]!\n"
567*5f39d1b3SJooyung Han         "vld1.32 {d4,d5,d6,d7}, [r0]!\n"
568*5f39d1b3SJooyung Han         "vpadal.u16 q0, q12\n"
569*5f39d1b3SJooyung Han         "vpadal.u16 q1, q13\n"
570*5f39d1b3SJooyung Han         "vpadal.u16 q2, q14\n"
571*5f39d1b3SJooyung Han         "vpadal.u16 q3, q15\n"
572*5f39d1b3SJooyung Han         "vst1.32 {d0,d1,d2,d3}, [r1]!\n"
573*5f39d1b3SJooyung Han         "vst1.32 {d4,d5,d6,d7}, [r1]!\n"
574*5f39d1b3SJooyung Han 
575*5f39d1b3SJooyung Han         // Loop.
576*5f39d1b3SJooyung Han         "cmp %[run_depth], #0\n"
577*5f39d1b3SJooyung Han         "bne " GEMMLOWP_LOOP_NEON_32_KERNEL_12X4_DEPTH2_ASSUMING_12BIT_PRODUCTS
578*5f39d1b3SJooyung Han         "b\n"
579*5f39d1b3SJooyung Han 
580*5f39d1b3SJooyung Han #undef GEMMLOWP_CLEAR_LOCAL_ACCUMULATORS
581*5f39d1b3SJooyung Han #undef GEMMLOWP_ACCUMULATE_8_LEVELS_OF_DEPTH
582*5f39d1b3SJooyung Han #undef GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH
583*5f39d1b3SJooyung Han #undef GEMMLOWP_ADD_TO_GLOBAL_ACCUMULATORS
584*5f39d1b3SJooyung Han 
585*5f39d1b3SJooyung Han         /* end of main loop */
586*5f39d1b3SJooyung Han 
587*5f39d1b3SJooyung Han         // Store the global accumulators to the destination matrix
588*5f39d1b3SJooyung Han         // (column-major)
589*5f39d1b3SJooyung Han         // This is the reverse of the steps that we followed at the beginning
590*5f39d1b3SJooyung Han         // when we load the global accumulators from the destination matrix.
591*5f39d1b3SJooyung Han         // The problem is the same: how to convert 4x4 blocks
592*5f39d1b3SJooyung Han         // between column-major and diagonal-major orders.
593*5f39d1b3SJooyung Han         // Like above, we do this by rotating rows, and we achieve that by
594*5f39d1b3SJooyung Han         // transposing, rotating columns, and transposing again.
595*5f39d1b3SJooyung Han         //
596*5f39d1b3SJooyung Han         // Load and transpose 4x4 blocks of global accumulators
597*5f39d1b3SJooyung Han         // These two steps are done at once by the vld4 instruction.
598*5f39d1b3SJooyung Han         "mov r0, %[global_accumulators]\n"
599*5f39d1b3SJooyung Han         "vld4.32 {d0,d2,d4,d6}, [r0]!\n"
600*5f39d1b3SJooyung Han         "vld4.32 {d1,d3,d5,d7}, [r0]!\n"
601*5f39d1b3SJooyung Han         "vld4.32 {d8,d10,d12,d14}, [r0]!\n"
602*5f39d1b3SJooyung Han         "vld4.32 {d9,d11,d13,d15}, [r0]!\n"
603*5f39d1b3SJooyung Han         "vld4.32 {d16,d18,d20,d22}, [r0]!\n"
604*5f39d1b3SJooyung Han         "vld4.32 {d17,d19,d21,d23}, [r0]!\n"
605*5f39d1b3SJooyung Han         // Rotate the rows of each 4x4 block
606*5f39d1b3SJooyung Han         "vext.32 q1, q1, q1, #3\n"
607*5f39d1b3SJooyung Han         "vext.32 q2, q2, q2, #2\n"
608*5f39d1b3SJooyung Han         "vext.32 q3, q3, q3, #1\n"
609*5f39d1b3SJooyung Han         "vext.32 q5, q5, q5, #3\n"
610*5f39d1b3SJooyung Han         "vext.32 q6, q6, q6, #2\n"
611*5f39d1b3SJooyung Han         "vext.32 q7, q7, q7, #1\n"
612*5f39d1b3SJooyung Han         "vext.32 q9, q9, q9, #3\n"
613*5f39d1b3SJooyung Han         "vext.32 q10, q10, q10, #2\n"
614*5f39d1b3SJooyung Han         "vext.32 q11, q11, q11, #1\n"
615*5f39d1b3SJooyung Han         // Transpose again each 4x4 block
616*5f39d1b3SJooyung Han         "vtrn.32 q0, q1\n"
617*5f39d1b3SJooyung Han         "vtrn.32 q2, q3\n"
618*5f39d1b3SJooyung Han         "vswp d1, d4\n"
619*5f39d1b3SJooyung Han         "vswp d3, d6\n"
620*5f39d1b3SJooyung Han         "vtrn.32 q4, q5\n"
621*5f39d1b3SJooyung Han         "vtrn.32 q6, q7\n"
622*5f39d1b3SJooyung Han         "vswp d9, d12\n"
623*5f39d1b3SJooyung Han         "vswp d11, d14\n"
624*5f39d1b3SJooyung Han         "vtrn.32 q8, q9\n"
625*5f39d1b3SJooyung Han         "vtrn.32 q10, q11\n"
626*5f39d1b3SJooyung Han         "vswp d17, d20\n"
627*5f39d1b3SJooyung Han         "vswp d19, d22\n"
628*5f39d1b3SJooyung Han         // Store into the column-major destination matrix
629*5f39d1b3SJooyung Han         "mov r1, %[dst_ptr]\n"
630*5f39d1b3SJooyung Han         "mov r0, %[dst_col_stride]\n"
631*5f39d1b3SJooyung Han         "sub r0, #32\n"
632*5f39d1b3SJooyung Han         "vst1.32 {d0,d1}, [r1]!\n"
633*5f39d1b3SJooyung Han         "vst1.32 {d8,d9}, [r1]!\n"
634*5f39d1b3SJooyung Han         "vst1.32 {d16,d17}, [r1], r0\n"
635*5f39d1b3SJooyung Han         "vst1.32 {d2,d3}, [r1]!\n"
636*5f39d1b3SJooyung Han         "vst1.32 {d10,d11}, [r1]!\n"
637*5f39d1b3SJooyung Han         "vst1.32 {d18,d19}, [r1], r0\n"
638*5f39d1b3SJooyung Han         "vst1.32 {d4,d5}, [r1]!\n"
639*5f39d1b3SJooyung Han         "vst1.32 {d12,d13}, [r1]!\n"
640*5f39d1b3SJooyung Han         "vst1.32 {d20,d21}, [r1], r0\n"
641*5f39d1b3SJooyung Han         "vst1.32 {d6,d7}, [r1]!\n"
642*5f39d1b3SJooyung Han         "vst1.32 {d14,d15}, [r1]!\n"
643*5f39d1b3SJooyung Han         "vst1.32 {d22,d23}, [r1], r0\n"
644*5f39d1b3SJooyung Han         :  // outputs
645*5f39d1b3SJooyung Han         [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
646*5f39d1b3SJooyung Han         [dst_ptr] "+r"(dst_ptr),
647*5f39d1b3SJooyung Han         [run_depth] "+r"(run_depth)
648*5f39d1b3SJooyung Han         :  // inputs
649*5f39d1b3SJooyung Han         [start_depth] "r"(start_depth), [dst_col_stride] "r"(dst_col_stride),
650*5f39d1b3SJooyung Han         [global_accumulators] "r"(&global_accumulators[0])
651*5f39d1b3SJooyung Han         :  // clobbers
652*5f39d1b3SJooyung Han         "cc", "memory", "r0", "r1",
653*5f39d1b3SJooyung Han         // note: someone on internet says that quad registers are
654*5f39d1b3SJooyung Han         // unsupported in the clobber list!
655*5f39d1b3SJooyung Han         "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
656*5f39d1b3SJooyung Han         "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
657*5f39d1b3SJooyung Han         "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30",
658*5f39d1b3SJooyung Han         "d31");
659*5f39d1b3SJooyung Han #undef GEMMLOWP_LOOP_NEON_32_KERNEL_12X4_DEPTH2_ASSUMING_12BIT_PRODUCTS
660*5f39d1b3SJooyung Han #undef GEMMLOWP_LOAD_GLOBAL_ACCUMULATORS_NEON_32_KERNEL_12X4_DEPTH2_12BIT
661*5f39d1b3SJooyung Han #undef GEMMLOWP_LABEL_32
662*5f39d1b3SJooyung Han #undef GEMMLOWP_LABEL_24
663*5f39d1b3SJooyung Han #undef GEMMLOWP_LABEL_16
664*5f39d1b3SJooyung Han #undef GEMMLOWP_LABEL_8
665*5f39d1b3SJooyung Han #undef GEMMLOWP_LABEL_2
666*5f39d1b3SJooyung Han   }
667*5f39d1b3SJooyung Han };
668*5f39d1b3SJooyung Han 
669*5f39d1b3SJooyung Han struct NEON_32bit_GEMM_Int8Operands_LhsNonzero : KernelBase {
670*5f39d1b3SJooyung Han   typedef KernelFormat<
671*5f39d1b3SJooyung Han       KernelSideFormatInt8<CellFormat<4, 16, CellOrder::WidthMajor>, 1>,
672*5f39d1b3SJooyung Han       KernelSideFormatInt8<CellFormat<2, 16, CellOrder::WidthMajor>, 1> >
673*5f39d1b3SJooyung Han       Format;
NameNEON_32bit_GEMM_Int8Operands_LhsNonzero674*5f39d1b3SJooyung Han   const char* Name() const override {
675*5f39d1b3SJooyung Han     return "NEON, 4x2, depth 16, accumulating two within signed int16";
676*5f39d1b3SJooyung Han   }
677*5f39d1b3SJooyung Han 
678*5f39d1b3SJooyung Han   // TODO(benoitjacob): reorder function arguments so dst comes last
RunNEON_32bit_GEMM_Int8Operands_LhsNonzero679*5f39d1b3SJooyung Han   void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride,
680*5f39d1b3SJooyung Han            std::size_t dst_col_stride, const std::uint8_t* lhs_ptr,
681*5f39d1b3SJooyung Han            const std::uint8_t* rhs_ptr, std::size_t start_depth,
682*5f39d1b3SJooyung Han            std::size_t run_depth) const override {
683*5f39d1b3SJooyung Han     (void)dst_row_stride;
684*5f39d1b3SJooyung Han #define GEMMLOWP_LABEL_AFTER_LOOP "1"
685*5f39d1b3SJooyung Han #define GEMMLOWP_LABEL_LOOP "2"
686*5f39d1b3SJooyung Han #define GEMMLOWP_LABEL_ACCUMULATE_EXISTING_DST_VALUES "3"
687*5f39d1b3SJooyung Han #define GEMMLOWP_LABEL_STORE "4"
688*5f39d1b3SJooyung Han     asm volatile(
689*5f39d1b3SJooyung Han         // Multiply dst_col_stride by 4 == sizeof(int32) to use
690*5f39d1b3SJooyung Han         // it as a byte offset below.
691*5f39d1b3SJooyung Han         "lsl %[dst_col_stride], %[dst_col_stride], #2\n"
692*5f39d1b3SJooyung Han 
693*5f39d1b3SJooyung Han         // Overview of register layout:
694*5f39d1b3SJooyung Han         //
695*5f39d1b3SJooyung Han         // A 2x16 block of Rhs is stored in 8 bit in d0--d3.
696*5f39d1b3SJooyung Han         // A 4x16 block of Lhs is stored in 8 bit in d4--d7. That is only
697*5f39d1b3SJooyung Han         // half of the register space required, so we loop over these registers
698*5f39d1b3SJooyung Han         // twice. Only half of it, a 2x16 block, is stored in d4--d7 at
699*5f39d1b3SJooyung Han         // any given time.
700*5f39d1b3SJooyung Han         //
701*5f39d1b3SJooyung Han         // A 4x2 block of accumulators is stored in q8--q15 (as 4x32 bit
702*5f39d1b3SJooyung Han         // components which need to be horizontally-added at the end)
703*5f39d1b3SJooyung Han         //
704*5f39d1b3SJooyung Han         // The Lhs vectors are multiplied by the Rhs vectors with a widening
705*5f39d1b3SJooyung Han         // multiply over the 8 first levels of depth, producing int16x8
706*5f39d1b3SJooyung Han         // vectors of products for each position in the accumulator matrix.
707*5f39d1b3SJooyung Han         // Here comes the special trick: since the operands are signed int8,
708*5f39d1b3SJooyung Han         // their range being [ -2^7 , 2^7 ), their products are in range
709*5f39d1b3SJooyung Han         // [ -2^14 , 2^14 - 1 ), meaning that we can add two such values
710*5f39d1b3SJooyung Han         // without any risk of overflowing int16.
711*5f39d1b3SJooyung Han         // We thus proceed with the 8 next levels of depth, multiplying
712*5f39d1b3SJooyung Han         // again Lhs by Rhs, accumulating into this existing int16x8 vector.
713*5f39d1b3SJooyung Han         //
714*5f39d1b3SJooyung Han         // Only then, having processed 16 levels of depth, do we need to
715*5f39d1b3SJooyung Han         // horizontally add these int16x8 accumulators into the final
716*5f39d1b3SJooyung Han         // int32x4 accumulators.
717*5f39d1b3SJooyung Han         //
718*5f39d1b3SJooyung Han         // As we do not have enough registers to store all 16 int16x8
719*5f39d1b3SJooyung Han         // temporary-16bit-accumulators, we have them cycle through q4--q7.
720*5f39d1b3SJooyung Han         //
721*5f39d1b3SJooyung Han         //
722*5f39d1b3SJooyung Han         // Register layout (ignoring the q4--q7 temporary 16bit accumulators):
723*5f39d1b3SJooyung Han         //
724*5f39d1b3SJooyung Han         //                               +----+----+
725*5f39d1b3SJooyung Han         //                               | d0 | d2 |
726*5f39d1b3SJooyung Han         //                               | .  | .  |
727*5f39d1b3SJooyung Han         //                               | .  | .  |
728*5f39d1b3SJooyung Han         //                               | .  | .  |
729*5f39d1b3SJooyung Han         //                       Rhs     +----+----+
730*5f39d1b3SJooyung Han         //                               | d1 | d3 |
731*5f39d1b3SJooyung Han         //                               | .  | .  |
732*5f39d1b3SJooyung Han         //                               | .  | .  |
733*5f39d1b3SJooyung Han         //                               | .  | .  |
734*5f39d1b3SJooyung Han         //                               +----+----+
735*5f39d1b3SJooyung Han         //
736*5f39d1b3SJooyung Han         //                               |    |    |
737*5f39d1b3SJooyung Han         //
738*5f39d1b3SJooyung Han         //    Lhs                        |    |    |
739*5f39d1b3SJooyung Han         //
740*5f39d1b3SJooyung Han         //  +--------+--------+ - - - -  +----+----+
741*5f39d1b3SJooyung Han         //  | d4 ... | d5 ... |          | q8 | q9 |
742*5f39d1b3SJooyung Han         //  | d6 ... | d7 ... |          | q10| q11|
743*5f39d1b3SJooyung Han         //  | d4 ... | d5 ... |          | q12| q13|
744*5f39d1b3SJooyung Han         //  | d6 ... | d7 ... |          | q14| q15|
745*5f39d1b3SJooyung Han         //  +--------+--------+ - - - -  +----+----+
746*5f39d1b3SJooyung Han         //
747*5f39d1b3SJooyung Han         //                               Accumulator
748*5f39d1b3SJooyung Han         //
749*5f39d1b3SJooyung Han 
750*5f39d1b3SJooyung Han         // Clear accumulators, and, interleaved with it,
751*5f39d1b3SJooyung Han         // initial loads of the first loop iteration,
752*5f39d1b3SJooyung Han         // taken out of the loop so that in the loop itself we have
753*5f39d1b3SJooyung Han         // optimal streaming of data from memory.
754*5f39d1b3SJooyung Han         "vldr d0, [%[rhs_ptr], #0]\n"
755*5f39d1b3SJooyung Han         "vmov.i32 q8, #0\n"
756*5f39d1b3SJooyung Han         "vldr d4, [%[lhs_ptr], #0]\n"
757*5f39d1b3SJooyung Han         "vmov.i32 q9, #0\n"
758*5f39d1b3SJooyung Han         "vldr d2, [%[rhs_ptr], #16]\n"
759*5f39d1b3SJooyung Han         "vmov.i32 q10, q8\n"
760*5f39d1b3SJooyung Han         "vldr d6, [%[lhs_ptr], #16]\n"
761*5f39d1b3SJooyung Han         "vmov.i32 q11, q8\n"
762*5f39d1b3SJooyung Han         "vldr d1, [%[rhs_ptr], #8]\n"
763*5f39d1b3SJooyung Han         "vmov.i32 q12, q8\n"
764*5f39d1b3SJooyung Han         "vldr d5, [%[lhs_ptr], #8]\n"
765*5f39d1b3SJooyung Han         "vmov.i32 q13, q8\n"
766*5f39d1b3SJooyung Han         "vldr d3, [%[rhs_ptr], #24]\n"
767*5f39d1b3SJooyung Han         "vmov.i32 q14, q8\n"
768*5f39d1b3SJooyung Han         "vldr d7, [%[lhs_ptr], #24]\n"
769*5f39d1b3SJooyung Han         "vmov.i32 q15, q8\n"
770*5f39d1b3SJooyung Han 
771*5f39d1b3SJooyung Han         // General loop.
772*5f39d1b3SJooyung Han         GEMMLOWP_LABEL_LOOP
773*5f39d1b3SJooyung Han         ":\n"
774*5f39d1b3SJooyung Han 
775*5f39d1b3SJooyung Han         // Multiply 8 first levels of depth.
776*5f39d1b3SJooyung Han         "vmull.s8    q4,  d0,  d4\n"
777*5f39d1b3SJooyung Han         "add %[rhs_ptr], %[rhs_ptr], #32\n"
778*5f39d1b3SJooyung Han         "vmull.s8    q5,  d2,  d4\n"
779*5f39d1b3SJooyung Han         "vldr d4, [%[lhs_ptr], #32]\n"
780*5f39d1b3SJooyung Han         "vmull.s8    q6,  d0,  d6\n"
781*5f39d1b3SJooyung Han         "vmull.s8    q7,  d2,  d6\n"
782*5f39d1b3SJooyung Han         "vldr d6, [%[lhs_ptr], #48]\n"
783*5f39d1b3SJooyung Han 
784*5f39d1b3SJooyung Han         // Multiply-accumulate second-half, again into the same
785*5f39d1b3SJooyung Han         // 16bit local accumulator registers. This is where we
786*5f39d1b3SJooyung Han         // take advantage of having int8 instead of uint8 and therefore
787*5f39d1b3SJooyung Han         // being able to accumulate two products into int16.
788*5f39d1b3SJooyung Han         "vmlal.s8    q4,  d1,  d5\n"
789*5f39d1b3SJooyung Han         "vmlal.s8    q5,  d3,  d5\n"
790*5f39d1b3SJooyung Han         "vldr d5, [%[lhs_ptr], #40]\n"
791*5f39d1b3SJooyung Han         "vmlal.s8    q6,  d1,  d7\n"
792*5f39d1b3SJooyung Han         "vmlal.s8    q7,  d3,  d7\n"
793*5f39d1b3SJooyung Han         "vldr d7, [%[lhs_ptr], #56]\n"
794*5f39d1b3SJooyung Han 
795*5f39d1b3SJooyung Han         // Add pairwise, accumulate into 32-bit accumulators.
796*5f39d1b3SJooyung Han         "vpadal.s16   q8,  q4\n"
797*5f39d1b3SJooyung Han         "add %[lhs_ptr], %[lhs_ptr], #64\n"
798*5f39d1b3SJooyung Han         "vpadal.s16   q9,  q5\n"
799*5f39d1b3SJooyung Han         "subs %[run_depth], %[run_depth], #16\n"
800*5f39d1b3SJooyung Han         "vpadal.s16   q10, q6\n"
801*5f39d1b3SJooyung Han         "vpadal.s16   q11, q7\n"
802*5f39d1b3SJooyung Han 
803*5f39d1b3SJooyung Han         "beq " GEMMLOWP_LABEL_AFTER_LOOP
804*5f39d1b3SJooyung Han         "f\n"
805*5f39d1b3SJooyung Han 
806*5f39d1b3SJooyung Han         // Multiply first half.
807*5f39d1b3SJooyung Han         "vmull.s8    q4,  d0,  d4\n"
808*5f39d1b3SJooyung Han         "vmull.s8    q5,  d2,  d4\n"
809*5f39d1b3SJooyung Han         "vldr d4, [%[lhs_ptr], #0]\n"
810*5f39d1b3SJooyung Han         "vmull.s8    q6,  d0,  d6\n"
811*5f39d1b3SJooyung Han         "vldr d0, [%[rhs_ptr], #0]\n"
812*5f39d1b3SJooyung Han         "vmull.s8    q7,  d2,  d6\n"
813*5f39d1b3SJooyung Han         "vldr d2, [%[rhs_ptr], #16]\n"
814*5f39d1b3SJooyung Han 
815*5f39d1b3SJooyung Han         // Multiply-accumulate second-half, again into the same
816*5f39d1b3SJooyung Han         // 16bit local accumulator registers. This is where we
817*5f39d1b3SJooyung Han         // take advantage of having int8 instead of uint8 and therefore
818*5f39d1b3SJooyung Han         // being able to accumulate two products into int16.
819*5f39d1b3SJooyung Han         "vmlal.s8    q4,  d1,  d5\n"
820*5f39d1b3SJooyung Han         "vldr d6, [%[lhs_ptr], #16]\n"
821*5f39d1b3SJooyung Han         "vmlal.s8    q5,  d3,  d5\n"
822*5f39d1b3SJooyung Han         "vldr d5, [%[lhs_ptr], #8]\n"
823*5f39d1b3SJooyung Han         "vmlal.s8    q6,  d1,  d7\n"
824*5f39d1b3SJooyung Han         "vldr d1, [%[rhs_ptr], #8]\n"
825*5f39d1b3SJooyung Han         "vmlal.s8    q7,  d3,  d7\n"
826*5f39d1b3SJooyung Han         "vldr d3, [%[rhs_ptr], #24]\n"
827*5f39d1b3SJooyung Han 
828*5f39d1b3SJooyung Han         // Add pairwise, accumulate into 32-bit accumulators.
829*5f39d1b3SJooyung Han         "vpadal.s16   q12, q4\n"
830*5f39d1b3SJooyung Han         "vldr d7, [%[lhs_ptr], #24]\n"
831*5f39d1b3SJooyung Han         "vpadal.s16   q13, q5\n"
832*5f39d1b3SJooyung Han         "vpadal.s16   q14, q6\n"
833*5f39d1b3SJooyung Han         "vpadal.s16   q15, q7\n"
834*5f39d1b3SJooyung Han 
835*5f39d1b3SJooyung Han         "b " GEMMLOWP_LABEL_LOOP "b\n"
836*5f39d1b3SJooyung Han 
837*5f39d1b3SJooyung Han         GEMMLOWP_LABEL_AFTER_LOOP
838*5f39d1b3SJooyung Han         ":\n"
839*5f39d1b3SJooyung Han 
840*5f39d1b3SJooyung Han         // Multiply first half.
841*5f39d1b3SJooyung Han         "vmull.s8    q4,  d0,  d4\n"
842*5f39d1b3SJooyung Han         "vmull.s8    q5,  d2,  d4\n"
843*5f39d1b3SJooyung Han         "vmull.s8    q6,  d0,  d6\n"
844*5f39d1b3SJooyung Han         "vmull.s8    q7,  d2,  d6\n"
845*5f39d1b3SJooyung Han 
846*5f39d1b3SJooyung Han         // Multiply-accumulate second-half, again into the same
847*5f39d1b3SJooyung Han         // 16bit local accumulator registers. This is where we
848*5f39d1b3SJooyung Han         // take advantage of having int8 instead of uint8 and therefore
849*5f39d1b3SJooyung Han         // being able to accumulate two products into int16.
850*5f39d1b3SJooyung Han         "vmlal.s8    q4,  d1,  d5\n"
851*5f39d1b3SJooyung Han         "vmlal.s8    q5,  d3,  d5\n"
852*5f39d1b3SJooyung Han         "vmlal.s8    q6,  d1,  d7\n"
853*5f39d1b3SJooyung Han         "vmlal.s8    q7,  d3,  d7\n"
854*5f39d1b3SJooyung Han 
855*5f39d1b3SJooyung Han         // Add pairwise, accumulate into 32-bit accumulators.
856*5f39d1b3SJooyung Han         "vpadal.s16   q12, q4\n"
857*5f39d1b3SJooyung Han         "vpadal.s16   q13, q5\n"
858*5f39d1b3SJooyung Han         "vpadal.s16   q14, q6\n"
859*5f39d1b3SJooyung Han         "vpadal.s16   q15, q7\n"
860*5f39d1b3SJooyung Han         "cmp %[start_depth], #0\n"
861*5f39d1b3SJooyung Han 
862*5f39d1b3SJooyung Han         // Reduce 32bit accumulators horizontally.
863*5f39d1b3SJooyung Han         "vpadd.s32 d0, d16, d17\n"
864*5f39d1b3SJooyung Han         "vpadd.s32 d1, d18, d19\n"
865*5f39d1b3SJooyung Han         "vpadd.s32 d2, d20, d21\n"
866*5f39d1b3SJooyung Han         "vpadd.s32 d3, d22, d23\n"
867*5f39d1b3SJooyung Han         "vpadd.s32 d4, d24, d25\n"
868*5f39d1b3SJooyung Han         "vpadd.s32 d5, d26, d27\n"
869*5f39d1b3SJooyung Han         "vpadd.s32 d6, d28, d29\n"
870*5f39d1b3SJooyung Han         "vpadd.s32 d7, d30, d31\n"
871*5f39d1b3SJooyung Han 
872*5f39d1b3SJooyung Han         "bne " GEMMLOWP_LABEL_ACCUMULATE_EXISTING_DST_VALUES
873*5f39d1b3SJooyung Han         "f\n"
874*5f39d1b3SJooyung Han 
875*5f39d1b3SJooyung Han         // Reduce 32bit accumulators horizontally, second pass
876*5f39d1b3SJooyung Han         // (each pass adds pairwise. we need to add 4-wise).
877*5f39d1b3SJooyung Han         "vpadd.s32 d8, d0, d2\n"
878*5f39d1b3SJooyung Han         "vpadd.s32 d9, d4, d6\n"
879*5f39d1b3SJooyung Han         "vpadd.s32 d10, d1, d3\n"
880*5f39d1b3SJooyung Han         "vpadd.s32 d11, d5, d7\n"
881*5f39d1b3SJooyung Han 
882*5f39d1b3SJooyung Han         "b " GEMMLOWP_LABEL_STORE "f\n"
883*5f39d1b3SJooyung Han 
884*5f39d1b3SJooyung Han         GEMMLOWP_LABEL_ACCUMULATE_EXISTING_DST_VALUES
885*5f39d1b3SJooyung Han         ":\n"
886*5f39d1b3SJooyung Han 
887*5f39d1b3SJooyung Han         // Reduce 32bit accumulators horizontally, second pass
888*5f39d1b3SJooyung Han         // (each pass adds pairwise. we need to add 4-wise),
889*5f39d1b3SJooyung Han         // and load destination values from memory.
890*5f39d1b3SJooyung Han         "mov r0, %[dst_ptr]\n"
891*5f39d1b3SJooyung Han         "vld1.32 {d16, d17}, [r0], %[dst_col_stride]\n"
892*5f39d1b3SJooyung Han         "vpadd.s32 d8, d0, d2\n"
893*5f39d1b3SJooyung Han         "vpadd.s32 d9, d4, d6\n"
894*5f39d1b3SJooyung Han         "vld1.32 {d18, d19}, [r0]\n"
895*5f39d1b3SJooyung Han         "vpadd.s32 d10, d1, d3\n"
896*5f39d1b3SJooyung Han         "vpadd.s32 d11, d5, d7\n"
897*5f39d1b3SJooyung Han 
898*5f39d1b3SJooyung Han         // Add horizontally-reduced accumulators into
899*5f39d1b3SJooyung Han         // the values loaded from memory
900*5f39d1b3SJooyung Han         "vadd.s32 q4, q8, q4\n"
901*5f39d1b3SJooyung Han         "vadd.s32 q5, q9, q5\n"
902*5f39d1b3SJooyung Han 
903*5f39d1b3SJooyung Han         GEMMLOWP_LABEL_STORE
904*5f39d1b3SJooyung Han         ":\n"
905*5f39d1b3SJooyung Han         // Store back into memory
906*5f39d1b3SJooyung Han         "mov r0, %[dst_ptr]\n"
907*5f39d1b3SJooyung Han         "vst1.32 {d8, d9}, [r0], %[dst_col_stride]\n"
908*5f39d1b3SJooyung Han         "vst1.32 {d10, d11}, [r0]\n"
909*5f39d1b3SJooyung Han         :  // outputs
910*5f39d1b3SJooyung Han         [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
911*5f39d1b3SJooyung Han         [dst_ptr] "+r"(dst_ptr), [run_depth] "+r"(run_depth)
912*5f39d1b3SJooyung Han         :  // inputs
913*5f39d1b3SJooyung Han         [start_depth] "r"(start_depth),
914*5f39d1b3SJooyung Han         [dst_col_stride] "r"(dst_col_stride)
915*5f39d1b3SJooyung Han         :  // clobbers
916*5f39d1b3SJooyung Han         "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
917*5f39d1b3SJooyung Han         "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17",
918*5f39d1b3SJooyung Han         "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
919*5f39d1b3SJooyung Han         "d28", "d29", "d30", "d31");
920*5f39d1b3SJooyung Han #undef GEMMLOWP_LABEL_LOOP
921*5f39d1b3SJooyung Han #undef GEMMLOWP_LABEL_AFTER_LOOP
922*5f39d1b3SJooyung Han #undef GEMMLOWP_LABEL_ACCUMULATE_EXISTING_DST_VALUES
923*5f39d1b3SJooyung Han #undef GEMMLOWP_LABEL_STORE
924*5f39d1b3SJooyung Han   }
925*5f39d1b3SJooyung Han };
926*5f39d1b3SJooyung Han 
927*5f39d1b3SJooyung Han // Same as NEON_32bit_GEMM_Int8Operands_LhsNonzero, but uses a side format that
928*5f39d1b3SJooyung Han // requires that user inputs were originally int8. This avoids the uint8->int8
929*5f39d1b3SJooyung Han // conversion in the pack step.
930*5f39d1b3SJooyung Han struct NEON_32bit_GEMM_Int8Operands_LhsNonzero_Int8Inputs
931*5f39d1b3SJooyung Han     : NEON_32bit_GEMM_Int8Operands_LhsNonzero {
932*5f39d1b3SJooyung Han   typedef KernelFormat<
933*5f39d1b3SJooyung Han       KernelSideFormatInt8Inputs<CellFormat<4, 16, CellOrder::WidthMajor>, 1>,
934*5f39d1b3SJooyung Han       KernelSideFormatInt8Inputs<CellFormat<2, 16, CellOrder::WidthMajor>, 1> >
935*5f39d1b3SJooyung Han       Format;
936*5f39d1b3SJooyung Han };
937*5f39d1b3SJooyung Han 
938*5f39d1b3SJooyung Han #endif  // GEMMLOWP_NEON_32
939*5f39d1b3SJooyung Han 
940*5f39d1b3SJooyung Han // The kernels here are specifically arm 64bit assembly, not arm 32bit.
941*5f39d1b3SJooyung Han #ifdef GEMMLOWP_NEON_64
942*5f39d1b3SJooyung Han 
943*5f39d1b3SJooyung Han struct NEON_64bit_GEMM_Int8Operands_LhsNonzero : KernelBase {
944*5f39d1b3SJooyung Han   typedef KernelFormat<
945*5f39d1b3SJooyung Han       KernelSideFormatInt8<CellFormat<4, 16, CellOrder::WidthMajor>, 1>,
946*5f39d1b3SJooyung Han       KernelSideFormatInt8<CellFormat<4, 16, CellOrder::WidthMajor>, 1> >
947*5f39d1b3SJooyung Han       Format;
NameNEON_64bit_GEMM_Int8Operands_LhsNonzero948*5f39d1b3SJooyung Han   const char* Name() const override {
949*5f39d1b3SJooyung Han     return "NEON, 4x4, depth 16, accumulating two within signed int16";
950*5f39d1b3SJooyung Han   }
951*5f39d1b3SJooyung Han 
952*5f39d1b3SJooyung Han   // TODO(benoitjacob): reorder function arguments so dst comes last
RunNEON_64bit_GEMM_Int8Operands_LhsNonzero953*5f39d1b3SJooyung Han   void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride,
954*5f39d1b3SJooyung Han            std::size_t dst_col_stride, const std::uint8_t* lhs_ptr,
955*5f39d1b3SJooyung Han            const std::uint8_t* rhs_ptr, std::size_t start_depth,
956*5f39d1b3SJooyung Han            std::size_t run_depth) const override {
957*5f39d1b3SJooyung Han     (void)dst_row_stride;
958*5f39d1b3SJooyung Han #define GEMMLOWP_LABEL_AFTER_LOOP_LAST16 "1"
959*5f39d1b3SJooyung Han #define GEMMLOWP_LABEL_LOOP "2"
960*5f39d1b3SJooyung Han #define GEMMLOWP_LABEL_ACCUMULATE_EXISTING_DST_VALUES "3"
961*5f39d1b3SJooyung Han #define GEMMLOWP_LABEL_STORE "4"
962*5f39d1b3SJooyung Han     asm volatile(
963*5f39d1b3SJooyung Han         // Clear accumulators, and, interleaved with it,
964*5f39d1b3SJooyung Han         // initial loads of the first loop iteration,
965*5f39d1b3SJooyung Han         // taken out of the loop so that in the loop itself we have
966*5f39d1b3SJooyung Han         // optimal streaming of data from memory.
967*5f39d1b3SJooyung Han         "ld1 {v0.16b}, [%[rhs_ptr]], #16\n"
968*5f39d1b3SJooyung Han         "dup v16.4s, wzr\n"
969*5f39d1b3SJooyung Han         "ld1 {v4.16b}, [%[lhs_ptr]], #16\n"
970*5f39d1b3SJooyung Han         "dup v17.4s, wzr\n"
971*5f39d1b3SJooyung Han         "ld1 {v1.16b}, [%[rhs_ptr]], #16\n"
972*5f39d1b3SJooyung Han         "dup v18.4s, wzr\n"
973*5f39d1b3SJooyung Han         "ld1 {v5.16b}, [%[lhs_ptr]], #16\n"
974*5f39d1b3SJooyung Han         "dup v19.4s, wzr\n"
975*5f39d1b3SJooyung Han         "ld1 {v2.16b}, [%[rhs_ptr]], #16\n"
976*5f39d1b3SJooyung Han         "dup v20.4s, wzr\n"
977*5f39d1b3SJooyung Han         "ld1 {v3.16b}, [%[rhs_ptr]], #16\n"
978*5f39d1b3SJooyung Han         "dup v21.4s, wzr\n"
979*5f39d1b3SJooyung Han         "ld1 {v6.16b}, [%[lhs_ptr]], #16\n"
980*5f39d1b3SJooyung Han         "dup v22.4s, wzr\n"
981*5f39d1b3SJooyung Han         "ld1 {v7.16b}, [%[lhs_ptr]], #16\n"
982*5f39d1b3SJooyung Han         "dup v23.4s, wzr\n"
983*5f39d1b3SJooyung Han         "dup v24.4s, wzr\n"
984*5f39d1b3SJooyung Han         "dup v25.4s, wzr\n"
985*5f39d1b3SJooyung Han         "dup v26.4s, wzr\n"
986*5f39d1b3SJooyung Han         "dup v27.4s, wzr\n"
987*5f39d1b3SJooyung Han         "dup v28.4s, wzr\n"
988*5f39d1b3SJooyung Han         "dup v29.4s, wzr\n"
989*5f39d1b3SJooyung Han         "dup v30.4s, wzr\n"
990*5f39d1b3SJooyung Han         "dup v31.4s, wzr\n"
991*5f39d1b3SJooyung Han 
992*5f39d1b3SJooyung Han         // Multiply dst_col_stride by 4 == sizeof(int32) to use
993*5f39d1b3SJooyung Han         // it as a byte offset below.
994*5f39d1b3SJooyung Han         "lsl %[dst_col_stride], %[dst_col_stride], #2\n"
995*5f39d1b3SJooyung Han 
996*5f39d1b3SJooyung Han         // Initial arithmetic of the first loop iteration,
997*5f39d1b3SJooyung Han         // taken out of the loop so that in the loop itself we have
998*5f39d1b3SJooyung Han         // optimal streaming of data from memory.
999*5f39d1b3SJooyung Han         "smull    v8.8h,  v0.8b,  v4.8b\n"
1000*5f39d1b3SJooyung Han         "smull    v9.8h,  v1.8b,  v4.8b\n"
1001*5f39d1b3SJooyung Han         "smull    v10.8h,  v2.8b,  v4.8b\n"
1002*5f39d1b3SJooyung Han         "smull    v11.8h,  v3.8b,  v4.8b\n"
1003*5f39d1b3SJooyung Han         "smull    v12.8h,  v0.8b,  v5.8b\n"
1004*5f39d1b3SJooyung Han         "smull    v13.8h,  v1.8b,  v5.8b\n"
1005*5f39d1b3SJooyung Han         "smull    v14.8h,  v2.8b,  v5.8b\n"
1006*5f39d1b3SJooyung Han         "smull    v15.8h,  v3.8b,  v5.8b\n"
1007*5f39d1b3SJooyung Han 
1008*5f39d1b3SJooyung Han         // Multiply-accumulate second-half, again into the same
1009*5f39d1b3SJooyung Han         // 16bit local accumulator registers. This is where we
1010*5f39d1b3SJooyung Han         // take advantage of having int8 instead of uint8 and therefore
1011*5f39d1b3SJooyung Han         // being able to accumulate two products into int16.
1012*5f39d1b3SJooyung Han         "smlal2   v8.8h,  v0.16b,  v4.16b\n"
1013*5f39d1b3SJooyung Han         "smlal2   v9.8h,  v1.16b,  v4.16b\n"
1014*5f39d1b3SJooyung Han         "smlal2   v10.8h,  v2.16b,  v4.16b\n"
1015*5f39d1b3SJooyung Han         "smlal2   v11.8h,  v3.16b,  v4.16b\n"
1016*5f39d1b3SJooyung Han         "smlal2   v12.8h,  v0.16b,  v5.16b\n"
1017*5f39d1b3SJooyung Han         "smlal2   v13.8h,  v1.16b,  v5.16b\n"
1018*5f39d1b3SJooyung Han         "smlal2   v14.8h,  v2.16b,  v5.16b\n"
1019*5f39d1b3SJooyung Han         "smlal2   v15.8h,  v3.16b,  v5.16b\n"
1020*5f39d1b3SJooyung Han 
1021*5f39d1b3SJooyung Han         "subs %[run_depth], %[run_depth], #16\n"
1022*5f39d1b3SJooyung Han 
1023*5f39d1b3SJooyung Han         // If the loop depth is only 16, then we can skip the general loop
1024*5f39d1b3SJooyung Han         // and go straight to the final part of the code.
1025*5f39d1b3SJooyung Han         "beq " GEMMLOWP_LABEL_AFTER_LOOP_LAST16 "f\n"
1026*5f39d1b3SJooyung Han 
1027*5f39d1b3SJooyung Han         // General loop.
1028*5f39d1b3SJooyung Han         GEMMLOWP_LABEL_LOOP
1029*5f39d1b3SJooyung Han         ":\n"
1030*5f39d1b3SJooyung Han 
1031*5f39d1b3SJooyung Han         // Overview of register layout:
1032*5f39d1b3SJooyung Han         //
1033*5f39d1b3SJooyung Han         // A 4x16 block of Rhs is stored in 8 bit in v0--v3.
1034*5f39d1b3SJooyung Han         // A 4x16 block of Lhs is stored in 8 bit in v4--v7.
1035*5f39d1b3SJooyung Han         //
1036*5f39d1b3SJooyung Han         // A 4x4 block of accumulators is stored in v16-v31 (as 4x32 bit
1037*5f39d1b3SJooyung Han         // components which need to be horizontally-added at the end)
1038*5f39d1b3SJooyung Han         //
1039*5f39d1b3SJooyung Han         // The Lhs vectors are multiplied by the Rhs vectors with a widening
1040*5f39d1b3SJooyung Han         // multiply over the 8 first levels of depth, producing int16x8
1041*5f39d1b3SJooyung Han         // vectors of products for each position in the accumulator matrix.
1042*5f39d1b3SJooyung Han         // Here comes the special trick: since the operands are signed int8,
1043*5f39d1b3SJooyung Han         // their range being [ -2^7 , 2^7 ), their products are in range
1044*5f39d1b3SJooyung Han         // [ -2^14 , 2^14 - 1 ), meaning that we can add two such values
1045*5f39d1b3SJooyung Han         // without any risk of overflowing int16.
1046*5f39d1b3SJooyung Han         // We thus proceed with the 8 next levels of depth, multiplying
1047*5f39d1b3SJooyung Han         // again Lhs by Rhs, accumulating into this existing int16x8 vector.
1048*5f39d1b3SJooyung Han         //
1049*5f39d1b3SJooyung Han         // Only then, having processed 16 levels of depth, do we need to
1050*5f39d1b3SJooyung Han         // horizontally add these int16x8 accumulators into the final
1051*5f39d1b3SJooyung Han         // int32x4 accumulators.
1052*5f39d1b3SJooyung Han         //
1053*5f39d1b3SJooyung Han         // As we do not have enough registers to store all 16 int16x8
1054*5f39d1b3SJooyung Han         // temporary-16bit-accumulators, we have them cycle through v8--v15.
1055*5f39d1b3SJooyung Han         //
1056*5f39d1b3SJooyung Han         //
1057*5f39d1b3SJooyung Han         // Register layout (ignoring the v8--v15 temporary 16bit accumulators):
1058*5f39d1b3SJooyung Han         //
1059*5f39d1b3SJooyung Han         //                               +--------+--------+--------+--------+
1060*5f39d1b3SJooyung Han         //                               |v0.b[0] |v1.b[0] |v2.b[0] |v3.b[0] |
1061*5f39d1b3SJooyung Han         //                          Rhs  +--------+--------+--------+--------+
1062*5f39d1b3SJooyung Han         //                               |  ...   |  ...   |  ...   |  ...   |
1063*5f39d1b3SJooyung Han         //                               +--------+--------+--------+--------|
1064*5f39d1b3SJooyung Han         //                               |v0.b[15]|v1.b[15]|v2.b[15]|v3.b[15]|
1065*5f39d1b3SJooyung Han         //                               +--------+--------+--------+--------+
1066*5f39d1b3SJooyung Han         //
1067*5f39d1b3SJooyung Han         //                               |        |        |        |        |
1068*5f39d1b3SJooyung Han         //
1069*5f39d1b3SJooyung Han         //    Lhs                        |        |        |        |        |
1070*5f39d1b3SJooyung Han         //
1071*5f39d1b3SJooyung Han         //  +-------+-----+--------+ - - +--------+--------+--------+--------+
1072*5f39d1b3SJooyung Han         //  |v4.b[0]| ... |v4.b[15]|     | v16.4s | v17.4s | v18.4s | v19.4s |
1073*5f39d1b3SJooyung Han         //  |v5.b[0]| ... |v5.b[15]|     | v20.4s | v21.4s | v22.4s | v23.4s |
1074*5f39d1b3SJooyung Han         //  |v6.b[0]| ... |v6.b[15]|     | v24.4s | v25.4s | v26.4s | v27.4s |
1075*5f39d1b3SJooyung Han         //  |v7.b[0]| ... |v7.b[15]|     | v28.4s | v29.4s | v30.4s | v31.4s |
1076*5f39d1b3SJooyung Han         //  +-------+--------------+ - - +--------+--------+--------+--------+
1077*5f39d1b3SJooyung Han         //
1078*5f39d1b3SJooyung Han         //                                                Accumulator
1079*5f39d1b3SJooyung Han         //
1080*5f39d1b3SJooyung Han 
1081*5f39d1b3SJooyung Han         // Some multiplications and 16-bit accumulation were already done above,
1082*5f39d1b3SJooyung Han         // so we start right away in the middle.
1083*5f39d1b3SJooyung Han         "sadalp  v16.4s, v8.8h\n"
1084*5f39d1b3SJooyung Han         "ld1 {v4.16b}, [%[lhs_ptr]], #16\n"
1085*5f39d1b3SJooyung Han         "smull    v8.8h,  v0.8b,  v6.8b\n"
1086*5f39d1b3SJooyung Han         "sadalp  v17.4s, v9.8h\n"
1087*5f39d1b3SJooyung Han         "ld1 {v5.16b}, [%[lhs_ptr]], #16\n"
1088*5f39d1b3SJooyung Han         "smull    v9.8h,  v1.8b,  v6.8b\n"
1089*5f39d1b3SJooyung Han         "sadalp  v18.4s, v10.8h\n"
1090*5f39d1b3SJooyung Han         "smull    v10.8h,  v2.8b,  v6.8b\n"
1091*5f39d1b3SJooyung Han         "sadalp  v19.4s, v11.8h\n"
1092*5f39d1b3SJooyung Han         "smull    v11.8h,  v3.8b,  v6.8b\n"
1093*5f39d1b3SJooyung Han         "sadalp  v20.4s, v12.8h\n"
1094*5f39d1b3SJooyung Han         "smull    v12.8h,  v0.8b,  v7.8b\n"
1095*5f39d1b3SJooyung Han         "sadalp  v21.4s, v13.8h\n"
1096*5f39d1b3SJooyung Han         "smull    v13.8h,  v1.8b,  v7.8b\n"
1097*5f39d1b3SJooyung Han         "sadalp  v22.4s, v14.8h\n"
1098*5f39d1b3SJooyung Han         "smull    v14.8h,  v2.8b,  v7.8b\n"
1099*5f39d1b3SJooyung Han         "sadalp  v23.4s, v15.8h\n"
1100*5f39d1b3SJooyung Han         "smull    v15.8h,  v3.8b,  v7.8b\n"
1101*5f39d1b3SJooyung Han 
1102*5f39d1b3SJooyung Han         // Multiply-accumulate second-half, again into the same
1103*5f39d1b3SJooyung Han         // 16bit local accumulator registers. This is where we
1104*5f39d1b3SJooyung Han         // take advantage of having int8 instead of uint8 and therefore
1105*5f39d1b3SJooyung Han         // being able to accumulate two products into int16.
1106*5f39d1b3SJooyung Han         "smlal2   v8.8h,  v0.16b,  v6.16b\n"
1107*5f39d1b3SJooyung Han         "smlal2   v9.8h,  v1.16b,  v6.16b\n"
1108*5f39d1b3SJooyung Han         "smlal2   v10.8h,  v2.16b,  v6.16b\n"
1109*5f39d1b3SJooyung Han         "smlal2   v11.8h,  v3.16b,  v6.16b\n"
1110*5f39d1b3SJooyung Han 
1111*5f39d1b3SJooyung Han         "ld1 {v6.16b}, [%[lhs_ptr]], #16\n"
1112*5f39d1b3SJooyung Han 
1113*5f39d1b3SJooyung Han         "smlal2   v12.8h,  v0.16b,  v7.16b\n"
1114*5f39d1b3SJooyung Han         "ld1 {v0.16b}, [%[rhs_ptr]], #16\n"
1115*5f39d1b3SJooyung Han         "smlal2   v13.8h,  v1.16b,  v7.16b\n"
1116*5f39d1b3SJooyung Han         "ld1 {v1.16b}, [%[rhs_ptr]], #16\n"
1117*5f39d1b3SJooyung Han         "smlal2   v14.8h,  v2.16b,  v7.16b\n"
1118*5f39d1b3SJooyung Han         "ld1 {v2.16b}, [%[rhs_ptr]], #16\n"
1119*5f39d1b3SJooyung Han         "smlal2   v15.8h,  v3.16b,  v7.16b\n"
1120*5f39d1b3SJooyung Han         "ld1 {v3.16b}, [%[rhs_ptr]], #16\n"
1121*5f39d1b3SJooyung Han 
1122*5f39d1b3SJooyung Han         "sadalp  v24.4s, v8.8h\n"
1123*5f39d1b3SJooyung Han         "smull    v8.8h,  v0.8b,  v4.8b\n"
1124*5f39d1b3SJooyung Han         "sadalp  v25.4s, v9.8h\n"
1125*5f39d1b3SJooyung Han         "ld1 {v7.16b}, [%[lhs_ptr]], #16\n"
1126*5f39d1b3SJooyung Han         "smull    v9.8h,  v1.8b,  v4.8b\n"
1127*5f39d1b3SJooyung Han         "sadalp  v26.4s, v10.8h\n"
1128*5f39d1b3SJooyung Han         "smull    v10.8h,  v2.8b,  v4.8b\n"
1129*5f39d1b3SJooyung Han         "sadalp  v27.4s, v11.8h\n"
1130*5f39d1b3SJooyung Han         "smull    v11.8h,  v3.8b,  v4.8b\n"
1131*5f39d1b3SJooyung Han         "sadalp  v28.4s, v12.8h\n"
1132*5f39d1b3SJooyung Han         "smull    v12.8h,  v0.8b,  v5.8b\n"
1133*5f39d1b3SJooyung Han         "sadalp  v29.4s, v13.8h\n"
1134*5f39d1b3SJooyung Han         "smull    v13.8h,  v1.8b,  v5.8b\n"
1135*5f39d1b3SJooyung Han         "sadalp  v30.4s, v14.8h\n"
1136*5f39d1b3SJooyung Han         "smull    v14.8h,  v2.8b,  v5.8b\n"
1137*5f39d1b3SJooyung Han         "sadalp  v31.4s, v15.8h\n"
1138*5f39d1b3SJooyung Han         "smull    v15.8h,  v3.8b,  v5.8b\n"
1139*5f39d1b3SJooyung Han 
1140*5f39d1b3SJooyung Han         // Multiply-accumulate second-half, again into the same
1141*5f39d1b3SJooyung Han         // 16bit local accumulator registers. This is where we
1142*5f39d1b3SJooyung Han         // take advantage of having int8 instead of uint8 and therefore
1143*5f39d1b3SJooyung Han         // being able to accumulate two products into int16.
1144*5f39d1b3SJooyung Han         "smlal2   v8.8h,  v0.16b,  v4.16b\n"
1145*5f39d1b3SJooyung Han         "smlal2   v9.8h,  v1.16b,  v4.16b\n"
1146*5f39d1b3SJooyung Han         "smlal2   v10.8h,  v2.16b,  v4.16b\n"
1147*5f39d1b3SJooyung Han         "smlal2   v11.8h,  v3.16b,  v4.16b\n"
1148*5f39d1b3SJooyung Han 
1149*5f39d1b3SJooyung Han         // Loop. Decrement loop index (depth) by 16, since we just handled
1150*5f39d1b3SJooyung Han         // 16 levels of depth.  Do this subs a bit before the end of the loop
1151*5f39d1b3SJooyung Han         // for better dispatch on A57.
1152*5f39d1b3SJooyung Han         "subs %[run_depth], %[run_depth], #16\n"
1153*5f39d1b3SJooyung Han 
1154*5f39d1b3SJooyung Han         "smlal2   v12.8h,  v0.16b,  v5.16b\n"
1155*5f39d1b3SJooyung Han         "smlal2   v13.8h,  v1.16b,  v5.16b\n"
1156*5f39d1b3SJooyung Han         "smlal2   v14.8h,  v2.16b,  v5.16b\n"
1157*5f39d1b3SJooyung Han         "smlal2   v15.8h,  v3.16b,  v5.16b\n"
1158*5f39d1b3SJooyung Han 
1159*5f39d1b3SJooyung Han         "bne " GEMMLOWP_LABEL_LOOP "b\n"
1160*5f39d1b3SJooyung Han 
1161*5f39d1b3SJooyung Han         // Final code for the last 16 levels of depth.
1162*5f39d1b3SJooyung Han         // There is nothing to load anymore, only some arithmetic to finish.
1163*5f39d1b3SJooyung Han         GEMMLOWP_LABEL_AFTER_LOOP_LAST16
1164*5f39d1b3SJooyung Han         ":\n"
1165*5f39d1b3SJooyung Han 
1166*5f39d1b3SJooyung Han         // Some multiplications and 16-bit accumulation were already done above,
1167*5f39d1b3SJooyung Han         // so we start right away in the middle.
1168*5f39d1b3SJooyung Han         "sadalp  v16.4s, v8.8h\n"
1169*5f39d1b3SJooyung Han         "smull    v8.8h,  v0.8b,  v6.8b\n"
1170*5f39d1b3SJooyung Han         "sadalp  v17.4s, v9.8h\n"
1171*5f39d1b3SJooyung Han         "smull    v9.8h,  v1.8b,  v6.8b\n"
1172*5f39d1b3SJooyung Han         "sadalp  v18.4s, v10.8h\n"
1173*5f39d1b3SJooyung Han         "smull    v10.8h,  v2.8b,  v6.8b\n"
1174*5f39d1b3SJooyung Han         "sadalp  v19.4s, v11.8h\n"
1175*5f39d1b3SJooyung Han         "smull    v11.8h,  v3.8b,  v6.8b\n"
1176*5f39d1b3SJooyung Han         "sadalp  v20.4s, v12.8h\n"
1177*5f39d1b3SJooyung Han         "smull    v12.8h,  v0.8b,  v7.8b\n"
1178*5f39d1b3SJooyung Han         "sadalp  v21.4s, v13.8h\n"
1179*5f39d1b3SJooyung Han         "smull    v13.8h,  v1.8b,  v7.8b\n"
1180*5f39d1b3SJooyung Han         "sadalp  v22.4s, v14.8h\n"
1181*5f39d1b3SJooyung Han         "smull    v14.8h,  v2.8b,  v7.8b\n"
1182*5f39d1b3SJooyung Han         "sadalp  v23.4s, v15.8h\n"
1183*5f39d1b3SJooyung Han         "smull    v15.8h,  v3.8b,  v7.8b\n"
1184*5f39d1b3SJooyung Han 
1185*5f39d1b3SJooyung Han         // Multiply-accumulate second-half, again into the same
1186*5f39d1b3SJooyung Han         // 16bit local accumulator registers. This is where we
1187*5f39d1b3SJooyung Han         // take advantage of having int8 instead of uint8 and therefore
1188*5f39d1b3SJooyung Han         // being able to accumulate two products into int16.
1189*5f39d1b3SJooyung Han         "smlal2   v8.8h,  v0.16b,  v6.16b\n"
1190*5f39d1b3SJooyung Han         "smlal2   v9.8h,  v1.16b,  v6.16b\n"
1191*5f39d1b3SJooyung Han         "smlal2   v10.8h,  v2.16b,  v6.16b\n"
1192*5f39d1b3SJooyung Han         "smlal2   v11.8h,  v3.16b,  v6.16b\n"
1193*5f39d1b3SJooyung Han         "smlal2   v12.8h,  v0.16b,  v7.16b\n"
1194*5f39d1b3SJooyung Han         "smlal2   v13.8h,  v1.16b,  v7.16b\n"
1195*5f39d1b3SJooyung Han         "smlal2   v14.8h,  v2.16b,  v7.16b\n"
1196*5f39d1b3SJooyung Han         "smlal2   v15.8h,  v3.16b,  v7.16b\n"
1197*5f39d1b3SJooyung Han 
1198*5f39d1b3SJooyung Han         "sadalp  v24.4s, v8.8h\n"
1199*5f39d1b3SJooyung Han         "sadalp  v25.4s, v9.8h\n"
1200*5f39d1b3SJooyung Han         "sadalp  v26.4s, v10.8h\n"
1201*5f39d1b3SJooyung Han         "sadalp  v27.4s, v11.8h\n"
1202*5f39d1b3SJooyung Han         "sadalp  v28.4s, v12.8h\n"
1203*5f39d1b3SJooyung Han         "sadalp  v29.4s, v13.8h\n"
1204*5f39d1b3SJooyung Han         "sadalp  v30.4s, v14.8h\n"
1205*5f39d1b3SJooyung Han         "sadalp  v31.4s, v15.8h\n"
1206*5f39d1b3SJooyung Han 
1207*5f39d1b3SJooyung Han         // Reduce 32bit accumulators horizontally.
1208*5f39d1b3SJooyung Han         "addp v0.4s, v16.4s, v20.4s\n"
1209*5f39d1b3SJooyung Han         "addp v2.4s, v17.4s, v21.4s\n"
1210*5f39d1b3SJooyung Han         "addp v4.4s, v18.4s, v22.4s\n"
1211*5f39d1b3SJooyung Han         "addp v6.4s, v19.4s, v23.4s\n"
1212*5f39d1b3SJooyung Han         "addp v1.4s, v24.4s, v28.4s\n"
1213*5f39d1b3SJooyung Han         "addp v3.4s, v25.4s, v29.4s\n"
1214*5f39d1b3SJooyung Han         "addp v5.4s, v26.4s, v30.4s\n"
1215*5f39d1b3SJooyung Han         "addp v7.4s, v27.4s, v31.4s\n"
1216*5f39d1b3SJooyung Han 
1217*5f39d1b3SJooyung Han         "cmp %[start_depth], #0\n"
1218*5f39d1b3SJooyung Han         "bne " GEMMLOWP_LABEL_ACCUMULATE_EXISTING_DST_VALUES
1219*5f39d1b3SJooyung Han         "f\n"
1220*5f39d1b3SJooyung Han 
1221*5f39d1b3SJooyung Han         // Reduce 32bit accumulators horizontally, second pass
1222*5f39d1b3SJooyung Han         // (each pass adds pairwise. we need to add 4-wise).
1223*5f39d1b3SJooyung Han         "addp v12.4s, v0.4s, v1.4s\n"
1224*5f39d1b3SJooyung Han         "addp v13.4s, v2.4s, v3.4s\n"
1225*5f39d1b3SJooyung Han         "addp v14.4s, v4.4s, v5.4s\n"
1226*5f39d1b3SJooyung Han         "addp v15.4s, v6.4s, v7.4s\n"
1227*5f39d1b3SJooyung Han 
1228*5f39d1b3SJooyung Han         "b " GEMMLOWP_LABEL_STORE "f\n"
1229*5f39d1b3SJooyung Han 
1230*5f39d1b3SJooyung Han         GEMMLOWP_LABEL_ACCUMULATE_EXISTING_DST_VALUES
1231*5f39d1b3SJooyung Han         ":\n"
1232*5f39d1b3SJooyung Han 
1233*5f39d1b3SJooyung Han         // Reduce 32bit accumulators horizontally, second pass
1234*5f39d1b3SJooyung Han         // (each pass adds pairwise. we need to add 4-wise),
1235*5f39d1b3SJooyung Han         // and load destination values from memory.
1236*5f39d1b3SJooyung Han         "mov x0, %[dst_ptr]\n"
1237*5f39d1b3SJooyung Han         "ld1 {v12.16b}, [x0], %[dst_col_stride]\n"
1238*5f39d1b3SJooyung Han         "addp v8.4s, v0.4s, v1.4s\n"
1239*5f39d1b3SJooyung Han         "ld1 {v13.16b}, [x0], %[dst_col_stride]\n"
1240*5f39d1b3SJooyung Han         "addp v9.4s, v2.4s, v3.4s\n"
1241*5f39d1b3SJooyung Han         "ld1 {v14.16b}, [x0], %[dst_col_stride]\n"
1242*5f39d1b3SJooyung Han         "addp v10.4s, v4.4s, v5.4s\n"
1243*5f39d1b3SJooyung Han         "ld1 {v15.16b}, [x0]\n"
1244*5f39d1b3SJooyung Han         "addp v11.4s, v6.4s, v7.4s\n"
1245*5f39d1b3SJooyung Han 
1246*5f39d1b3SJooyung Han         // Add horizontally-reduced accumulators into
1247*5f39d1b3SJooyung Han         // the values loaded from memory
1248*5f39d1b3SJooyung Han         "add v12.4s, v12.4s, v8.4s\n"
1249*5f39d1b3SJooyung Han         "add v13.4s, v13.4s, v9.4s\n"
1250*5f39d1b3SJooyung Han         "add v14.4s, v14.4s, v10.4s\n"
1251*5f39d1b3SJooyung Han         "add v15.4s, v15.4s, v11.4s\n"
1252*5f39d1b3SJooyung Han 
1253*5f39d1b3SJooyung Han         GEMMLOWP_LABEL_STORE
1254*5f39d1b3SJooyung Han         ":\n"
1255*5f39d1b3SJooyung Han         // Store back into memory
1256*5f39d1b3SJooyung Han         "mov x0, %[dst_ptr]\n"
1257*5f39d1b3SJooyung Han         "st1 {v12.16b}, [x0], %[dst_col_stride]\n"
1258*5f39d1b3SJooyung Han         "st1 {v13.16b}, [x0], %[dst_col_stride]\n"
1259*5f39d1b3SJooyung Han         "st1 {v14.16b}, [x0], %[dst_col_stride]\n"
1260*5f39d1b3SJooyung Han         "st1 {v15.16b}, [x0]\n"
1261*5f39d1b3SJooyung Han         :  // outputs
1262*5f39d1b3SJooyung Han         [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
1263*5f39d1b3SJooyung Han         [dst_ptr] "+r"(dst_ptr), [run_depth] "+r"(run_depth),
1264*5f39d1b3SJooyung Han         [dst_col_stride] "+r"(dst_col_stride)
1265*5f39d1b3SJooyung Han         :  // inputs
1266*5f39d1b3SJooyung Han         [start_depth] "r"(start_depth)
1267*5f39d1b3SJooyung Han         :  // clobbers
1268*5f39d1b3SJooyung Han         "cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1269*5f39d1b3SJooyung Han         "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
1270*5f39d1b3SJooyung Han         "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
1271*5f39d1b3SJooyung Han         "v28", "v29", "v30", "v31");
1272*5f39d1b3SJooyung Han #undef GEMMLOWP_LABEL_LOOP
1273*5f39d1b3SJooyung Han #undef GEMMLOWP_LABEL_AFTER_LOOP_LAST16
1274*5f39d1b3SJooyung Han #undef GEMMLOWP_LABEL_ACCUMULATE_EXISTING_DST_VALUES
1275*5f39d1b3SJooyung Han #undef GEMMLOWP_LABEL_STORE
1276*5f39d1b3SJooyung Han   }
1277*5f39d1b3SJooyung Han };
1278*5f39d1b3SJooyung Han 
1279*5f39d1b3SJooyung Han // Same as NEON_32bit_GEMM_Int8Operands_LhsNonzero, but uses a side format that
1280*5f39d1b3SJooyung Han // requires that user inputs were originally int8. This avoids the uint8->int8
1281*5f39d1b3SJooyung Han // conversion in the pack step.
1282*5f39d1b3SJooyung Han struct NEON_64bit_GEMM_Int8Operands_LhsNonzero_Int8Inputs
1283*5f39d1b3SJooyung Han     : NEON_64bit_GEMM_Int8Operands_LhsNonzero {
1284*5f39d1b3SJooyung Han   typedef KernelFormat<
1285*5f39d1b3SJooyung Han       KernelSideFormatInt8Inputs<CellFormat<4, 16, CellOrder::WidthMajor>, 1>,
1286*5f39d1b3SJooyung Han       KernelSideFormatInt8Inputs<CellFormat<4, 16, CellOrder::WidthMajor>, 1> >
1287*5f39d1b3SJooyung Han       Format;
1288*5f39d1b3SJooyung Han };
1289*5f39d1b3SJooyung Han 
1290*5f39d1b3SJooyung Han // Our main GEMM kernel.
1291*5f39d1b3SJooyung Han struct NEON_64_Kernel12x8Depth2 : KernelBase {
1292*5f39d1b3SJooyung Han   typedef KernelFormat<KernelSideFormat<CellFormat<4, 2>, 3>,
1293*5f39d1b3SJooyung Han                        KernelSideFormat<CellFormat<4, 2>, 2> >
1294*5f39d1b3SJooyung Han       Format;
1295*5f39d1b3SJooyung Han 
NameNEON_64_Kernel12x8Depth21296*5f39d1b3SJooyung Han   const char* Name() const override { return "NEON, 12x8, depth 2"; }
1297*5f39d1b3SJooyung Han 
1298*5f39d1b3SJooyung Han   // TODO(benoitjacob): reorder function arguments so dst comes last
RunNEON_64_Kernel12x8Depth21299*5f39d1b3SJooyung Han   void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride,
1300*5f39d1b3SJooyung Han            std::size_t dst_col_stride, const std::uint8_t* lhs_ptr,
1301*5f39d1b3SJooyung Han            const std::uint8_t* rhs_ptr, std::size_t start_depth,
1302*5f39d1b3SJooyung Han            std::size_t run_depth) const override {
1303*5f39d1b3SJooyung Han     (void)dst_row_stride;
1304*5f39d1b3SJooyung Han     ScopedProfilingLabel label("optimized kernel (NEON 12x8)");
1305*5f39d1b3SJooyung Han // See comments above for why we need local numerical labels in our asm.
1306*5f39d1b3SJooyung Han #define GEMMLOWP_LABEL_CLEAR_ACCUMULATORS "1"
1307*5f39d1b3SJooyung Han #define GEMMLOWP_LABEL_BEFORE_LOOP "2"
1308*5f39d1b3SJooyung Han #define GEMMLOWP_LABEL_LOOP "3"
1309*5f39d1b3SJooyung Han #define GEMMLOWP_LABEL_AFTER_LOOP "4"
1310*5f39d1b3SJooyung Han 
1311*5f39d1b3SJooyung Han     assert(dst_row_stride == 1);
1312*5f39d1b3SJooyung Han     asm volatile(
1313*5f39d1b3SJooyung Han         // Load 1 Rhs cell of size 2x8
1314*5f39d1b3SJooyung Han         "ld1 {v5.8b}, [%[rhs_ptr]], #8\n"
1315*5f39d1b3SJooyung Han         "ld1 {v6.8b}, [%[rhs_ptr]], #8\n"
1316*5f39d1b3SJooyung Han 
1317*5f39d1b3SJooyung Han         // Load 3 Lhs cells of size 4x2 each
1318*5f39d1b3SJooyung Han         "ld1 {v2.8b}, [%[lhs_ptr]], #8\n"
1319*5f39d1b3SJooyung Han         "ld1 {v3.8b}, [%[lhs_ptr]], #8\n"
1320*5f39d1b3SJooyung Han         "ld1 {v4.8b}, [%[lhs_ptr]], #8\n"
1321*5f39d1b3SJooyung Han 
1322*5f39d1b3SJooyung Han         // Multiply dst_col_stride by 4 == sizeof(int32) to use
1323*5f39d1b3SJooyung Han         // it as a byte offset below.
1324*5f39d1b3SJooyung Han         "lsl %[dst_col_stride], %[dst_col_stride], #2\n"
1325*5f39d1b3SJooyung Han 
1326*5f39d1b3SJooyung Han         "cmp %[start_depth], #0\n"
1327*5f39d1b3SJooyung Han         "beq " GEMMLOWP_LABEL_CLEAR_ACCUMULATORS
1328*5f39d1b3SJooyung Han         "f\n"
1329*5f39d1b3SJooyung Han 
1330*5f39d1b3SJooyung Han         // Load accumulators
1331*5f39d1b3SJooyung Han         "mov x1, %[dst_ptr]\n"
1332*5f39d1b3SJooyung Han         "mov x0, x1\n"
1333*5f39d1b3SJooyung Han         "ld1 {v8.16b}, [x0], #16\n"
1334*5f39d1b3SJooyung Han         "subs %[run_depth], %[run_depth], #2\n"
1335*5f39d1b3SJooyung Han         "ld1 {v16.16b}, [x0], #16\n"
1336*5f39d1b3SJooyung Han         "add x1, x1, %[dst_col_stride]\n"
1337*5f39d1b3SJooyung Han         "ld1 {v24.16b}, [x0]\n"
1338*5f39d1b3SJooyung Han         "mov x0, x1\n"
1339*5f39d1b3SJooyung Han         "ld1 {v9.16b}, [x0], #16\n"
1340*5f39d1b3SJooyung Han         "add x1, x1, %[dst_col_stride]\n"
1341*5f39d1b3SJooyung Han         "ld1 {v17.16b}, [x0], #16\n"
1342*5f39d1b3SJooyung Han         "ld1 {v25.16b}, [x0]\n"
1343*5f39d1b3SJooyung Han         "mov x0, x1\n"
1344*5f39d1b3SJooyung Han         "ld1 {v10.16b}, [x0], #16\n"
1345*5f39d1b3SJooyung Han         "add x1, x1, %[dst_col_stride]\n"
1346*5f39d1b3SJooyung Han         "ld1 {v18.16b}, [x0], #16\n"
1347*5f39d1b3SJooyung Han         "ld1 {v26.16b}, [x0]\n"
1348*5f39d1b3SJooyung Han         "mov x0, x1\n"
1349*5f39d1b3SJooyung Han         "ld1 {v11.16b}, [x0], #16\n"
1350*5f39d1b3SJooyung Han         "add x1, x1, %[dst_col_stride]\n"
1351*5f39d1b3SJooyung Han         "ld1 {v19.16b}, [x0], #16\n"
1352*5f39d1b3SJooyung Han         "ld1 {v27.16b}, [x0]\n"
1353*5f39d1b3SJooyung Han         "mov x0, x1\n"
1354*5f39d1b3SJooyung Han         "ld1 {v12.16b}, [x0], #16\n"
1355*5f39d1b3SJooyung Han         "add x1, x1, %[dst_col_stride]\n"
1356*5f39d1b3SJooyung Han         "ld1 {v20.16b}, [x0], #16\n"
1357*5f39d1b3SJooyung Han         "ld1 {v28.16b}, [x0]\n"
1358*5f39d1b3SJooyung Han         "mov x0, x1\n"
1359*5f39d1b3SJooyung Han         "ld1 {v13.16b}, [x0], #16\n"
1360*5f39d1b3SJooyung Han         "add x1, x1, %[dst_col_stride]\n"
1361*5f39d1b3SJooyung Han         "ld1 {v21.16b}, [x0], #16\n"
1362*5f39d1b3SJooyung Han         "ld1 {v29.16b}, [x0]\n"
1363*5f39d1b3SJooyung Han         "mov x0, x1\n"
1364*5f39d1b3SJooyung Han         "ld1 {v14.16b}, [x0], #16\n"
1365*5f39d1b3SJooyung Han         "add x1, x1, %[dst_col_stride]\n"
1366*5f39d1b3SJooyung Han         "ld1 {v22.16b}, [x0], #16\n"
1367*5f39d1b3SJooyung Han         "ld1 {v30.16b}, [x0]\n"
1368*5f39d1b3SJooyung Han         "mov x0, x1\n"
1369*5f39d1b3SJooyung Han         "ld1 {v15.16b}, [x0], #16\n"
1370*5f39d1b3SJooyung Han         "ld1 {v23.16b}, [x0], #16\n"
1371*5f39d1b3SJooyung Han         "ld1 {v31.16b}, [x0]\n"
1372*5f39d1b3SJooyung Han 
1373*5f39d1b3SJooyung Han         "b " GEMMLOWP_LABEL_BEFORE_LOOP "f\n"
1374*5f39d1b3SJooyung Han 
1375*5f39d1b3SJooyung Han         GEMMLOWP_LABEL_CLEAR_ACCUMULATORS
1376*5f39d1b3SJooyung Han         ":\n"
1377*5f39d1b3SJooyung Han 
1378*5f39d1b3SJooyung Han         // Clear accumulator registers (see layout below)
1379*5f39d1b3SJooyung Han         "dup v8.4s, wzr\n"
1380*5f39d1b3SJooyung Han         "subs %[run_depth], %[run_depth], #2\n"
1381*5f39d1b3SJooyung Han         "dup v9.4s, wzr\n"
1382*5f39d1b3SJooyung Han         "dup v10.4s, wzr\n"
1383*5f39d1b3SJooyung Han         "dup v11.4s, wzr\n"
1384*5f39d1b3SJooyung Han         "dup v12.4s, wzr\n"
1385*5f39d1b3SJooyung Han         "dup v13.4s, wzr\n"
1386*5f39d1b3SJooyung Han         "dup v14.4s, wzr\n"
1387*5f39d1b3SJooyung Han         "dup v15.4s, wzr\n"
1388*5f39d1b3SJooyung Han         "dup v16.4s, wzr\n"
1389*5f39d1b3SJooyung Han         "dup v17.4s, wzr\n"
1390*5f39d1b3SJooyung Han         "dup v18.4s, wzr\n"
1391*5f39d1b3SJooyung Han         "dup v19.4s, wzr\n"
1392*5f39d1b3SJooyung Han         "dup v20.4s, wzr\n"
1393*5f39d1b3SJooyung Han         "dup v21.4s, wzr\n"
1394*5f39d1b3SJooyung Han         "dup v22.4s, wzr\n"
1395*5f39d1b3SJooyung Han         "dup v23.4s, wzr\n"
1396*5f39d1b3SJooyung Han         "dup v24.4s, wzr\n"
1397*5f39d1b3SJooyung Han         "dup v25.4s, wzr\n"
1398*5f39d1b3SJooyung Han         "dup v26.4s, wzr\n"
1399*5f39d1b3SJooyung Han         "dup v27.4s, wzr\n"
1400*5f39d1b3SJooyung Han         "dup v28.4s, wzr\n"
1401*5f39d1b3SJooyung Han         "dup v29.4s, wzr\n"
1402*5f39d1b3SJooyung Han         "dup v30.4s, wzr\n"
1403*5f39d1b3SJooyung Han         "dup v31.4s, wzr\n"
1404*5f39d1b3SJooyung Han 
1405*5f39d1b3SJooyung Han         GEMMLOWP_LABEL_BEFORE_LOOP
1406*5f39d1b3SJooyung Han         ":\n"
1407*5f39d1b3SJooyung Han 
1408*5f39d1b3SJooyung Han         "beq " GEMMLOWP_LABEL_AFTER_LOOP "f\n"
1409*5f39d1b3SJooyung Han 
1410*5f39d1b3SJooyung Han         GEMMLOWP_LABEL_LOOP
1411*5f39d1b3SJooyung Han         ":\n"
1412*5f39d1b3SJooyung Han 
1413*5f39d1b3SJooyung Han         // Overview of register layout:
1414*5f39d1b3SJooyung Han         //
1415*5f39d1b3SJooyung Han         // A 2x8 block of 2 2x4 cells of Rhs is stored in 16bit in v0--v1.
1416*5f39d1b3SJooyung Han         // A 12x2 block of 3 4x2 cells Lhs is stored in 16bit in v2--v4.
1417*5f39d1b3SJooyung Han         // A 12x8 block of accumulators is stored in 32bit in v8--v31.
1418*5f39d1b3SJooyung Han         //
1419*5f39d1b3SJooyung Han         //                         +--------+--------+-----+--------+--------+
1420*5f39d1b3SJooyung Han         //                         |v0.h[0] |v0.h[1] | ... |v1.h[2] |v1.h[3] |
1421*5f39d1b3SJooyung Han         //                    Rhs  +--------+--------+-----+--------+--------+
1422*5f39d1b3SJooyung Han         //                         |v0.h[4] |v0.h[5] | ... |v1.h[6] |v1.h[7] |
1423*5f39d1b3SJooyung Han         //                         +--------+--------+-----+--------+--------+
1424*5f39d1b3SJooyung Han         //
1425*5f39d1b3SJooyung Han         //                         |        |        |     |        |        |
1426*5f39d1b3SJooyung Han         //
1427*5f39d1b3SJooyung Han         //    Lhs                  |        |        |     |        |        |
1428*5f39d1b3SJooyung Han         //
1429*5f39d1b3SJooyung Han         //  +-------+-------+ - -  +--------+--------+-----+--------+--------+
1430*5f39d1b3SJooyung Han         //  |v2.h[0]|v2.h[4]|      |v8.s[0] |v9.s[0] | ... |v14.s[0]|v15.s[0]|
1431*5f39d1b3SJooyung Han         //  |v2.h[1]|v2.h[5]|      |v8.s[1] |v9.s[1] | ... |v14.s[1]|v15.s[1]|
1432*5f39d1b3SJooyung Han         //  |v2.h[2]|v2.h[6]|      |v8.s[2] |v9.s[2] | ... |v14.s[2]|v15.s[2]|
1433*5f39d1b3SJooyung Han         //  |v2.h[3]|v2.h[7]|      |v8.s[3] |v9.s[3] | ... |v14.s[3]|v15.s[3]|
1434*5f39d1b3SJooyung Han         //  +-------+-------+ - -  +--------+--------+-----+--------+--------+
1435*5f39d1b3SJooyung Han         //  |v3.h[0]|v3.h[4]|      |v16.s[0]|v17.s[0]| ... |v22.s[0]|v23.s[0]|
1436*5f39d1b3SJooyung Han         //  |v3.h[1]|v3.h[5]|      |v16.s[1]|v17.s[1]| ... |v22.s[1]|v23.s[1]|
1437*5f39d1b3SJooyung Han         //  |v3.h[2]|v3.h[6]|      |v16.s[2]|v17.s[2]| ... |v22.s[2]|v23.s[2]|
1438*5f39d1b3SJooyung Han         //  |v3.h[3]|v3.h[7]|      |v16.s[3]|v17.s[3]| ... |v22.s[3]|v23.s[3]|
1439*5f39d1b3SJooyung Han         //  +-------+-------+ - -  +--------+--------+-----+--------+--------+
1440*5f39d1b3SJooyung Han         //  |v4.h[0]|v4.h[4]|      |v24.s[0]|v25.s[0]| ... |v30.s[0]|v31.s[0]|
1441*5f39d1b3SJooyung Han         //  |v4.h[1]|v4.h[5]|      |v24.s[1]|v25.s[1]| ... |v30.s[1]|v31.s[1]|
1442*5f39d1b3SJooyung Han         //  |v4.h[2]|v4.h[6]|      |v24.s[2]|v25.s[2]| ... |v30.s[2]|v31.s[2]|
1443*5f39d1b3SJooyung Han         //  |v4.h[3]|v4.h[7]|      |v24.s[3]|v25.s[3]| ... |v30.s[3]|v31.s[3]|
1444*5f39d1b3SJooyung Han         //  +-------+-------+ - -  +--------+--------+-----+--------+--------+
1445*5f39d1b3SJooyung Han         //
1446*5f39d1b3SJooyung Han         //                            Accumulator
1447*5f39d1b3SJooyung Han 
1448*5f39d1b3SJooyung Han         // Expand Lhs/Rhs cells to 16 bit.
1449*5f39d1b3SJooyung Han         "uxtl v0.8h, v5.8b\n"
1450*5f39d1b3SJooyung Han         "ld1 {v5.8b}, [%[rhs_ptr]], #8\n"
1451*5f39d1b3SJooyung Han         "uxtl v1.8h, v6.8b\n"
1452*5f39d1b3SJooyung Han         "ld1 {v6.8b}, [%[rhs_ptr]], #8\n"
1453*5f39d1b3SJooyung Han         "uxtl v2.8h, v2.8b\n"
1454*5f39d1b3SJooyung Han         "uxtl v3.8h, v3.8b\n"
1455*5f39d1b3SJooyung Han         "uxtl v4.8h, v4.8b\n"
1456*5f39d1b3SJooyung Han 
1457*5f39d1b3SJooyung Han         // Multiply-accumulate, top third
1458*5f39d1b3SJooyung Han         "umlal v8.4s, v2.4h, v0.h[0]\n"
1459*5f39d1b3SJooyung Han         "umlal v9.4s, v2.4h, v0.h[1]\n"
1460*5f39d1b3SJooyung Han         "umlal v10.4s, v2.4h, v0.h[2]\n"
1461*5f39d1b3SJooyung Han         "umlal v11.4s, v2.4h, v0.h[3]\n"
1462*5f39d1b3SJooyung Han         "umlal v12.4s, v2.4h, v1.h[0]\n"
1463*5f39d1b3SJooyung Han         "umlal v13.4s, v2.4h, v1.h[1]\n"
1464*5f39d1b3SJooyung Han         "umlal v14.4s, v2.4h, v1.h[2]\n"
1465*5f39d1b3SJooyung Han         "umlal v15.4s, v2.4h, v1.h[3]\n"
1466*5f39d1b3SJooyung Han         "umlal2 v8.4s, v2.8h, v0.h[4]\n"
1467*5f39d1b3SJooyung Han         "umlal2 v9.4s, v2.8h, v0.h[5]\n"
1468*5f39d1b3SJooyung Han         "umlal2 v10.4s, v2.8h, v0.h[6]\n"
1469*5f39d1b3SJooyung Han         "umlal2 v11.4s, v2.8h, v0.h[7]\n"
1470*5f39d1b3SJooyung Han         "umlal2 v12.4s, v2.8h, v1.h[4]\n"
1471*5f39d1b3SJooyung Han         "umlal2 v13.4s, v2.8h, v1.h[5]\n"
1472*5f39d1b3SJooyung Han         "umlal2 v14.4s, v2.8h, v1.h[6]\n"
1473*5f39d1b3SJooyung Han         "umlal2 v15.4s, v2.8h, v1.h[7]\n"
1474*5f39d1b3SJooyung Han         "ld1 {v2.8b}, [%[lhs_ptr]], #8\n"
1475*5f39d1b3SJooyung Han 
1476*5f39d1b3SJooyung Han         // Multiply-accumulate, middle third
1477*5f39d1b3SJooyung Han         "umlal v16.4s, v3.4h, v0.h[0]\n"
1478*5f39d1b3SJooyung Han         "umlal v17.4s, v3.4h, v0.h[1]\n"
1479*5f39d1b3SJooyung Han         "umlal v18.4s, v3.4h, v0.h[2]\n"
1480*5f39d1b3SJooyung Han         "umlal v19.4s, v3.4h, v0.h[3]\n"
1481*5f39d1b3SJooyung Han         "umlal v20.4s, v3.4h, v1.h[0]\n"
1482*5f39d1b3SJooyung Han         "umlal v21.4s, v3.4h, v1.h[1]\n"
1483*5f39d1b3SJooyung Han         "umlal v22.4s, v3.4h, v1.h[2]\n"
1484*5f39d1b3SJooyung Han         "umlal v23.4s, v3.4h, v1.h[3]\n"
1485*5f39d1b3SJooyung Han         "umlal2 v16.4s, v3.8h, v0.h[4]\n"
1486*5f39d1b3SJooyung Han         "umlal2 v17.4s, v3.8h, v0.h[5]\n"
1487*5f39d1b3SJooyung Han         "umlal2 v18.4s, v3.8h, v0.h[6]\n"
1488*5f39d1b3SJooyung Han         "umlal2 v19.4s, v3.8h, v0.h[7]\n"
1489*5f39d1b3SJooyung Han         "umlal2 v20.4s, v3.8h, v1.h[4]\n"
1490*5f39d1b3SJooyung Han         "umlal2 v21.4s, v3.8h, v1.h[5]\n"
1491*5f39d1b3SJooyung Han         "umlal2 v22.4s, v3.8h, v1.h[6]\n"
1492*5f39d1b3SJooyung Han         "umlal2 v23.4s, v3.8h, v1.h[7]\n"
1493*5f39d1b3SJooyung Han         "ld1 {v3.8b}, [%[lhs_ptr]], #8\n"
1494*5f39d1b3SJooyung Han 
1495*5f39d1b3SJooyung Han         "subs %[run_depth], %[run_depth], #2\n"
1496*5f39d1b3SJooyung Han 
1497*5f39d1b3SJooyung Han         // Multiply-accumulate, bottom third
1498*5f39d1b3SJooyung Han         "umlal v24.4s, v4.4h, v0.h[0]\n"
1499*5f39d1b3SJooyung Han         "umlal v25.4s, v4.4h, v0.h[1]\n"
1500*5f39d1b3SJooyung Han         "umlal v26.4s, v4.4h, v0.h[2]\n"
1501*5f39d1b3SJooyung Han         "umlal v27.4s, v4.4h, v0.h[3]\n"
1502*5f39d1b3SJooyung Han         "umlal v28.4s, v4.4h, v1.h[0]\n"
1503*5f39d1b3SJooyung Han         "umlal v29.4s, v4.4h, v1.h[1]\n"
1504*5f39d1b3SJooyung Han         "umlal v30.4s, v4.4h, v1.h[2]\n"
1505*5f39d1b3SJooyung Han         "umlal v31.4s, v4.4h, v1.h[3]\n"
1506*5f39d1b3SJooyung Han         "umlal2 v24.4s, v4.8h, v0.h[4]\n"
1507*5f39d1b3SJooyung Han         "umlal2 v25.4s, v4.8h, v0.h[5]\n"
1508*5f39d1b3SJooyung Han         "umlal2 v26.4s, v4.8h, v0.h[6]\n"
1509*5f39d1b3SJooyung Han         "umlal2 v27.4s, v4.8h, v0.h[7]\n"
1510*5f39d1b3SJooyung Han         "umlal2 v28.4s, v4.8h, v1.h[4]\n"
1511*5f39d1b3SJooyung Han         "umlal2 v29.4s, v4.8h, v1.h[5]\n"
1512*5f39d1b3SJooyung Han         "umlal2 v30.4s, v4.8h, v1.h[6]\n"
1513*5f39d1b3SJooyung Han         "umlal2 v31.4s, v4.8h, v1.h[7]\n"
1514*5f39d1b3SJooyung Han         "ld1 {v4.8b}, [%[lhs_ptr]], #8\n"
1515*5f39d1b3SJooyung Han 
1516*5f39d1b3SJooyung Han         "bne " GEMMLOWP_LABEL_LOOP "b\n"
1517*5f39d1b3SJooyung Han 
1518*5f39d1b3SJooyung Han         GEMMLOWP_LABEL_AFTER_LOOP
1519*5f39d1b3SJooyung Han         ":\n"
1520*5f39d1b3SJooyung Han 
1521*5f39d1b3SJooyung Han         // Expand Lhs/Rhs cells to 16 bit.
1522*5f39d1b3SJooyung Han         "uxtl v0.8h, v5.8b\n"
1523*5f39d1b3SJooyung Han         "uxtl v1.8h, v6.8b\n"
1524*5f39d1b3SJooyung Han         "uxtl v2.8h, v2.8b\n"
1525*5f39d1b3SJooyung Han         "uxtl v3.8h, v3.8b\n"
1526*5f39d1b3SJooyung Han         "uxtl v4.8h, v4.8b\n"
1527*5f39d1b3SJooyung Han 
1528*5f39d1b3SJooyung Han         // Multiply-accumulate, level of depth 0
1529*5f39d1b3SJooyung Han         "umlal v8.4s, v2.4h, v0.h[0]\n"
1530*5f39d1b3SJooyung Han         "umlal v9.4s, v2.4h, v0.h[1]\n"
1531*5f39d1b3SJooyung Han         "umlal v10.4s, v2.4h, v0.h[2]\n"
1532*5f39d1b3SJooyung Han         "umlal v11.4s, v2.4h, v0.h[3]\n"
1533*5f39d1b3SJooyung Han         "umlal v12.4s, v2.4h, v1.h[0]\n"
1534*5f39d1b3SJooyung Han         "umlal v13.4s, v2.4h, v1.h[1]\n"
1535*5f39d1b3SJooyung Han         "umlal v14.4s, v2.4h, v1.h[2]\n"
1536*5f39d1b3SJooyung Han         "umlal v15.4s, v2.4h, v1.h[3]\n"
1537*5f39d1b3SJooyung Han         "umlal v16.4s, v3.4h, v0.h[0]\n"
1538*5f39d1b3SJooyung Han         "umlal v17.4s, v3.4h, v0.h[1]\n"
1539*5f39d1b3SJooyung Han         "umlal v18.4s, v3.4h, v0.h[2]\n"
1540*5f39d1b3SJooyung Han         "umlal v19.4s, v3.4h, v0.h[3]\n"
1541*5f39d1b3SJooyung Han         "umlal v20.4s, v3.4h, v1.h[0]\n"
1542*5f39d1b3SJooyung Han         "umlal v21.4s, v3.4h, v1.h[1]\n"
1543*5f39d1b3SJooyung Han         "umlal v22.4s, v3.4h, v1.h[2]\n"
1544*5f39d1b3SJooyung Han         "umlal v23.4s, v3.4h, v1.h[3]\n"
1545*5f39d1b3SJooyung Han         "umlal v24.4s, v4.4h, v0.h[0]\n"
1546*5f39d1b3SJooyung Han         "umlal v25.4s, v4.4h, v0.h[1]\n"
1547*5f39d1b3SJooyung Han         "umlal v26.4s, v4.4h, v0.h[2]\n"
1548*5f39d1b3SJooyung Han         "umlal v27.4s, v4.4h, v0.h[3]\n"
1549*5f39d1b3SJooyung Han         "umlal v28.4s, v4.4h, v1.h[0]\n"
1550*5f39d1b3SJooyung Han         "umlal v29.4s, v4.4h, v1.h[1]\n"
1551*5f39d1b3SJooyung Han         "umlal v30.4s, v4.4h, v1.h[2]\n"
1552*5f39d1b3SJooyung Han         "umlal v31.4s, v4.4h, v1.h[3]\n"
1553*5f39d1b3SJooyung Han 
1554*5f39d1b3SJooyung Han         // Multiply-accumulate, level of depth 1
1555*5f39d1b3SJooyung Han         "umlal2 v8.4s, v2.8h, v0.h[4]\n"
1556*5f39d1b3SJooyung Han         "umlal2 v9.4s, v2.8h, v0.h[5]\n"
1557*5f39d1b3SJooyung Han         "umlal2 v10.4s, v2.8h, v0.h[6]\n"
1558*5f39d1b3SJooyung Han         "umlal2 v11.4s, v2.8h, v0.h[7]\n"
1559*5f39d1b3SJooyung Han         "umlal2 v12.4s, v2.8h, v1.h[4]\n"
1560*5f39d1b3SJooyung Han         "umlal2 v13.4s, v2.8h, v1.h[5]\n"
1561*5f39d1b3SJooyung Han         "umlal2 v14.4s, v2.8h, v1.h[6]\n"
1562*5f39d1b3SJooyung Han         "umlal2 v15.4s, v2.8h, v1.h[7]\n"
1563*5f39d1b3SJooyung Han         "umlal2 v16.4s, v3.8h, v0.h[4]\n"
1564*5f39d1b3SJooyung Han         "umlal2 v17.4s, v3.8h, v0.h[5]\n"
1565*5f39d1b3SJooyung Han         "umlal2 v18.4s, v3.8h, v0.h[6]\n"
1566*5f39d1b3SJooyung Han         "umlal2 v19.4s, v3.8h, v0.h[7]\n"
1567*5f39d1b3SJooyung Han         "umlal2 v20.4s, v3.8h, v1.h[4]\n"
1568*5f39d1b3SJooyung Han         "umlal2 v21.4s, v3.8h, v1.h[5]\n"
1569*5f39d1b3SJooyung Han         "umlal2 v22.4s, v3.8h, v1.h[6]\n"
1570*5f39d1b3SJooyung Han         "umlal2 v23.4s, v3.8h, v1.h[7]\n"
1571*5f39d1b3SJooyung Han         "umlal2 v24.4s, v4.8h, v0.h[4]\n"
1572*5f39d1b3SJooyung Han         "umlal2 v25.4s, v4.8h, v0.h[5]\n"
1573*5f39d1b3SJooyung Han         "umlal2 v26.4s, v4.8h, v0.h[6]\n"
1574*5f39d1b3SJooyung Han         "umlal2 v27.4s, v4.8h, v0.h[7]\n"
1575*5f39d1b3SJooyung Han         "umlal2 v28.4s, v4.8h, v1.h[4]\n"
1576*5f39d1b3SJooyung Han         "umlal2 v29.4s, v4.8h, v1.h[5]\n"
1577*5f39d1b3SJooyung Han         "umlal2 v30.4s, v4.8h, v1.h[6]\n"
1578*5f39d1b3SJooyung Han         "umlal2 v31.4s, v4.8h, v1.h[7]\n"
1579*5f39d1b3SJooyung Han 
1580*5f39d1b3SJooyung Han         // Store accumulators
1581*5f39d1b3SJooyung Han         "mov x1, %[dst_ptr]\n"
1582*5f39d1b3SJooyung Han         "mov x0, x1\n"
1583*5f39d1b3SJooyung Han         "st1 {v8.16b}, [x0], #16\n"
1584*5f39d1b3SJooyung Han         "subs %[run_depth], %[run_depth], #2\n"
1585*5f39d1b3SJooyung Han         "st1 {v16.16b}, [x0], #16\n"
1586*5f39d1b3SJooyung Han         "add x1, x1, %[dst_col_stride]\n"
1587*5f39d1b3SJooyung Han         "st1 {v24.16b}, [x0]\n"
1588*5f39d1b3SJooyung Han         "mov x0, x1\n"
1589*5f39d1b3SJooyung Han         "st1 {v9.16b}, [x0], #16\n"
1590*5f39d1b3SJooyung Han         "add x1, x1, %[dst_col_stride]\n"
1591*5f39d1b3SJooyung Han         "st1 {v17.16b}, [x0], #16\n"
1592*5f39d1b3SJooyung Han         "st1 {v25.16b}, [x0]\n"
1593*5f39d1b3SJooyung Han         "mov x0, x1\n"
1594*5f39d1b3SJooyung Han         "st1 {v10.16b}, [x0], #16\n"
1595*5f39d1b3SJooyung Han         "add x1, x1, %[dst_col_stride]\n"
1596*5f39d1b3SJooyung Han         "st1 {v18.16b}, [x0], #16\n"
1597*5f39d1b3SJooyung Han         "st1 {v26.16b}, [x0]\n"
1598*5f39d1b3SJooyung Han         "mov x0, x1\n"
1599*5f39d1b3SJooyung Han         "st1 {v11.16b}, [x0], #16\n"
1600*5f39d1b3SJooyung Han         "add x1, x1, %[dst_col_stride]\n"
1601*5f39d1b3SJooyung Han         "st1 {v19.16b}, [x0], #16\n"
1602*5f39d1b3SJooyung Han         "st1 {v27.16b}, [x0]\n"
1603*5f39d1b3SJooyung Han         "mov x0, x1\n"
1604*5f39d1b3SJooyung Han         "st1 {v12.16b}, [x0], #16\n"
1605*5f39d1b3SJooyung Han         "add x1, x1, %[dst_col_stride]\n"
1606*5f39d1b3SJooyung Han         "st1 {v20.16b}, [x0], #16\n"
1607*5f39d1b3SJooyung Han         "st1 {v28.16b}, [x0]\n"
1608*5f39d1b3SJooyung Han         "mov x0, x1\n"
1609*5f39d1b3SJooyung Han         "st1 {v13.16b}, [x0], #16\n"
1610*5f39d1b3SJooyung Han         "add x1, x1, %[dst_col_stride]\n"
1611*5f39d1b3SJooyung Han         "st1 {v21.16b}, [x0], #16\n"
1612*5f39d1b3SJooyung Han         "st1 {v29.16b}, [x0]\n"
1613*5f39d1b3SJooyung Han         "mov x0, x1\n"
1614*5f39d1b3SJooyung Han         "st1 {v14.16b}, [x0], #16\n"
1615*5f39d1b3SJooyung Han         "add x1, x1, %[dst_col_stride]\n"
1616*5f39d1b3SJooyung Han         "st1 {v22.16b}, [x0], #16\n"
1617*5f39d1b3SJooyung Han         "st1 {v30.16b}, [x0]\n"
1618*5f39d1b3SJooyung Han         "mov x0, x1\n"
1619*5f39d1b3SJooyung Han         "st1 {v15.16b}, [x0], #16\n"
1620*5f39d1b3SJooyung Han         "st1 {v23.16b}, [x0], #16\n"
1621*5f39d1b3SJooyung Han         "st1 {v31.16b}, [x0]\n"
1622*5f39d1b3SJooyung Han #undef GEMMLOWP_LABEL_CLEAR_ACCUMULATORS
1623*5f39d1b3SJooyung Han #undef GEMMLOWP_LABEL_BEFORE_LOOP
1624*5f39d1b3SJooyung Han #undef GEMMLOWP_LABEL_LOOP
1625*5f39d1b3SJooyung Han #undef GEMMLOWP_LABEL_AFTER_LOOP
1626*5f39d1b3SJooyung Han         :  // outputs
1627*5f39d1b3SJooyung Han         [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
1628*5f39d1b3SJooyung Han         [dst_ptr] "+r"(dst_ptr),
1629*5f39d1b3SJooyung Han         [run_depth] "+r"(run_depth)
1630*5f39d1b3SJooyung Han         :  // inputs
1631*5f39d1b3SJooyung Han         [start_depth] "r"(start_depth),
1632*5f39d1b3SJooyung Han         [dst_col_stride] "r"(dst_col_stride)
1633*5f39d1b3SJooyung Han         :  // clobbers
1634*5f39d1b3SJooyung Han         "cc", "memory", "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
1635*5f39d1b3SJooyung Han         "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16",
1636*5f39d1b3SJooyung Han         "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
1637*5f39d1b3SJooyung Han         "v27", "v28", "v29", "v30", "v31");
1638*5f39d1b3SJooyung Han   }
1639*5f39d1b3SJooyung Han };
1640*5f39d1b3SJooyung Han 
1641*5f39d1b3SJooyung Han #ifdef GEMMLOWP_DOTPROD_KERNEL
1642*5f39d1b3SJooyung Han #ifndef __ARM_FEATURE_DOTPROD
1643*5f39d1b3SJooyung Han #error This kernel requires ARM dot-product instructions. Enable them by \
1644*5f39d1b3SJooyung Han   adding '+dotprod' to a compiler flag, e.g. -march=armv8.2-a+dotprod . \
1645*5f39d1b3SJooyung Han   Note that Clang up to version 7 fails to define the corresponding \
1646*5f39d1b3SJooyung Han   preprocessor token __ARM_FEATURE_DOTPROD, so you will still have to define \
1647*5f39d1b3SJooyung Han   it manually.
1648*5f39d1b3SJooyung Han #endif
1649*5f39d1b3SJooyung Han // Kernels utilizing the Armv8.2 Dot Product extension.
1650*5f39d1b3SJooyung Han //
1651*5f39d1b3SJooyung Han // The dot product instructions work by taking 4 consecutive 8-bit depth
1652*5f39d1b3SJooyung Han // values from each operand, multiplying the 4 pairs together and
1653*5f39d1b3SJooyung Han // accumulating all the results into the corresponding 32-bit accumulator
1654*5f39d1b3SJooyung Han // lane.  As such, the operation is identical to a 32-bit instruction (like
1655*5f39d1b3SJooyung Han // FMLA used in SGEMM), except that 4 depth values are processed at a time
1656*5f39d1b3SJooyung Han // instead of 1.
1657*5f39d1b3SJooyung Han 
1658*5f39d1b3SJooyung Han // Thus, this first kernel is a carbon copy of
1659*5f39d1b3SJooyung Han // "NEON_64bit_GEMM_Float32_WithScalar_A57" (which should provide good
1660*5f39d1b3SJooyung Han // performance for most processors) below with the opcode (fmla -> udot) and
1661*5f39d1b3SJooyung Han // types (float32 -> uint8/uint32) changed.
1662*5f39d1b3SJooyung Han //
1663*5f39d1b3SJooyung Han // A signed version of this kernel could be produced by replacing "udot"
1664*5f39d1b3SJooyung Han // with "sdot" - performance should be identical to this udot kernel.
1665*5f39d1b3SJooyung Han struct NEON_64_Kernel12x8Depth4_dotprod : KernelBase {
1666*5f39d1b3SJooyung Han   typedef KernelFormat<KernelSideFormat<CellFormat<4, 4, CellOrder::WidthMajor>, 3>,
1667*5f39d1b3SJooyung Han                        KernelSideFormat<CellFormat<4, 4, CellOrder::WidthMajor>, 2> >
1668*5f39d1b3SJooyung Han       Format;
1669*5f39d1b3SJooyung Han 
NameNEON_64_Kernel12x8Depth4_dotprod1670*5f39d1b3SJooyung Han   const char* Name() const override { return "NEON, 12x8, depth 4, dotprod"; }
1671*5f39d1b3SJooyung Han 
RunNEON_64_Kernel12x8Depth4_dotprod1672*5f39d1b3SJooyung Han   void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride, std::size_t dst_col_stride,
1673*5f39d1b3SJooyung Han            const std::uint8_t* lhs_ptr, const std::uint8_t* rhs_ptr, std::size_t start_depth,
1674*5f39d1b3SJooyung Han            std::size_t depth) const override {
1675*5f39d1b3SJooyung Han     (void)dst_row_stride;
1676*5f39d1b3SJooyung Han     ScopedProfilingLabel label("optimized kernel (NEON 12x8, depth 4, dotprod)");
1677*5f39d1b3SJooyung Han // See comments above for why we need local numerical labels in our asm.
1678*5f39d1b3SJooyung Han #define GEMMLOWP_LABEL_CLEAR_ACCUMULATORS "1"
1679*5f39d1b3SJooyung Han #define GEMMLOWP_LABEL_BEFORE_LOOP "2"
1680*5f39d1b3SJooyung Han #define GEMMLOWP_LABEL_LOOP "3"
1681*5f39d1b3SJooyung Han #define GEMMLOWP_LABEL_AFTER_LOOP "4"
1682*5f39d1b3SJooyung Han 
1683*5f39d1b3SJooyung Han     assert(dst_row_stride == 1);
1684*5f39d1b3SJooyung Han     asm volatile(
1685*5f39d1b3SJooyung Han         // Multiply dst_col_stride by 4 == sizeof(int32) to use
1686*5f39d1b3SJooyung Han         // it as a byte offset below.
1687*5f39d1b3SJooyung Han         "lsl %[dst_col_stride], %[dst_col_stride], #2\n"
1688*5f39d1b3SJooyung Han 
1689*5f39d1b3SJooyung Han         "cmp %[start_depth], #0\n"
1690*5f39d1b3SJooyung Han         "beq " GEMMLOWP_LABEL_CLEAR_ACCUMULATORS "f\n"
1691*5f39d1b3SJooyung Han 
1692*5f39d1b3SJooyung Han         // Load accumulators
1693*5f39d1b3SJooyung Han         "mov x1, %[dst_ptr]\n"
1694*5f39d1b3SJooyung Han         "mov x0, x1\n"
1695*5f39d1b3SJooyung Han         "ld1 {v8.16b}, [x0], #16\n"
1696*5f39d1b3SJooyung Han         "ld1 {v16.16b}, [x0], #16\n"
1697*5f39d1b3SJooyung Han         "add x1, x1, %[dst_col_stride]\n"
1698*5f39d1b3SJooyung Han         "ld1 {v24.16b}, [x0]\n"
1699*5f39d1b3SJooyung Han         "mov x0, x1\n"
1700*5f39d1b3SJooyung Han         "ld1 {v9.16b}, [x0], #16\n"
1701*5f39d1b3SJooyung Han         "add x1, x1, %[dst_col_stride]\n"
1702*5f39d1b3SJooyung Han         "ld1 {v17.16b}, [x0], #16\n"
1703*5f39d1b3SJooyung Han         "ld1 {v25.16b}, [x0]\n"
1704*5f39d1b3SJooyung Han         "mov x0, x1\n"
1705*5f39d1b3SJooyung Han         "ld1 {v10.16b}, [x0], #16\n"
1706*5f39d1b3SJooyung Han         "add x1, x1, %[dst_col_stride]\n"
1707*5f39d1b3SJooyung Han         "ld1 {v18.16b}, [x0], #16\n"
1708*5f39d1b3SJooyung Han         "ld1 {v26.16b}, [x0]\n"
1709*5f39d1b3SJooyung Han         "mov x0, x1\n"
1710*5f39d1b3SJooyung Han         "ld1 {v11.16b}, [x0], #16\n"
1711*5f39d1b3SJooyung Han         "add x1, x1, %[dst_col_stride]\n"
1712*5f39d1b3SJooyung Han         "ld1 {v19.16b}, [x0], #16\n"
1713*5f39d1b3SJooyung Han         "ld1 {v27.16b}, [x0]\n"
1714*5f39d1b3SJooyung Han         "mov x0, x1\n"
1715*5f39d1b3SJooyung Han         "ld1 {v12.16b}, [x0], #16\n"
1716*5f39d1b3SJooyung Han         "add x1, x1, %[dst_col_stride]\n"
1717*5f39d1b3SJooyung Han         "ld1 {v20.16b}, [x0], #16\n"
1718*5f39d1b3SJooyung Han         "ld1 {v28.16b}, [x0]\n"
1719*5f39d1b3SJooyung Han         "mov x0, x1\n"
1720*5f39d1b3SJooyung Han         "ld1 {v13.16b}, [x0], #16\n"
1721*5f39d1b3SJooyung Han         "add x1, x1, %[dst_col_stride]\n"
1722*5f39d1b3SJooyung Han         "ld1 {v21.16b}, [x0], #16\n"
1723*5f39d1b3SJooyung Han         "ld1 {v29.16b}, [x0]\n"
1724*5f39d1b3SJooyung Han         "mov x0, x1\n"
1725*5f39d1b3SJooyung Han         "ld1 {v14.16b}, [x0], #16\n"
1726*5f39d1b3SJooyung Han         "add x1, x1, %[dst_col_stride]\n"
1727*5f39d1b3SJooyung Han         "ld1 {v22.16b}, [x0], #16\n"
1728*5f39d1b3SJooyung Han         "ld1 {v30.16b}, [x0]\n"
1729*5f39d1b3SJooyung Han         "mov x0, x1\n"
1730*5f39d1b3SJooyung Han         "ld1 {v15.16b}, [x0], #16\n"
1731*5f39d1b3SJooyung Han         "ld1 {v23.16b}, [x0], #16\n"
1732*5f39d1b3SJooyung Han         "ld1 {v31.16b}, [x0]\n"
1733*5f39d1b3SJooyung Han 
1734*5f39d1b3SJooyung Han         "b " GEMMLOWP_LABEL_BEFORE_LOOP "f\n"
1735*5f39d1b3SJooyung Han 
1736*5f39d1b3SJooyung Han         GEMMLOWP_LABEL_CLEAR_ACCUMULATORS ":\n"
1737*5f39d1b3SJooyung Han 
1738*5f39d1b3SJooyung Han         // Clear accumulator registers (see layout below)
1739*5f39d1b3SJooyung Han         "dup v8.4s, wzr\n"
1740*5f39d1b3SJooyung Han         "dup v9.4s, wzr\n"
1741*5f39d1b3SJooyung Han         "dup v10.4s, wzr\n"
1742*5f39d1b3SJooyung Han         "dup v11.4s, wzr\n"
1743*5f39d1b3SJooyung Han         "dup v12.4s, wzr\n"
1744*5f39d1b3SJooyung Han         "dup v13.4s, wzr\n"
1745*5f39d1b3SJooyung Han         "dup v14.4s, wzr\n"
1746*5f39d1b3SJooyung Han         "dup v15.4s, wzr\n"
1747*5f39d1b3SJooyung Han         "dup v16.4s, wzr\n"
1748*5f39d1b3SJooyung Han         "dup v17.4s, wzr\n"
1749*5f39d1b3SJooyung Han         "dup v18.4s, wzr\n"
1750*5f39d1b3SJooyung Han         "dup v19.4s, wzr\n"
1751*5f39d1b3SJooyung Han         "dup v20.4s, wzr\n"
1752*5f39d1b3SJooyung Han         "dup v21.4s, wzr\n"
1753*5f39d1b3SJooyung Han         "dup v22.4s, wzr\n"
1754*5f39d1b3SJooyung Han         "dup v23.4s, wzr\n"
1755*5f39d1b3SJooyung Han         "dup v24.4s, wzr\n"
1756*5f39d1b3SJooyung Han         "dup v25.4s, wzr\n"
1757*5f39d1b3SJooyung Han         "dup v26.4s, wzr\n"
1758*5f39d1b3SJooyung Han         "dup v27.4s, wzr\n"
1759*5f39d1b3SJooyung Han         "dup v28.4s, wzr\n"
1760*5f39d1b3SJooyung Han         "dup v29.4s, wzr\n"
1761*5f39d1b3SJooyung Han         "dup v30.4s, wzr\n"
1762*5f39d1b3SJooyung Han         "dup v31.4s, wzr\n"
1763*5f39d1b3SJooyung Han 
1764*5f39d1b3SJooyung Han         GEMMLOWP_LABEL_BEFORE_LOOP ":\n"
1765*5f39d1b3SJooyung Han 
1766*5f39d1b3SJooyung Han         "subs %w[depth], %w[depth], #4\n"
1767*5f39d1b3SJooyung Han 
1768*5f39d1b3SJooyung Han         // The start of the loop assumes first Rhs cell is already loaded, so
1769*5f39d1b3SJooyung Han         // do it here for first iteration.
1770*5f39d1b3SJooyung Han         "ld1 {v0.16b}, [%[rhs_ptr]], #16\n"
1771*5f39d1b3SJooyung Han 
1772*5f39d1b3SJooyung Han         // And the same for the first Lhs cell.
1773*5f39d1b3SJooyung Han         "ld1 {v2.16b}, [%[lhs_ptr]], #16\n"
1774*5f39d1b3SJooyung Han 
1775*5f39d1b3SJooyung Han         "beq " GEMMLOWP_LABEL_AFTER_LOOP "f\n"
1776*5f39d1b3SJooyung Han 
1777*5f39d1b3SJooyung Han         GEMMLOWP_LABEL_LOOP ":\n"
1778*5f39d1b3SJooyung Han 
1779*5f39d1b3SJooyung Han         // Start the MACs at the head of the loop - 1st cell from each side
1780*5f39d1b3SJooyung Han         // already loaded.
1781*5f39d1b3SJooyung Han         ".word 0x6f80e048  // udot v8.4s, v2.16b, v0.4b[0]\n"
1782*5f39d1b3SJooyung Han         ".word 0x6fa0e049  // udot v9.4s, v2.16b, v0.4b[1]\n"
1783*5f39d1b3SJooyung Han         "ld1 {v1.16b}, [%[rhs_ptr]], #16\n"  // Load second Rhs cell.
1784*5f39d1b3SJooyung Han         ".word 0x6f80e84a  // udot v10.4s, v2.16b, v0.4b[2]\n"
1785*5f39d1b3SJooyung Han         ".word 0x6fa0e84b  // udot v11.4s, v2.16b, v0.4b[3]\n"
1786*5f39d1b3SJooyung Han         "ld1 {v3.16b}, [%[lhs_ptr]], #16\n"  // Load second Lhs cell.
1787*5f39d1b3SJooyung Han         ".word 0x6f81e04c  // udot v12.4s, v2.16b, v1.4b[0]\n"
1788*5f39d1b3SJooyung Han         ".word 0x6fa1e04d  // udot v13.4s, v2.16b, v1.4b[1]\n"
1789*5f39d1b3SJooyung Han         "ld1 {v4.16b}, [%[lhs_ptr]], #16\n"  // Load third Lhs cell.
1790*5f39d1b3SJooyung Han         ".word 0x6f81e84e  // udot v14.4s, v2.16b, v1.4b[2]\n"
1791*5f39d1b3SJooyung Han         ".word 0x6fa1e84f  // udot v15.4s, v2.16b, v1.4b[3]\n"
1792*5f39d1b3SJooyung Han         "ld1 {v2.16b}, [%[lhs_ptr]], #16\n"  // Done with first Lhs cell - load
1793*5f39d1b3SJooyung Han         // for the next iteration early.
1794*5f39d1b3SJooyung Han         ".word 0x6f80e070  // udot v16.4s, v3.16b, v0.4b[0]\n"
1795*5f39d1b3SJooyung Han         ".word 0x6fa0e071  // udot v17.4s, v3.16b, v0.4b[1]\n"
1796*5f39d1b3SJooyung Han         ".word 0x6f80e872  // udot v18.4s, v3.16b, v0.4b[2]\n"
1797*5f39d1b3SJooyung Han         ".word 0x6fa0e873  // udot v19.4s, v3.16b, v0.4b[3]\n"
1798*5f39d1b3SJooyung Han         ".word 0x6f81e074  // udot v20.4s, v3.16b, v1.4b[0]\n"
1799*5f39d1b3SJooyung Han         ".word 0x6fa1e075  // udot v21.4s, v3.16b, v1.4b[1]\n"
1800*5f39d1b3SJooyung Han         ".word 0x6f81e876  // udot v22.4s, v3.16b, v1.4b[2]\n"
1801*5f39d1b3SJooyung Han         ".word 0x6fa1e877  // udot v23.4s, v3.16b, v1.4b[3]\n"
1802*5f39d1b3SJooyung Han         ".word 0x6f80e098  // udot v24.4s, v4.16b, v0.4b[0]\n"
1803*5f39d1b3SJooyung Han         ".word 0x6fa0e099  // udot v25.4s, v4.16b, v0.4b[1]\n"
1804*5f39d1b3SJooyung Han         ".word 0x6f80e89a  // udot v26.4s, v4.16b, v0.4b[2]\n"
1805*5f39d1b3SJooyung Han         ".word 0x6fa0e89b  // udot v27.4s, v4.16b, v0.4b[3]\n"
1806*5f39d1b3SJooyung Han         "ld1 {v0.16b}, [%[rhs_ptr]], #16\n"  // Done with the first Rhs cell -
1807*5f39d1b3SJooyung Han         // load for the next iteration early.
1808*5f39d1b3SJooyung Han         ".word 0x6f81e09c  // udot v28.4s, v4.16b, v1.4b[0]\n"
1809*5f39d1b3SJooyung Han         ".word 0x6fa1e09d  // udot v29.4s, v4.16b, v1.4b[1]\n"
1810*5f39d1b3SJooyung Han 
1811*5f39d1b3SJooyung Han         // Loop.  Decrement loop index (depth) by 4 as udot processes 4
1812*5f39d1b3SJooyung Han         // depth values.
1813*5f39d1b3SJooyung Han         "subs %w[depth], %w[depth], #4\n"
1814*5f39d1b3SJooyung Han         ".word 0x6f81e89e  // udot v30.4s, v4.16b, v1.4b[2]\n"
1815*5f39d1b3SJooyung Han         ".word 0x6fa1e89f  // udot v31.4s, v4.16b, v1.4b[3]\n"
1816*5f39d1b3SJooyung Han 
1817*5f39d1b3SJooyung Han         "bne " GEMMLOWP_LABEL_LOOP "b\n"
1818*5f39d1b3SJooyung Han 
1819*5f39d1b3SJooyung Han         GEMMLOWP_LABEL_AFTER_LOOP ":\n"
1820*5f39d1b3SJooyung Han 
1821*5f39d1b3SJooyung Han         // Final iteration. v0 and v2 were already loaded, don't load
1822*5f39d1b3SJooyung Han         // them again, don't read past the end of buffers.
1823*5f39d1b3SJooyung Han         ".word 0x6f80e048  // udot v8.4s, v2.16b, v0.4b[0]\n"
1824*5f39d1b3SJooyung Han         ".word 0x6fa0e049  // udot v9.4s, v2.16b, v0.4b[1]\n"
1825*5f39d1b3SJooyung Han         "ld1 {v1.16b}, [%[rhs_ptr]], #16\n"  // Load second Rhs cell.
1826*5f39d1b3SJooyung Han         ".word 0x6f80e84a  // udot v10.4s, v2.16b, v0.4b[2]\n"
1827*5f39d1b3SJooyung Han         ".word 0x6fa0e84b  // udot v11.4s, v2.16b, v0.4b[3]\n"
1828*5f39d1b3SJooyung Han         "ld1 {v3.16b}, [%[lhs_ptr]], #16\n"  // Load second Lhs cell.
1829*5f39d1b3SJooyung Han         ".word 0x6f81e04c  // udot v12.4s, v2.16b, v1.4b[0]\n"
1830*5f39d1b3SJooyung Han         ".word 0x6fa1e04d  // udot v13.4s, v2.16b, v1.4b[1]\n"
1831*5f39d1b3SJooyung Han         "ld1 {v4.16b}, [%[lhs_ptr]], #16\n"  // Load third Lhs cell.
1832*5f39d1b3SJooyung Han         ".word 0x6f81e84e  // udot v14.4s, v2.16b, v1.4b[2]\n"
1833*5f39d1b3SJooyung Han         ".word 0x6fa1e84f  // udot v15.4s, v2.16b, v1.4b[3]\n"
1834*5f39d1b3SJooyung Han         ".word 0x6f80e070  // udot v16.4s, v3.16b, v0.4b[0]\n"
1835*5f39d1b3SJooyung Han         ".word 0x6fa0e071  // udot v17.4s, v3.16b, v0.4b[1]\n"
1836*5f39d1b3SJooyung Han         ".word 0x6f80e872  // udot v18.4s, v3.16b, v0.4b[2]\n"
1837*5f39d1b3SJooyung Han         ".word 0x6fa0e873  // udot v19.4s, v3.16b, v0.4b[3]\n"
1838*5f39d1b3SJooyung Han         ".word 0x6f81e074  // udot v20.4s, v3.16b, v1.4b[0]\n"
1839*5f39d1b3SJooyung Han         ".word 0x6fa1e075  // udot v21.4s, v3.16b, v1.4b[1]\n"
1840*5f39d1b3SJooyung Han         ".word 0x6f81e876  // udot v22.4s, v3.16b, v1.4b[2]\n"
1841*5f39d1b3SJooyung Han         ".word 0x6fa1e877  // udot v23.4s, v3.16b, v1.4b[3]\n"
1842*5f39d1b3SJooyung Han         ".word 0x6f80e098  // udot v24.4s, v4.16b, v0.4b[0]\n"
1843*5f39d1b3SJooyung Han         ".word 0x6fa0e099  // udot v25.4s, v4.16b, v0.4b[1]\n"
1844*5f39d1b3SJooyung Han         ".word 0x6f80e89a  // udot v26.4s, v4.16b, v0.4b[2]\n"
1845*5f39d1b3SJooyung Han         ".word 0x6fa0e89b  // udot v27.4s, v4.16b, v0.4b[3]\n"
1846*5f39d1b3SJooyung Han         ".word 0x6f81e09c  // udot v28.4s, v4.16b, v1.4b[0]\n"
1847*5f39d1b3SJooyung Han         ".word 0x6fa1e09d  // udot v29.4s, v4.16b, v1.4b[1]\n"
1848*5f39d1b3SJooyung Han 
1849*5f39d1b3SJooyung Han         // Loop.  Decrement loop index (depth) by 4 as udot processes 4
1850*5f39d1b3SJooyung Han         // depth values.
1851*5f39d1b3SJooyung Han         "subs %w[depth], %w[depth], #4\n"
1852*5f39d1b3SJooyung Han         ".word 0x6f81e89e  // udot v30.4s, v4.16b, v1.4b[2]\n"
1853*5f39d1b3SJooyung Han         ".word 0x6fa1e89f  // udot v31.4s, v4.16b, v1.4b[3]\n"
1854*5f39d1b3SJooyung Han 
1855*5f39d1b3SJooyung Han         // Store accumulators
1856*5f39d1b3SJooyung Han         "mov x1, %[dst_ptr]\n"
1857*5f39d1b3SJooyung Han         "mov x0, x1\n"
1858*5f39d1b3SJooyung Han         "st1 {v8.16b}, [x0], #16\n"
1859*5f39d1b3SJooyung Han         "st1 {v16.16b}, [x0], #16\n"
1860*5f39d1b3SJooyung Han         "add x1, x1, %[dst_col_stride]\n"
1861*5f39d1b3SJooyung Han         "st1 {v24.16b}, [x0]\n"
1862*5f39d1b3SJooyung Han         "mov x0, x1\n"
1863*5f39d1b3SJooyung Han         "st1 {v9.16b}, [x0], #16\n"
1864*5f39d1b3SJooyung Han         "add x1, x1, %[dst_col_stride]\n"
1865*5f39d1b3SJooyung Han         "st1 {v17.16b}, [x0], #16\n"
1866*5f39d1b3SJooyung Han         "st1 {v25.16b}, [x0]\n"
1867*5f39d1b3SJooyung Han         "mov x0, x1\n"
1868*5f39d1b3SJooyung Han         "st1 {v10.16b}, [x0], #16\n"
1869*5f39d1b3SJooyung Han         "add x1, x1, %[dst_col_stride]\n"
1870*5f39d1b3SJooyung Han         "st1 {v18.16b}, [x0], #16\n"
1871*5f39d1b3SJooyung Han         "st1 {v26.16b}, [x0]\n"
1872*5f39d1b3SJooyung Han         "mov x0, x1\n"
1873*5f39d1b3SJooyung Han         "st1 {v11.16b}, [x0], #16\n"
1874*5f39d1b3SJooyung Han         "add x1, x1, %[dst_col_stride]\n"
1875*5f39d1b3SJooyung Han         "st1 {v19.16b}, [x0], #16\n"
1876*5f39d1b3SJooyung Han         "st1 {v27.16b}, [x0]\n"
1877*5f39d1b3SJooyung Han         "mov x0, x1\n"
1878*5f39d1b3SJooyung Han         "st1 {v12.16b}, [x0], #16\n"
1879*5f39d1b3SJooyung Han         "add x1, x1, %[dst_col_stride]\n"
1880*5f39d1b3SJooyung Han         "st1 {v20.16b}, [x0], #16\n"
1881*5f39d1b3SJooyung Han         "st1 {v28.16b}, [x0]\n"
1882*5f39d1b3SJooyung Han         "mov x0, x1\n"
1883*5f39d1b3SJooyung Han         "st1 {v13.16b}, [x0], #16\n"
1884*5f39d1b3SJooyung Han         "add x1, x1, %[dst_col_stride]\n"
1885*5f39d1b3SJooyung Han         "st1 {v21.16b}, [x0], #16\n"
1886*5f39d1b3SJooyung Han         "st1 {v29.16b}, [x0]\n"
1887*5f39d1b3SJooyung Han         "mov x0, x1\n"
1888*5f39d1b3SJooyung Han         "st1 {v14.16b}, [x0], #16\n"
1889*5f39d1b3SJooyung Han         "add x1, x1, %[dst_col_stride]\n"
1890*5f39d1b3SJooyung Han         "st1 {v22.16b}, [x0], #16\n"
1891*5f39d1b3SJooyung Han         "st1 {v30.16b}, [x0]\n"
1892*5f39d1b3SJooyung Han         "mov x0, x1\n"
1893*5f39d1b3SJooyung Han         "st1 {v15.16b}, [x0], #16\n"
1894*5f39d1b3SJooyung Han         "st1 {v23.16b}, [x0], #16\n"
1895*5f39d1b3SJooyung Han         "st1 {v31.16b}, [x0]\n"
1896*5f39d1b3SJooyung Han         :  // outputs
1897*5f39d1b3SJooyung Han         [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
1898*5f39d1b3SJooyung Han         [depth] "+r"(depth)
1899*5f39d1b3SJooyung Han         :  // inputs
1900*5f39d1b3SJooyung Han         [dst_ptr] "r"(dst_ptr), [dst_col_stride] "r"(dst_col_stride), [start_depth] "r"(start_depth)
1901*5f39d1b3SJooyung Han         :  // clobbers
1902*5f39d1b3SJooyung Han         "cc", "memory", "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
1903*5f39d1b3SJooyung Han         "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
1904*5f39d1b3SJooyung Han         "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
1905*5f39d1b3SJooyung Han   }
1906*5f39d1b3SJooyung Han };
1907*5f39d1b3SJooyung Han #endif  // GEMMLOWP_DOTPROD_KERNEL
1908*5f39d1b3SJooyung Han 
1909*5f39d1b3SJooyung Han #endif  // GEMMLOWP_NEON_64
1910*5f39d1b3SJooyung Han 
1911*5f39d1b3SJooyung Han }  // namespace gemmlowp
1912*5f39d1b3SJooyung Han 
1913*5f39d1b3SJooyung Han #endif  // GEMMLOWP_INTERNAL_KERNEL_NEON_H_
1914