1*4bdc9457SAndroid Build Coastguard Worker // Auto-generated file. Do not edit!
2*4bdc9457SAndroid Build Coastguard Worker // Template: src/x32-transposec/neon-zip.c.in
3*4bdc9457SAndroid Build Coastguard Worker // Generator: tools/xngen
4*4bdc9457SAndroid Build Coastguard Worker //
5*4bdc9457SAndroid Build Coastguard Worker // Copyright 2021 Google LLC
6*4bdc9457SAndroid Build Coastguard Worker //
7*4bdc9457SAndroid Build Coastguard Worker // This source code is licensed under the BSD-style license found in the
8*4bdc9457SAndroid Build Coastguard Worker // LICENSE file in the root directory of this source tree.
9*4bdc9457SAndroid Build Coastguard Worker
10*4bdc9457SAndroid Build Coastguard Worker #include <arm_neon.h>
11*4bdc9457SAndroid Build Coastguard Worker
12*4bdc9457SAndroid Build Coastguard Worker #include <assert.h>
13*4bdc9457SAndroid Build Coastguard Worker
14*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/common.h>
15*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/math.h>
16*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/transpose.h>
17*4bdc9457SAndroid Build Coastguard Worker
xnn_x8_transposec_ukernel__16x16_reuse_mov_zip_neon(const uint8_t * input,uint8_t * output,size_t input_stride,size_t output_stride,size_t block_width,size_t block_height)18*4bdc9457SAndroid Build Coastguard Worker void xnn_x8_transposec_ukernel__16x16_reuse_mov_zip_neon(
19*4bdc9457SAndroid Build Coastguard Worker const uint8_t* input,
20*4bdc9457SAndroid Build Coastguard Worker uint8_t* output,
21*4bdc9457SAndroid Build Coastguard Worker size_t input_stride,
22*4bdc9457SAndroid Build Coastguard Worker size_t output_stride,
23*4bdc9457SAndroid Build Coastguard Worker size_t block_width,
24*4bdc9457SAndroid Build Coastguard Worker size_t block_height) XNN_OOB_READS
25*4bdc9457SAndroid Build Coastguard Worker {
26*4bdc9457SAndroid Build Coastguard Worker assert(output_stride >= block_height * sizeof(uint8_t));
27*4bdc9457SAndroid Build Coastguard Worker assert(input_stride >= block_width * sizeof(uint8_t));
28*4bdc9457SAndroid Build Coastguard Worker
29*4bdc9457SAndroid Build Coastguard Worker const size_t tile_height = 16;
30*4bdc9457SAndroid Build Coastguard Worker const size_t tile_width = 16;
31*4bdc9457SAndroid Build Coastguard Worker const size_t tile_hbytes = tile_height * sizeof(uint8_t);
32*4bdc9457SAndroid Build Coastguard Worker const size_t tile_wbytes = tile_width * sizeof(uint8_t);
33*4bdc9457SAndroid Build Coastguard Worker const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
34*4bdc9457SAndroid Build Coastguard Worker const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint8_t) - tile_hbytes;
35*4bdc9457SAndroid Build Coastguard Worker
36*4bdc9457SAndroid Build Coastguard Worker const uint8_t* i0 = input;
37*4bdc9457SAndroid Build Coastguard Worker uint8_t* o = (uint8_t*) ((uintptr_t) output - tile_hbytes);
38*4bdc9457SAndroid Build Coastguard Worker const size_t minus_output_stride = -output_stride;
39*4bdc9457SAndroid Build Coastguard Worker
40*4bdc9457SAndroid Build Coastguard Worker do {
41*4bdc9457SAndroid Build Coastguard Worker const size_t rem = min(block_width - 1, 15);
42*4bdc9457SAndroid Build Coastguard Worker const size_t oN_stride = rem * output_stride;
43*4bdc9457SAndroid Build Coastguard Worker const size_t oN_offset = oN_stride + tile_hbytes;
44*4bdc9457SAndroid Build Coastguard Worker size_t bh = block_height;
45*4bdc9457SAndroid Build Coastguard Worker for (; bh >= 16; bh -= 16) {
46*4bdc9457SAndroid Build Coastguard Worker const uint8x16_t v4_0 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
47*4bdc9457SAndroid Build Coastguard Worker const uint8x16_t v4_1 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
48*4bdc9457SAndroid Build Coastguard Worker const uint8x16_t v4_2 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
49*4bdc9457SAndroid Build Coastguard Worker const uint8x16_t v4_3 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
50*4bdc9457SAndroid Build Coastguard Worker const uint8x16_t v4_4 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
51*4bdc9457SAndroid Build Coastguard Worker const uint8x16_t v4_5 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
52*4bdc9457SAndroid Build Coastguard Worker const uint8x16_t v4_6 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
53*4bdc9457SAndroid Build Coastguard Worker const uint8x16_t v4_7 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
54*4bdc9457SAndroid Build Coastguard Worker const uint8x16_t v4_8 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
55*4bdc9457SAndroid Build Coastguard Worker const uint8x16_t v4_9 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
56*4bdc9457SAndroid Build Coastguard Worker const uint8x16_t v4_10 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
57*4bdc9457SAndroid Build Coastguard Worker const uint8x16_t v4_11 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
58*4bdc9457SAndroid Build Coastguard Worker const uint8x16_t v4_12 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
59*4bdc9457SAndroid Build Coastguard Worker const uint8x16_t v4_13 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
60*4bdc9457SAndroid Build Coastguard Worker const uint8x16_t v4_14 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
61*4bdc9457SAndroid Build Coastguard Worker const uint8x16_t v4_15 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
62*4bdc9457SAndroid Build Coastguard Worker
63*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v3_0 = vzipq_u8(v4_0, v4_8);
64*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v3_1 = vzipq_u8(v4_1, v4_9);
65*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v3_2 = vzipq_u8(v4_2, v4_10);
66*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v3_3 = vzipq_u8(v4_3, v4_11);
67*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v3_4 = vzipq_u8(v4_4, v4_12);
68*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v3_5 = vzipq_u8(v4_5, v4_13);
69*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v3_6 = vzipq_u8(v4_6, v4_14);
70*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v3_7 = vzipq_u8(v4_7, v4_15);
71*4bdc9457SAndroid Build Coastguard Worker
72*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v2_0 = vzipq_u8(v3_0.val[0], v3_4.val[0]);
73*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v2_1 = vzipq_u8(v3_0.val[1], v3_4.val[1]);
74*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v2_2 = vzipq_u8(v3_1.val[0], v3_5.val[0]);
75*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v2_3 = vzipq_u8(v3_1.val[1], v3_5.val[1]);
76*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v2_4 = vzipq_u8(v3_2.val[0], v3_6.val[0]);
77*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v2_5 = vzipq_u8(v3_2.val[1], v3_6.val[1]);
78*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v2_6 = vzipq_u8(v3_3.val[0], v3_7.val[0]);
79*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v2_7 = vzipq_u8(v3_3.val[1], v3_7.val[1]);
80*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v1_0 = vzipq_u8(v2_0.val[0], v2_4.val[0]);
81*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v1_1 = vzipq_u8(v2_0.val[1], v2_4.val[1]);
82*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v1_2 = vzipq_u8(v2_1.val[0], v2_5.val[0]);
83*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v1_3 = vzipq_u8(v2_1.val[1], v2_5.val[1]);
84*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v1_4 = vzipq_u8(v2_2.val[0], v2_6.val[0]);
85*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v1_5 = vzipq_u8(v2_2.val[1], v2_6.val[1]);
86*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v1_6 = vzipq_u8(v2_3.val[0], v2_7.val[0]);
87*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v1_7 = vzipq_u8(v2_3.val[1], v2_7.val[1]);
88*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v0_0 = vzipq_u8(v1_0.val[0], v1_4.val[0]);
89*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v0_1 = vzipq_u8(v1_0.val[1], v1_4.val[1]);
90*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v0_2 = vzipq_u8(v1_1.val[0], v1_5.val[0]);
91*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v0_3 = vzipq_u8(v1_1.val[1], v1_5.val[1]);
92*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v0_4 = vzipq_u8(v1_2.val[0], v1_6.val[0]);
93*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v0_5 = vzipq_u8(v1_2.val[1], v1_6.val[1]);
94*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v0_6 = vzipq_u8(v1_3.val[0], v1_7.val[0]);
95*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v0_7 = vzipq_u8(v1_3.val[1], v1_7.val[1]);
96*4bdc9457SAndroid Build Coastguard Worker
97*4bdc9457SAndroid Build Coastguard Worker o = (uint8_t*) ((uintptr_t) o + oN_offset);
98*4bdc9457SAndroid Build Coastguard Worker vst1q_u8(o, v0_7.val[1]);
99*4bdc9457SAndroid Build Coastguard Worker uint8_t *oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
100*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 15) {
101*4bdc9457SAndroid Build Coastguard Worker o = oN;
102*4bdc9457SAndroid Build Coastguard Worker }
103*4bdc9457SAndroid Build Coastguard Worker vst1q_u8(o, v0_7.val[0]);
104*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
105*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= 15) {
106*4bdc9457SAndroid Build Coastguard Worker o = oN;
107*4bdc9457SAndroid Build Coastguard Worker }
108*4bdc9457SAndroid Build Coastguard Worker vst1q_u8(o, v0_6.val[1]);
109*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
110*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 13) {
111*4bdc9457SAndroid Build Coastguard Worker o = oN;
112*4bdc9457SAndroid Build Coastguard Worker }
113*4bdc9457SAndroid Build Coastguard Worker vst1q_u8(o, v0_6.val[0]);
114*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
115*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= 13) {
116*4bdc9457SAndroid Build Coastguard Worker o = oN;
117*4bdc9457SAndroid Build Coastguard Worker }
118*4bdc9457SAndroid Build Coastguard Worker vst1q_u8(o, v0_5.val[1]);
119*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
120*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 11) {
121*4bdc9457SAndroid Build Coastguard Worker o = oN;
122*4bdc9457SAndroid Build Coastguard Worker }
123*4bdc9457SAndroid Build Coastguard Worker vst1q_u8(o, v0_5.val[0]);
124*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
125*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= 11) {
126*4bdc9457SAndroid Build Coastguard Worker o = oN;
127*4bdc9457SAndroid Build Coastguard Worker }
128*4bdc9457SAndroid Build Coastguard Worker vst1q_u8(o, v0_4.val[1]);
129*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
130*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 9) {
131*4bdc9457SAndroid Build Coastguard Worker o = oN;
132*4bdc9457SAndroid Build Coastguard Worker }
133*4bdc9457SAndroid Build Coastguard Worker vst1q_u8(o, v0_4.val[0]);
134*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
135*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= 9) {
136*4bdc9457SAndroid Build Coastguard Worker o = oN;
137*4bdc9457SAndroid Build Coastguard Worker }
138*4bdc9457SAndroid Build Coastguard Worker vst1q_u8(o, v0_3.val[1]);
139*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
140*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 7) {
141*4bdc9457SAndroid Build Coastguard Worker o = oN;
142*4bdc9457SAndroid Build Coastguard Worker }
143*4bdc9457SAndroid Build Coastguard Worker vst1q_u8(o, v0_3.val[0]);
144*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
145*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= 7) {
146*4bdc9457SAndroid Build Coastguard Worker o = oN;
147*4bdc9457SAndroid Build Coastguard Worker }
148*4bdc9457SAndroid Build Coastguard Worker vst1q_u8(o, v0_2.val[1]);
149*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
150*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 5) {
151*4bdc9457SAndroid Build Coastguard Worker o = oN;
152*4bdc9457SAndroid Build Coastguard Worker }
153*4bdc9457SAndroid Build Coastguard Worker vst1q_u8(o, v0_2.val[0]);
154*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
155*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= 5) {
156*4bdc9457SAndroid Build Coastguard Worker o = oN;
157*4bdc9457SAndroid Build Coastguard Worker }
158*4bdc9457SAndroid Build Coastguard Worker vst1q_u8(o, v0_1.val[1]);
159*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
160*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 3) {
161*4bdc9457SAndroid Build Coastguard Worker o = oN;
162*4bdc9457SAndroid Build Coastguard Worker }
163*4bdc9457SAndroid Build Coastguard Worker vst1q_u8(o, v0_1.val[0]);
164*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
165*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= 3) {
166*4bdc9457SAndroid Build Coastguard Worker o = oN;
167*4bdc9457SAndroid Build Coastguard Worker }
168*4bdc9457SAndroid Build Coastguard Worker vst1q_u8(o, v0_0.val[1]);
169*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
170*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 1) {
171*4bdc9457SAndroid Build Coastguard Worker o = oN;
172*4bdc9457SAndroid Build Coastguard Worker }
173*4bdc9457SAndroid Build Coastguard Worker vst1q_u8(o, v0_0.val[0]);
174*4bdc9457SAndroid Build Coastguard Worker }
175*4bdc9457SAndroid Build Coastguard Worker o = (uint8_t*) ((uintptr_t) o + tile_hbytes);
176*4bdc9457SAndroid Build Coastguard Worker
177*4bdc9457SAndroid Build Coastguard Worker if (bh != 0) {
178*4bdc9457SAndroid Build Coastguard Worker const uint8x16_t v4_0 = vld1q_u8(i0);
179*4bdc9457SAndroid Build Coastguard Worker const uint8_t *i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
180*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(bh < 2) {
181*4bdc9457SAndroid Build Coastguard Worker i1 = i0;
182*4bdc9457SAndroid Build Coastguard Worker }
183*4bdc9457SAndroid Build Coastguard Worker const uint8x16_t v4_1 = vld1q_u8(i1);
184*4bdc9457SAndroid Build Coastguard Worker const uint8_t *i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
185*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(bh <= 2) {
186*4bdc9457SAndroid Build Coastguard Worker i2 = i1;
187*4bdc9457SAndroid Build Coastguard Worker }
188*4bdc9457SAndroid Build Coastguard Worker const uint8x16_t v4_2 = vld1q_u8(i2);
189*4bdc9457SAndroid Build Coastguard Worker const uint8_t *i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
190*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(bh < 4) {
191*4bdc9457SAndroid Build Coastguard Worker i3 = i2;
192*4bdc9457SAndroid Build Coastguard Worker }
193*4bdc9457SAndroid Build Coastguard Worker const uint8x16_t v4_3 = vld1q_u8(i3);
194*4bdc9457SAndroid Build Coastguard Worker const uint8_t *i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
195*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(bh <= 4) {
196*4bdc9457SAndroid Build Coastguard Worker i4 = i3;
197*4bdc9457SAndroid Build Coastguard Worker }
198*4bdc9457SAndroid Build Coastguard Worker const uint8x16_t v4_4 = vld1q_u8(i4);
199*4bdc9457SAndroid Build Coastguard Worker const uint8_t *i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
200*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(bh < 6) {
201*4bdc9457SAndroid Build Coastguard Worker i5 = i4;
202*4bdc9457SAndroid Build Coastguard Worker }
203*4bdc9457SAndroid Build Coastguard Worker const uint8x16_t v4_5 = vld1q_u8(i5);
204*4bdc9457SAndroid Build Coastguard Worker const uint8_t *i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
205*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(bh <= 6) {
206*4bdc9457SAndroid Build Coastguard Worker i6 = i5;
207*4bdc9457SAndroid Build Coastguard Worker }
208*4bdc9457SAndroid Build Coastguard Worker const uint8x16_t v4_6 = vld1q_u8(i6);
209*4bdc9457SAndroid Build Coastguard Worker const uint8_t *i7 = (const uint8_t*) ((uintptr_t) i6 + input_stride);
210*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(bh < 8) {
211*4bdc9457SAndroid Build Coastguard Worker i7 = i6;
212*4bdc9457SAndroid Build Coastguard Worker }
213*4bdc9457SAndroid Build Coastguard Worker const uint8x16_t v4_7 = vld1q_u8(i7);
214*4bdc9457SAndroid Build Coastguard Worker const uint8_t *i8 = (const uint8_t*) ((uintptr_t) i7 + input_stride);
215*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(bh <= 8) {
216*4bdc9457SAndroid Build Coastguard Worker i8 = i7;
217*4bdc9457SAndroid Build Coastguard Worker }
218*4bdc9457SAndroid Build Coastguard Worker const uint8x16_t v4_8 = vld1q_u8(i8);
219*4bdc9457SAndroid Build Coastguard Worker const uint8_t *i9 = (const uint8_t*) ((uintptr_t) i8 + input_stride);
220*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(bh < 10) {
221*4bdc9457SAndroid Build Coastguard Worker i9 = i8;
222*4bdc9457SAndroid Build Coastguard Worker }
223*4bdc9457SAndroid Build Coastguard Worker const uint8x16_t v4_9 = vld1q_u8(i9);
224*4bdc9457SAndroid Build Coastguard Worker const uint8_t *i10 = (const uint8_t*) ((uintptr_t) i9 + input_stride);
225*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(bh <= 10) {
226*4bdc9457SAndroid Build Coastguard Worker i10 = i9;
227*4bdc9457SAndroid Build Coastguard Worker }
228*4bdc9457SAndroid Build Coastguard Worker const uint8x16_t v4_10 = vld1q_u8(i10);
229*4bdc9457SAndroid Build Coastguard Worker const uint8_t *i11 = (const uint8_t*) ((uintptr_t) i10 + input_stride);
230*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(bh < 12) {
231*4bdc9457SAndroid Build Coastguard Worker i11 = i10;
232*4bdc9457SAndroid Build Coastguard Worker }
233*4bdc9457SAndroid Build Coastguard Worker const uint8x16_t v4_11 = vld1q_u8(i11);
234*4bdc9457SAndroid Build Coastguard Worker const uint8_t *i12 = (const uint8_t*) ((uintptr_t) i11 + input_stride);
235*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(bh <= 12) {
236*4bdc9457SAndroid Build Coastguard Worker i12 = i11;
237*4bdc9457SAndroid Build Coastguard Worker }
238*4bdc9457SAndroid Build Coastguard Worker const uint8x16_t v4_12 = vld1q_u8(i12);
239*4bdc9457SAndroid Build Coastguard Worker const uint8_t *i13 = (const uint8_t*) ((uintptr_t) i12 + input_stride);
240*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(bh < 14) {
241*4bdc9457SAndroid Build Coastguard Worker i13 = i12;
242*4bdc9457SAndroid Build Coastguard Worker }
243*4bdc9457SAndroid Build Coastguard Worker const uint8x16_t v4_13 = vld1q_u8(i13);
244*4bdc9457SAndroid Build Coastguard Worker const uint8_t *i14 = (const uint8_t*) ((uintptr_t) i13 + input_stride);
245*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(bh <= 14) {
246*4bdc9457SAndroid Build Coastguard Worker i14 = i13;
247*4bdc9457SAndroid Build Coastguard Worker }
248*4bdc9457SAndroid Build Coastguard Worker const uint8x16_t v4_14 = vld1q_u8(i14);
249*4bdc9457SAndroid Build Coastguard Worker const uint8x16_t v4_15 = vmovq_n_u8(0);
250*4bdc9457SAndroid Build Coastguard Worker
251*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v3_0 = vzipq_u8(v4_0, v4_8);
252*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v3_1 = vzipq_u8(v4_1, v4_9);
253*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v3_2 = vzipq_u8(v4_2, v4_10);
254*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v3_3 = vzipq_u8(v4_3, v4_11);
255*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v3_4 = vzipq_u8(v4_4, v4_12);
256*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v3_5 = vzipq_u8(v4_5, v4_13);
257*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v3_6 = vzipq_u8(v4_6, v4_14);
258*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v3_7 = vzipq_u8(v4_7, v4_15);
259*4bdc9457SAndroid Build Coastguard Worker
260*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v2_0 = vzipq_u8(v3_0.val[0], v3_4.val[0]);
261*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v2_1 = vzipq_u8(v3_0.val[1], v3_4.val[1]);
262*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v2_2 = vzipq_u8(v3_1.val[0], v3_5.val[0]);
263*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v2_3 = vzipq_u8(v3_1.val[1], v3_5.val[1]);
264*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v2_4 = vzipq_u8(v3_2.val[0], v3_6.val[0]);
265*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v2_5 = vzipq_u8(v3_2.val[1], v3_6.val[1]);
266*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v2_6 = vzipq_u8(v3_3.val[0], v3_7.val[0]);
267*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v2_7 = vzipq_u8(v3_3.val[1], v3_7.val[1]);
268*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v1_0 = vzipq_u8(v2_0.val[0], v2_4.val[0]);
269*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v1_1 = vzipq_u8(v2_0.val[1], v2_4.val[1]);
270*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v1_2 = vzipq_u8(v2_1.val[0], v2_5.val[0]);
271*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v1_3 = vzipq_u8(v2_1.val[1], v2_5.val[1]);
272*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v1_4 = vzipq_u8(v2_2.val[0], v2_6.val[0]);
273*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v1_5 = vzipq_u8(v2_2.val[1], v2_6.val[1]);
274*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v1_6 = vzipq_u8(v2_3.val[0], v2_7.val[0]);
275*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v1_7 = vzipq_u8(v2_3.val[1], v2_7.val[1]);
276*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v0_0 = vzipq_u8(v1_0.val[0], v1_4.val[0]);
277*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v0_1 = vzipq_u8(v1_0.val[1], v1_4.val[1]);
278*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v0_2 = vzipq_u8(v1_1.val[0], v1_5.val[0]);
279*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v0_3 = vzipq_u8(v1_1.val[1], v1_5.val[1]);
280*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v0_4 = vzipq_u8(v1_2.val[0], v1_6.val[0]);
281*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v0_5 = vzipq_u8(v1_2.val[1], v1_6.val[1]);
282*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v0_6 = vzipq_u8(v1_3.val[0], v1_7.val[0]);
283*4bdc9457SAndroid Build Coastguard Worker const uint8x16x2_t v0_7 = vzipq_u8(v1_3.val[1], v1_7.val[1]);
284*4bdc9457SAndroid Build Coastguard Worker
285*4bdc9457SAndroid Build Coastguard Worker uint8x8_t v0_low = vget_low_u8(v0_0.val[0]);
286*4bdc9457SAndroid Build Coastguard Worker uint8x8_t v1_low = vget_low_u8(v0_0.val[1]);
287*4bdc9457SAndroid Build Coastguard Worker uint8x8_t v2_low = vget_low_u8(v0_1.val[0]);
288*4bdc9457SAndroid Build Coastguard Worker uint8x8_t v3_low = vget_low_u8(v0_1.val[1]);
289*4bdc9457SAndroid Build Coastguard Worker uint8x8_t v4_low = vget_low_u8(v0_2.val[0]);
290*4bdc9457SAndroid Build Coastguard Worker uint8x8_t v5_low = vget_low_u8(v0_2.val[1]);
291*4bdc9457SAndroid Build Coastguard Worker uint8x8_t v6_low = vget_low_u8(v0_3.val[0]);
292*4bdc9457SAndroid Build Coastguard Worker uint8x8_t v7_low = vget_low_u8(v0_3.val[1]);
293*4bdc9457SAndroid Build Coastguard Worker uint8x8_t v8_low = vget_low_u8(v0_4.val[0]);
294*4bdc9457SAndroid Build Coastguard Worker uint8x8_t v9_low = vget_low_u8(v0_4.val[1]);
295*4bdc9457SAndroid Build Coastguard Worker uint8x8_t v10_low = vget_low_u8(v0_5.val[0]);
296*4bdc9457SAndroid Build Coastguard Worker uint8x8_t v11_low = vget_low_u8(v0_5.val[1]);
297*4bdc9457SAndroid Build Coastguard Worker uint8x8_t v12_low = vget_low_u8(v0_6.val[0]);
298*4bdc9457SAndroid Build Coastguard Worker uint8x8_t v13_low = vget_low_u8(v0_6.val[1]);
299*4bdc9457SAndroid Build Coastguard Worker uint8x8_t v14_low = vget_low_u8(v0_7.val[0]);
300*4bdc9457SAndroid Build Coastguard Worker uint8x8_t v15_low = vget_low_u8(v0_7.val[1]);
301*4bdc9457SAndroid Build Coastguard Worker
302*4bdc9457SAndroid Build Coastguard Worker if (bh & 8) {
303*4bdc9457SAndroid Build Coastguard Worker o = (uint8_t*) ((uintptr_t) o + oN_stride);
304*4bdc9457SAndroid Build Coastguard Worker vst1_u8(o, v15_low);
305*4bdc9457SAndroid Build Coastguard Worker uint8_t *oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
306*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 15) {
307*4bdc9457SAndroid Build Coastguard Worker o = oN;
308*4bdc9457SAndroid Build Coastguard Worker }
309*4bdc9457SAndroid Build Coastguard Worker vst1_u8(o, v14_low);
310*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
311*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= 15) {
312*4bdc9457SAndroid Build Coastguard Worker o = oN;
313*4bdc9457SAndroid Build Coastguard Worker }
314*4bdc9457SAndroid Build Coastguard Worker vst1_u8(o, v13_low);
315*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
316*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 13) {
317*4bdc9457SAndroid Build Coastguard Worker o = oN;
318*4bdc9457SAndroid Build Coastguard Worker }
319*4bdc9457SAndroid Build Coastguard Worker vst1_u8(o, v12_low);
320*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
321*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= 13) {
322*4bdc9457SAndroid Build Coastguard Worker o = oN;
323*4bdc9457SAndroid Build Coastguard Worker }
324*4bdc9457SAndroid Build Coastguard Worker vst1_u8(o, v11_low);
325*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
326*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 11) {
327*4bdc9457SAndroid Build Coastguard Worker o = oN;
328*4bdc9457SAndroid Build Coastguard Worker }
329*4bdc9457SAndroid Build Coastguard Worker vst1_u8(o, v10_low);
330*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
331*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= 11) {
332*4bdc9457SAndroid Build Coastguard Worker o = oN;
333*4bdc9457SAndroid Build Coastguard Worker }
334*4bdc9457SAndroid Build Coastguard Worker vst1_u8(o, v9_low);
335*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
336*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 9) {
337*4bdc9457SAndroid Build Coastguard Worker o = oN;
338*4bdc9457SAndroid Build Coastguard Worker }
339*4bdc9457SAndroid Build Coastguard Worker vst1_u8(o, v8_low);
340*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
341*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= 9) {
342*4bdc9457SAndroid Build Coastguard Worker o = oN;
343*4bdc9457SAndroid Build Coastguard Worker }
344*4bdc9457SAndroid Build Coastguard Worker vst1_u8(o, v7_low);
345*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
346*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 7) {
347*4bdc9457SAndroid Build Coastguard Worker o = oN;
348*4bdc9457SAndroid Build Coastguard Worker }
349*4bdc9457SAndroid Build Coastguard Worker vst1_u8(o, v6_low);
350*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
351*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= 7) {
352*4bdc9457SAndroid Build Coastguard Worker o = oN;
353*4bdc9457SAndroid Build Coastguard Worker }
354*4bdc9457SAndroid Build Coastguard Worker vst1_u8(o, v5_low);
355*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
356*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 5) {
357*4bdc9457SAndroid Build Coastguard Worker o = oN;
358*4bdc9457SAndroid Build Coastguard Worker }
359*4bdc9457SAndroid Build Coastguard Worker vst1_u8(o, v4_low);
360*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
361*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= 5) {
362*4bdc9457SAndroid Build Coastguard Worker o = oN;
363*4bdc9457SAndroid Build Coastguard Worker }
364*4bdc9457SAndroid Build Coastguard Worker vst1_u8(o, v3_low);
365*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
366*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 3) {
367*4bdc9457SAndroid Build Coastguard Worker o = oN;
368*4bdc9457SAndroid Build Coastguard Worker }
369*4bdc9457SAndroid Build Coastguard Worker vst1_u8(o, v2_low);
370*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
371*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= 3) {
372*4bdc9457SAndroid Build Coastguard Worker o = oN;
373*4bdc9457SAndroid Build Coastguard Worker }
374*4bdc9457SAndroid Build Coastguard Worker vst1_u8(o, v1_low);
375*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
376*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 1) {
377*4bdc9457SAndroid Build Coastguard Worker o = oN;
378*4bdc9457SAndroid Build Coastguard Worker }
379*4bdc9457SAndroid Build Coastguard Worker vst1_u8(o, v0_low); o += 8;
380*4bdc9457SAndroid Build Coastguard Worker v0_low = vget_high_u8(v0_0.val[0]);
381*4bdc9457SAndroid Build Coastguard Worker v1_low = vget_high_u8(v0_0.val[1]);
382*4bdc9457SAndroid Build Coastguard Worker v2_low = vget_high_u8(v0_1.val[0]);
383*4bdc9457SAndroid Build Coastguard Worker v3_low = vget_high_u8(v0_1.val[1]);
384*4bdc9457SAndroid Build Coastguard Worker v4_low = vget_high_u8(v0_2.val[0]);
385*4bdc9457SAndroid Build Coastguard Worker v5_low = vget_high_u8(v0_2.val[1]);
386*4bdc9457SAndroid Build Coastguard Worker v6_low = vget_high_u8(v0_3.val[0]);
387*4bdc9457SAndroid Build Coastguard Worker v7_low = vget_high_u8(v0_3.val[1]);
388*4bdc9457SAndroid Build Coastguard Worker v8_low = vget_high_u8(v0_4.val[0]);
389*4bdc9457SAndroid Build Coastguard Worker v9_low = vget_high_u8(v0_4.val[1]);
390*4bdc9457SAndroid Build Coastguard Worker v10_low = vget_high_u8(v0_5.val[0]);
391*4bdc9457SAndroid Build Coastguard Worker v11_low = vget_high_u8(v0_5.val[1]);
392*4bdc9457SAndroid Build Coastguard Worker v12_low = vget_high_u8(v0_6.val[0]);
393*4bdc9457SAndroid Build Coastguard Worker v13_low = vget_high_u8(v0_6.val[1]);
394*4bdc9457SAndroid Build Coastguard Worker v14_low = vget_high_u8(v0_7.val[0]);
395*4bdc9457SAndroid Build Coastguard Worker v15_low = vget_high_u8(v0_7.val[1]);
396*4bdc9457SAndroid Build Coastguard Worker }
397*4bdc9457SAndroid Build Coastguard Worker
398*4bdc9457SAndroid Build Coastguard Worker if (bh & 4) {
399*4bdc9457SAndroid Build Coastguard Worker o = (uint8_t*) ((uintptr_t) o + oN_stride);
400*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u32((void*) o, vreinterpret_u32_u8(v15_low), 0);
401*4bdc9457SAndroid Build Coastguard Worker uint8_t *oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
402*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 15) {
403*4bdc9457SAndroid Build Coastguard Worker o = oN;
404*4bdc9457SAndroid Build Coastguard Worker }
405*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u32((void*) o, vreinterpret_u32_u8(v14_low), 0);
406*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
407*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= 15) {
408*4bdc9457SAndroid Build Coastguard Worker o = oN;
409*4bdc9457SAndroid Build Coastguard Worker }
410*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u32((void*) o, vreinterpret_u32_u8(v13_low), 0);
411*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
412*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 13) {
413*4bdc9457SAndroid Build Coastguard Worker o = oN;
414*4bdc9457SAndroid Build Coastguard Worker }
415*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u32((void*) o, vreinterpret_u32_u8(v12_low), 0);
416*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
417*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= 13) {
418*4bdc9457SAndroid Build Coastguard Worker o = oN;
419*4bdc9457SAndroid Build Coastguard Worker }
420*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u32((void*) o, vreinterpret_u32_u8(v11_low), 0);
421*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
422*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 11) {
423*4bdc9457SAndroid Build Coastguard Worker o = oN;
424*4bdc9457SAndroid Build Coastguard Worker }
425*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u32((void*) o, vreinterpret_u32_u8(v10_low), 0);
426*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
427*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= 11) {
428*4bdc9457SAndroid Build Coastguard Worker o = oN;
429*4bdc9457SAndroid Build Coastguard Worker }
430*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u32((void*) o, vreinterpret_u32_u8(v9_low), 0);
431*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
432*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 9) {
433*4bdc9457SAndroid Build Coastguard Worker o = oN;
434*4bdc9457SAndroid Build Coastguard Worker }
435*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u32((void*) o, vreinterpret_u32_u8(v8_low), 0);
436*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
437*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= 9) {
438*4bdc9457SAndroid Build Coastguard Worker o = oN;
439*4bdc9457SAndroid Build Coastguard Worker }
440*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u32((void*) o, vreinterpret_u32_u8(v7_low), 0);
441*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
442*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 7) {
443*4bdc9457SAndroid Build Coastguard Worker o = oN;
444*4bdc9457SAndroid Build Coastguard Worker }
445*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u32((void*) o, vreinterpret_u32_u8(v6_low), 0);
446*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
447*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= 7) {
448*4bdc9457SAndroid Build Coastguard Worker o = oN;
449*4bdc9457SAndroid Build Coastguard Worker }
450*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u32((void*) o, vreinterpret_u32_u8(v5_low), 0);
451*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
452*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 5) {
453*4bdc9457SAndroid Build Coastguard Worker o = oN;
454*4bdc9457SAndroid Build Coastguard Worker }
455*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u32((void*) o, vreinterpret_u32_u8(v4_low), 0);
456*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
457*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= 5) {
458*4bdc9457SAndroid Build Coastguard Worker o = oN;
459*4bdc9457SAndroid Build Coastguard Worker }
460*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u32((void*) o, vreinterpret_u32_u8(v3_low), 0);
461*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
462*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 3) {
463*4bdc9457SAndroid Build Coastguard Worker o = oN;
464*4bdc9457SAndroid Build Coastguard Worker }
465*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u32((void*) o, vreinterpret_u32_u8(v2_low), 0);
466*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
467*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= 3) {
468*4bdc9457SAndroid Build Coastguard Worker o = oN;
469*4bdc9457SAndroid Build Coastguard Worker }
470*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u32((void*) o, vreinterpret_u32_u8(v1_low), 0);
471*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
472*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 1) {
473*4bdc9457SAndroid Build Coastguard Worker o = oN;
474*4bdc9457SAndroid Build Coastguard Worker }
475*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u32((void*) o, vreinterpret_u32_u8(v0_low), 0); o += 4;
476*4bdc9457SAndroid Build Coastguard Worker v0_low = vext_u8(v0_low, v0_low, 4);
477*4bdc9457SAndroid Build Coastguard Worker v1_low = vext_u8(v1_low, v1_low, 4);
478*4bdc9457SAndroid Build Coastguard Worker v2_low = vext_u8(v2_low, v2_low, 4);
479*4bdc9457SAndroid Build Coastguard Worker v3_low = vext_u8(v3_low, v3_low, 4);
480*4bdc9457SAndroid Build Coastguard Worker v4_low = vext_u8(v4_low, v4_low, 4);
481*4bdc9457SAndroid Build Coastguard Worker v5_low = vext_u8(v5_low, v5_low, 4);
482*4bdc9457SAndroid Build Coastguard Worker v6_low = vext_u8(v6_low, v6_low, 4);
483*4bdc9457SAndroid Build Coastguard Worker v7_low = vext_u8(v7_low, v7_low, 4);
484*4bdc9457SAndroid Build Coastguard Worker v8_low = vext_u8(v8_low, v8_low, 4);
485*4bdc9457SAndroid Build Coastguard Worker v9_low = vext_u8(v9_low, v9_low, 4);
486*4bdc9457SAndroid Build Coastguard Worker v10_low = vext_u8(v10_low, v10_low, 4);
487*4bdc9457SAndroid Build Coastguard Worker v11_low = vext_u8(v11_low, v11_low, 4);
488*4bdc9457SAndroid Build Coastguard Worker v12_low = vext_u8(v12_low, v12_low, 4);
489*4bdc9457SAndroid Build Coastguard Worker v13_low = vext_u8(v13_low, v13_low, 4);
490*4bdc9457SAndroid Build Coastguard Worker v14_low = vext_u8(v14_low, v14_low, 4);
491*4bdc9457SAndroid Build Coastguard Worker v15_low = vext_u8(v15_low, v15_low, 4);
492*4bdc9457SAndroid Build Coastguard Worker }
493*4bdc9457SAndroid Build Coastguard Worker if (bh & 2) {
494*4bdc9457SAndroid Build Coastguard Worker o = (uint8_t*) ((uintptr_t) o + oN_stride);
495*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u16((void*) o, vreinterpret_u16_u8(v15_low), 0);
496*4bdc9457SAndroid Build Coastguard Worker uint8_t *oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
497*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 15) {
498*4bdc9457SAndroid Build Coastguard Worker o = oN;
499*4bdc9457SAndroid Build Coastguard Worker }
500*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u16((void*) o, vreinterpret_u16_u8(v14_low), 0);
501*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
502*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= 15) {
503*4bdc9457SAndroid Build Coastguard Worker o = oN;
504*4bdc9457SAndroid Build Coastguard Worker }
505*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u16((void*) o, vreinterpret_u16_u8(v13_low), 0);
506*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
507*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 13) {
508*4bdc9457SAndroid Build Coastguard Worker o = oN;
509*4bdc9457SAndroid Build Coastguard Worker }
510*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u16((void*) o, vreinterpret_u16_u8(v12_low), 0);
511*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
512*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= 13) {
513*4bdc9457SAndroid Build Coastguard Worker o = oN;
514*4bdc9457SAndroid Build Coastguard Worker }
515*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u16((void*) o, vreinterpret_u16_u8(v11_low), 0);
516*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
517*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 11) {
518*4bdc9457SAndroid Build Coastguard Worker o = oN;
519*4bdc9457SAndroid Build Coastguard Worker }
520*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u16((void*) o, vreinterpret_u16_u8(v10_low), 0);
521*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
522*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= 11) {
523*4bdc9457SAndroid Build Coastguard Worker o = oN;
524*4bdc9457SAndroid Build Coastguard Worker }
525*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u16((void*) o, vreinterpret_u16_u8(v9_low), 0);
526*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
527*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 9) {
528*4bdc9457SAndroid Build Coastguard Worker o = oN;
529*4bdc9457SAndroid Build Coastguard Worker }
530*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u16((void*) o, vreinterpret_u16_u8(v8_low), 0);
531*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
532*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= 9) {
533*4bdc9457SAndroid Build Coastguard Worker o = oN;
534*4bdc9457SAndroid Build Coastguard Worker }
535*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u16((void*) o, vreinterpret_u16_u8(v7_low), 0);
536*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
537*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 7) {
538*4bdc9457SAndroid Build Coastguard Worker o = oN;
539*4bdc9457SAndroid Build Coastguard Worker }
540*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u16((void*) o, vreinterpret_u16_u8(v6_low), 0);
541*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
542*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= 7) {
543*4bdc9457SAndroid Build Coastguard Worker o = oN;
544*4bdc9457SAndroid Build Coastguard Worker }
545*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u16((void*) o, vreinterpret_u16_u8(v5_low), 0);
546*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
547*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 5) {
548*4bdc9457SAndroid Build Coastguard Worker o = oN;
549*4bdc9457SAndroid Build Coastguard Worker }
550*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u16((void*) o, vreinterpret_u16_u8(v4_low), 0);
551*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
552*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= 5) {
553*4bdc9457SAndroid Build Coastguard Worker o = oN;
554*4bdc9457SAndroid Build Coastguard Worker }
555*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u16((void*) o, vreinterpret_u16_u8(v3_low), 0);
556*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
557*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 3) {
558*4bdc9457SAndroid Build Coastguard Worker o = oN;
559*4bdc9457SAndroid Build Coastguard Worker }
560*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u16((void*) o, vreinterpret_u16_u8(v2_low), 0);
561*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
562*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= 3) {
563*4bdc9457SAndroid Build Coastguard Worker o = oN;
564*4bdc9457SAndroid Build Coastguard Worker }
565*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u16((void*) o, vreinterpret_u16_u8(v1_low), 0);
566*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
567*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 1) {
568*4bdc9457SAndroid Build Coastguard Worker o = oN;
569*4bdc9457SAndroid Build Coastguard Worker }
570*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u16((void*) o, vreinterpret_u16_u8(v0_low), 0); o += 2;
571*4bdc9457SAndroid Build Coastguard Worker v0_low = vext_u8(v0_low, v0_low, 2);
572*4bdc9457SAndroid Build Coastguard Worker v1_low = vext_u8(v1_low, v1_low, 2);
573*4bdc9457SAndroid Build Coastguard Worker v2_low = vext_u8(v2_low, v2_low, 2);
574*4bdc9457SAndroid Build Coastguard Worker v3_low = vext_u8(v3_low, v3_low, 2);
575*4bdc9457SAndroid Build Coastguard Worker v4_low = vext_u8(v4_low, v4_low, 2);
576*4bdc9457SAndroid Build Coastguard Worker v5_low = vext_u8(v5_low, v5_low, 2);
577*4bdc9457SAndroid Build Coastguard Worker v6_low = vext_u8(v6_low, v6_low, 2);
578*4bdc9457SAndroid Build Coastguard Worker v7_low = vext_u8(v7_low, v7_low, 2);
579*4bdc9457SAndroid Build Coastguard Worker v8_low = vext_u8(v8_low, v8_low, 2);
580*4bdc9457SAndroid Build Coastguard Worker v9_low = vext_u8(v9_low, v9_low, 2);
581*4bdc9457SAndroid Build Coastguard Worker v10_low = vext_u8(v10_low, v10_low, 2);
582*4bdc9457SAndroid Build Coastguard Worker v11_low = vext_u8(v11_low, v11_low, 2);
583*4bdc9457SAndroid Build Coastguard Worker v12_low = vext_u8(v12_low, v12_low, 2);
584*4bdc9457SAndroid Build Coastguard Worker v13_low = vext_u8(v13_low, v13_low, 2);
585*4bdc9457SAndroid Build Coastguard Worker v14_low = vext_u8(v14_low, v14_low, 2);
586*4bdc9457SAndroid Build Coastguard Worker v15_low = vext_u8(v15_low, v15_low, 2);
587*4bdc9457SAndroid Build Coastguard Worker }
588*4bdc9457SAndroid Build Coastguard Worker if (bh & 1) {
589*4bdc9457SAndroid Build Coastguard Worker o = (uint8_t*) ((uintptr_t) o + oN_stride);
590*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u8(o, v15_low, 0);
591*4bdc9457SAndroid Build Coastguard Worker uint8_t *oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
592*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 15) {
593*4bdc9457SAndroid Build Coastguard Worker o = oN;
594*4bdc9457SAndroid Build Coastguard Worker }
595*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u8(o, v14_low, 0);
596*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
597*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= 15) {
598*4bdc9457SAndroid Build Coastguard Worker o = oN;
599*4bdc9457SAndroid Build Coastguard Worker }
600*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u8(o, v13_low, 0);
601*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
602*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 13) {
603*4bdc9457SAndroid Build Coastguard Worker o = oN;
604*4bdc9457SAndroid Build Coastguard Worker }
605*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u8(o, v12_low, 0);
606*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
607*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= 13) {
608*4bdc9457SAndroid Build Coastguard Worker o = oN;
609*4bdc9457SAndroid Build Coastguard Worker }
610*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u8(o, v11_low, 0);
611*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
612*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 11) {
613*4bdc9457SAndroid Build Coastguard Worker o = oN;
614*4bdc9457SAndroid Build Coastguard Worker }
615*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u8(o, v10_low, 0);
616*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
617*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= 11) {
618*4bdc9457SAndroid Build Coastguard Worker o = oN;
619*4bdc9457SAndroid Build Coastguard Worker }
620*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u8(o, v9_low, 0);
621*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
622*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 9) {
623*4bdc9457SAndroid Build Coastguard Worker o = oN;
624*4bdc9457SAndroid Build Coastguard Worker }
625*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u8(o, v8_low, 0);
626*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
627*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= 9) {
628*4bdc9457SAndroid Build Coastguard Worker o = oN;
629*4bdc9457SAndroid Build Coastguard Worker }
630*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u8(o, v7_low, 0);
631*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
632*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 7) {
633*4bdc9457SAndroid Build Coastguard Worker o = oN;
634*4bdc9457SAndroid Build Coastguard Worker }
635*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u8(o, v6_low, 0);
636*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
637*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= 7) {
638*4bdc9457SAndroid Build Coastguard Worker o = oN;
639*4bdc9457SAndroid Build Coastguard Worker }
640*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u8(o, v5_low, 0);
641*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
642*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 5) {
643*4bdc9457SAndroid Build Coastguard Worker o = oN;
644*4bdc9457SAndroid Build Coastguard Worker }
645*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u8(o, v4_low, 0);
646*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
647*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= 5) {
648*4bdc9457SAndroid Build Coastguard Worker o = oN;
649*4bdc9457SAndroid Build Coastguard Worker }
650*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u8(o, v3_low, 0);
651*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
652*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 3) {
653*4bdc9457SAndroid Build Coastguard Worker o = oN;
654*4bdc9457SAndroid Build Coastguard Worker }
655*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u8(o, v2_low, 0);
656*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
657*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width >= 3) {
658*4bdc9457SAndroid Build Coastguard Worker o = oN;
659*4bdc9457SAndroid Build Coastguard Worker }
660*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u8(o, v1_low, 0);
661*4bdc9457SAndroid Build Coastguard Worker oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
662*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(block_width > 1) {
663*4bdc9457SAndroid Build Coastguard Worker o = oN;
664*4bdc9457SAndroid Build Coastguard Worker }
665*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u8(o, v0_low, 0);
666*4bdc9457SAndroid Build Coastguard Worker }
667*4bdc9457SAndroid Build Coastguard Worker }
668*4bdc9457SAndroid Build Coastguard Worker
669*4bdc9457SAndroid Build Coastguard Worker i0 = (const uint8_t*) ((uintptr_t) i0 + input_reset);
670*4bdc9457SAndroid Build Coastguard Worker o = (uint8_t*) ((uintptr_t) o + output_reset);
671*4bdc9457SAndroid Build Coastguard Worker block_width = doz(block_width, tile_width);
672*4bdc9457SAndroid Build Coastguard Worker } while (block_width != 0);
673*4bdc9457SAndroid Build Coastguard Worker }
674