1*4bdc9457SAndroid Build Coastguard Worker // Auto-generated file. Do not edit!
2*4bdc9457SAndroid Build Coastguard Worker // Template: src/f32-spmm/neon-blocked.c.in
3*4bdc9457SAndroid Build Coastguard Worker // Generator: tools/xngen
4*4bdc9457SAndroid Build Coastguard Worker //
5*4bdc9457SAndroid Build Coastguard Worker // Copyright 2019 Google LLC
6*4bdc9457SAndroid Build Coastguard Worker //
7*4bdc9457SAndroid Build Coastguard Worker // This source code is licensed under the BSD-style license found in the
8*4bdc9457SAndroid Build Coastguard Worker // LICENSE file in the root directory of this source tree.
9*4bdc9457SAndroid Build Coastguard Worker
10*4bdc9457SAndroid Build Coastguard Worker #include <assert.h>
11*4bdc9457SAndroid Build Coastguard Worker
12*4bdc9457SAndroid Build Coastguard Worker #include <arm_neon.h>
13*4bdc9457SAndroid Build Coastguard Worker
14*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/spmm.h>
15*4bdc9457SAndroid Build Coastguard Worker
16*4bdc9457SAndroid Build Coastguard Worker
xnn_f32_spmm_minmax_ukernel_32x4__neonfma(size_t mc,size_t nc,const float * restrict input,const float * restrict weights,const int32_t * restrict widx_dmap,const uint32_t * restrict nidx_nnzmap,float * restrict output,size_t output_stride,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])17*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_spmm_minmax_ukernel_32x4__neonfma(
18*4bdc9457SAndroid Build Coastguard Worker size_t mc,
19*4bdc9457SAndroid Build Coastguard Worker size_t nc,
20*4bdc9457SAndroid Build Coastguard Worker const float*restrict input,
21*4bdc9457SAndroid Build Coastguard Worker const float*restrict weights,
22*4bdc9457SAndroid Build Coastguard Worker const int32_t*restrict widx_dmap,
23*4bdc9457SAndroid Build Coastguard Worker const uint32_t*restrict nidx_nnzmap,
24*4bdc9457SAndroid Build Coastguard Worker float*restrict output,
25*4bdc9457SAndroid Build Coastguard Worker size_t output_stride,
26*4bdc9457SAndroid Build Coastguard Worker const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
27*4bdc9457SAndroid Build Coastguard Worker {
28*4bdc9457SAndroid Build Coastguard Worker assert(mc != 0);
29*4bdc9457SAndroid Build Coastguard Worker assert(mc % sizeof(float) == 0);
30*4bdc9457SAndroid Build Coastguard Worker assert(nc != 0);
31*4bdc9457SAndroid Build Coastguard Worker
32*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vmin = vld1q_dup_f32(¶ms->scalar.min);
33*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vmax = vld1q_dup_f32(¶ms->scalar.max);
34*4bdc9457SAndroid Build Coastguard Worker size_t output_decrement = output_stride * nc - 32 * sizeof(float);
35*4bdc9457SAndroid Build Coastguard Worker while XNN_LIKELY(mc >= 32 * sizeof(float)) {
36*4bdc9457SAndroid Build Coastguard Worker const float*restrict w = weights;
37*4bdc9457SAndroid Build Coastguard Worker const int32_t* dmap = widx_dmap;
38*4bdc9457SAndroid Build Coastguard Worker const uint32_t* nnzmap = nidx_nnzmap;
39*4bdc9457SAndroid Build Coastguard Worker size_t n = nc;
40*4bdc9457SAndroid Build Coastguard Worker while (n >= 4) {
41*4bdc9457SAndroid Build Coastguard Worker uint32_t nnz = *nnzmap++;
42*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc0123n0 = vld1q_dup_f32(w); w += 1;
43*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc4567n0 = vacc0123n0;
44*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc89ABn0 = vacc0123n0;
45*4bdc9457SAndroid Build Coastguard Worker float32x4_t vaccCDEFn0 = vacc0123n0;
46*4bdc9457SAndroid Build Coastguard Worker float32x4_t vaccGHIJn0 = vacc0123n0;
47*4bdc9457SAndroid Build Coastguard Worker float32x4_t vaccKLMNn0 = vacc0123n0;
48*4bdc9457SAndroid Build Coastguard Worker float32x4_t vaccOPQRn0 = vacc0123n0;
49*4bdc9457SAndroid Build Coastguard Worker float32x4_t vaccSTUVn0 = vacc0123n0;
50*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc0123n1 = vld1q_dup_f32(w); w += 1;
51*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc4567n1 = vacc0123n1;
52*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc89ABn1 = vacc0123n1;
53*4bdc9457SAndroid Build Coastguard Worker float32x4_t vaccCDEFn1 = vacc0123n1;
54*4bdc9457SAndroid Build Coastguard Worker float32x4_t vaccGHIJn1 = vacc0123n1;
55*4bdc9457SAndroid Build Coastguard Worker float32x4_t vaccKLMNn1 = vacc0123n1;
56*4bdc9457SAndroid Build Coastguard Worker float32x4_t vaccOPQRn1 = vacc0123n1;
57*4bdc9457SAndroid Build Coastguard Worker float32x4_t vaccSTUVn1 = vacc0123n1;
58*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc0123n2 = vld1q_dup_f32(w); w += 1;
59*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc4567n2 = vacc0123n2;
60*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc89ABn2 = vacc0123n2;
61*4bdc9457SAndroid Build Coastguard Worker float32x4_t vaccCDEFn2 = vacc0123n2;
62*4bdc9457SAndroid Build Coastguard Worker float32x4_t vaccGHIJn2 = vacc0123n2;
63*4bdc9457SAndroid Build Coastguard Worker float32x4_t vaccKLMNn2 = vacc0123n2;
64*4bdc9457SAndroid Build Coastguard Worker float32x4_t vaccOPQRn2 = vacc0123n2;
65*4bdc9457SAndroid Build Coastguard Worker float32x4_t vaccSTUVn2 = vacc0123n2;
66*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc0123n3 = vld1q_dup_f32(w); w += 1;
67*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc4567n3 = vacc0123n3;
68*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc89ABn3 = vacc0123n3;
69*4bdc9457SAndroid Build Coastguard Worker float32x4_t vaccCDEFn3 = vacc0123n3;
70*4bdc9457SAndroid Build Coastguard Worker float32x4_t vaccGHIJn3 = vacc0123n3;
71*4bdc9457SAndroid Build Coastguard Worker float32x4_t vaccKLMNn3 = vacc0123n3;
72*4bdc9457SAndroid Build Coastguard Worker float32x4_t vaccOPQRn3 = vacc0123n3;
73*4bdc9457SAndroid Build Coastguard Worker float32x4_t vaccSTUVn3 = vacc0123n3;
74*4bdc9457SAndroid Build Coastguard Worker if XNN_LIKELY(nnz != 0) {
75*4bdc9457SAndroid Build Coastguard Worker do {
76*4bdc9457SAndroid Build Coastguard Worker const intptr_t diff = *dmap++;
77*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vi0123 = vld1q_f32(input);
78*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vi4567 = vld1q_f32(input + 4);
79*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vi89AB = vld1q_f32(input + 8);
80*4bdc9457SAndroid Build Coastguard Worker const float32x4_t viCDEF = vld1q_f32(input + 12);
81*4bdc9457SAndroid Build Coastguard Worker const float32x4_t viGHIJ = vld1q_f32(input + 16);
82*4bdc9457SAndroid Build Coastguard Worker const float32x4_t viKLMN = vld1q_f32(input + 20);
83*4bdc9457SAndroid Build Coastguard Worker const float32x4_t viOPQR = vld1q_f32(input + 24);
84*4bdc9457SAndroid Build Coastguard Worker const float32x4_t viSTUV = vld1q_f32(input + 28);
85*4bdc9457SAndroid Build Coastguard Worker input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
86*4bdc9457SAndroid Build Coastguard Worker __builtin_prefetch(input + 16);
87*4bdc9457SAndroid Build Coastguard Worker __builtin_prefetch(input + 32);
88*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vw = vld1q_f32(w); w += 4;
89*4bdc9457SAndroid Build Coastguard Worker __builtin_prefetch(w + 32);
90*4bdc9457SAndroid Build Coastguard Worker vacc0123n0 = vfmaq_laneq_f32(vacc0123n0, vi0123, vw, 0);
91*4bdc9457SAndroid Build Coastguard Worker vacc4567n0 = vfmaq_laneq_f32(vacc4567n0, vi4567, vw, 0);
92*4bdc9457SAndroid Build Coastguard Worker vacc89ABn0 = vfmaq_laneq_f32(vacc89ABn0, vi89AB, vw, 0);
93*4bdc9457SAndroid Build Coastguard Worker vaccCDEFn0 = vfmaq_laneq_f32(vaccCDEFn0, viCDEF, vw, 0);
94*4bdc9457SAndroid Build Coastguard Worker vaccGHIJn0 = vfmaq_laneq_f32(vaccGHIJn0, viGHIJ, vw, 0);
95*4bdc9457SAndroid Build Coastguard Worker vaccKLMNn0 = vfmaq_laneq_f32(vaccKLMNn0, viKLMN, vw, 0);
96*4bdc9457SAndroid Build Coastguard Worker vaccOPQRn0 = vfmaq_laneq_f32(vaccOPQRn0, viOPQR, vw, 0);
97*4bdc9457SAndroid Build Coastguard Worker vaccSTUVn0 = vfmaq_laneq_f32(vaccSTUVn0, viSTUV, vw, 0);
98*4bdc9457SAndroid Build Coastguard Worker vacc0123n1 = vfmaq_laneq_f32(vacc0123n1, vi0123, vw, 1);
99*4bdc9457SAndroid Build Coastguard Worker vacc4567n1 = vfmaq_laneq_f32(vacc4567n1, vi4567, vw, 1);
100*4bdc9457SAndroid Build Coastguard Worker vacc89ABn1 = vfmaq_laneq_f32(vacc89ABn1, vi89AB, vw, 1);
101*4bdc9457SAndroid Build Coastguard Worker vaccCDEFn1 = vfmaq_laneq_f32(vaccCDEFn1, viCDEF, vw, 1);
102*4bdc9457SAndroid Build Coastguard Worker vaccGHIJn1 = vfmaq_laneq_f32(vaccGHIJn1, viGHIJ, vw, 1);
103*4bdc9457SAndroid Build Coastguard Worker vaccKLMNn1 = vfmaq_laneq_f32(vaccKLMNn1, viKLMN, vw, 1);
104*4bdc9457SAndroid Build Coastguard Worker vaccOPQRn1 = vfmaq_laneq_f32(vaccOPQRn1, viOPQR, vw, 1);
105*4bdc9457SAndroid Build Coastguard Worker vaccSTUVn1 = vfmaq_laneq_f32(vaccSTUVn1, viSTUV, vw, 1);
106*4bdc9457SAndroid Build Coastguard Worker vacc0123n2 = vfmaq_laneq_f32(vacc0123n2, vi0123, vw, 2);
107*4bdc9457SAndroid Build Coastguard Worker vacc4567n2 = vfmaq_laneq_f32(vacc4567n2, vi4567, vw, 2);
108*4bdc9457SAndroid Build Coastguard Worker vacc89ABn2 = vfmaq_laneq_f32(vacc89ABn2, vi89AB, vw, 2);
109*4bdc9457SAndroid Build Coastguard Worker vaccCDEFn2 = vfmaq_laneq_f32(vaccCDEFn2, viCDEF, vw, 2);
110*4bdc9457SAndroid Build Coastguard Worker vaccGHIJn2 = vfmaq_laneq_f32(vaccGHIJn2, viGHIJ, vw, 2);
111*4bdc9457SAndroid Build Coastguard Worker vaccKLMNn2 = vfmaq_laneq_f32(vaccKLMNn2, viKLMN, vw, 2);
112*4bdc9457SAndroid Build Coastguard Worker vaccOPQRn2 = vfmaq_laneq_f32(vaccOPQRn2, viOPQR, vw, 2);
113*4bdc9457SAndroid Build Coastguard Worker vaccSTUVn2 = vfmaq_laneq_f32(vaccSTUVn2, viSTUV, vw, 2);
114*4bdc9457SAndroid Build Coastguard Worker vacc0123n3 = vfmaq_laneq_f32(vacc0123n3, vi0123, vw, 3);
115*4bdc9457SAndroid Build Coastguard Worker vacc4567n3 = vfmaq_laneq_f32(vacc4567n3, vi4567, vw, 3);
116*4bdc9457SAndroid Build Coastguard Worker vacc89ABn3 = vfmaq_laneq_f32(vacc89ABn3, vi89AB, vw, 3);
117*4bdc9457SAndroid Build Coastguard Worker vaccCDEFn3 = vfmaq_laneq_f32(vaccCDEFn3, viCDEF, vw, 3);
118*4bdc9457SAndroid Build Coastguard Worker vaccGHIJn3 = vfmaq_laneq_f32(vaccGHIJn3, viGHIJ, vw, 3);
119*4bdc9457SAndroid Build Coastguard Worker vaccKLMNn3 = vfmaq_laneq_f32(vaccKLMNn3, viKLMN, vw, 3);
120*4bdc9457SAndroid Build Coastguard Worker vaccOPQRn3 = vfmaq_laneq_f32(vaccOPQRn3, viOPQR, vw, 3);
121*4bdc9457SAndroid Build Coastguard Worker vaccSTUVn3 = vfmaq_laneq_f32(vaccSTUVn3, viSTUV, vw, 3);
122*4bdc9457SAndroid Build Coastguard Worker } while (--nnz != 0);
123*4bdc9457SAndroid Build Coastguard Worker }
124*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout0123n0 = vminq_f32(vacc0123n0, vmax);
125*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout4567n0 = vminq_f32(vacc4567n0, vmax);
126*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout89ABn0 = vminq_f32(vacc89ABn0, vmax);
127*4bdc9457SAndroid Build Coastguard Worker float32x4_t voutCDEFn0 = vminq_f32(vaccCDEFn0, vmax);
128*4bdc9457SAndroid Build Coastguard Worker float32x4_t voutGHIJn0 = vminq_f32(vaccGHIJn0, vmax);
129*4bdc9457SAndroid Build Coastguard Worker float32x4_t voutKLMNn0 = vminq_f32(vaccKLMNn0, vmax);
130*4bdc9457SAndroid Build Coastguard Worker float32x4_t voutOPQRn0 = vminq_f32(vaccOPQRn0, vmax);
131*4bdc9457SAndroid Build Coastguard Worker float32x4_t voutSTUVn0 = vminq_f32(vaccSTUVn0, vmax);
132*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout0123n1 = vminq_f32(vacc0123n1, vmax);
133*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout4567n1 = vminq_f32(vacc4567n1, vmax);
134*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout89ABn1 = vminq_f32(vacc89ABn1, vmax);
135*4bdc9457SAndroid Build Coastguard Worker float32x4_t voutCDEFn1 = vminq_f32(vaccCDEFn1, vmax);
136*4bdc9457SAndroid Build Coastguard Worker float32x4_t voutGHIJn1 = vminq_f32(vaccGHIJn1, vmax);
137*4bdc9457SAndroid Build Coastguard Worker float32x4_t voutKLMNn1 = vminq_f32(vaccKLMNn1, vmax);
138*4bdc9457SAndroid Build Coastguard Worker float32x4_t voutOPQRn1 = vminq_f32(vaccOPQRn1, vmax);
139*4bdc9457SAndroid Build Coastguard Worker float32x4_t voutSTUVn1 = vminq_f32(vaccSTUVn1, vmax);
140*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout0123n2 = vminq_f32(vacc0123n2, vmax);
141*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout4567n2 = vminq_f32(vacc4567n2, vmax);
142*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout89ABn2 = vminq_f32(vacc89ABn2, vmax);
143*4bdc9457SAndroid Build Coastguard Worker float32x4_t voutCDEFn2 = vminq_f32(vaccCDEFn2, vmax);
144*4bdc9457SAndroid Build Coastguard Worker float32x4_t voutGHIJn2 = vminq_f32(vaccGHIJn2, vmax);
145*4bdc9457SAndroid Build Coastguard Worker float32x4_t voutKLMNn2 = vminq_f32(vaccKLMNn2, vmax);
146*4bdc9457SAndroid Build Coastguard Worker float32x4_t voutOPQRn2 = vminq_f32(vaccOPQRn2, vmax);
147*4bdc9457SAndroid Build Coastguard Worker float32x4_t voutSTUVn2 = vminq_f32(vaccSTUVn2, vmax);
148*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout0123n3 = vminq_f32(vacc0123n3, vmax);
149*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout4567n3 = vminq_f32(vacc4567n3, vmax);
150*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout89ABn3 = vminq_f32(vacc89ABn3, vmax);
151*4bdc9457SAndroid Build Coastguard Worker float32x4_t voutCDEFn3 = vminq_f32(vaccCDEFn3, vmax);
152*4bdc9457SAndroid Build Coastguard Worker float32x4_t voutGHIJn3 = vminq_f32(vaccGHIJn3, vmax);
153*4bdc9457SAndroid Build Coastguard Worker float32x4_t voutKLMNn3 = vminq_f32(vaccKLMNn3, vmax);
154*4bdc9457SAndroid Build Coastguard Worker float32x4_t voutOPQRn3 = vminq_f32(vaccOPQRn3, vmax);
155*4bdc9457SAndroid Build Coastguard Worker float32x4_t voutSTUVn3 = vminq_f32(vaccSTUVn3, vmax);
156*4bdc9457SAndroid Build Coastguard Worker
157*4bdc9457SAndroid Build Coastguard Worker vout0123n0 = vmaxq_f32(vout0123n0, vmin);
158*4bdc9457SAndroid Build Coastguard Worker vout4567n0 = vmaxq_f32(vout4567n0, vmin);
159*4bdc9457SAndroid Build Coastguard Worker vout89ABn0 = vmaxq_f32(vout89ABn0, vmin);
160*4bdc9457SAndroid Build Coastguard Worker voutCDEFn0 = vmaxq_f32(voutCDEFn0, vmin);
161*4bdc9457SAndroid Build Coastguard Worker voutGHIJn0 = vmaxq_f32(voutGHIJn0, vmin);
162*4bdc9457SAndroid Build Coastguard Worker voutKLMNn0 = vmaxq_f32(voutKLMNn0, vmin);
163*4bdc9457SAndroid Build Coastguard Worker voutOPQRn0 = vmaxq_f32(voutOPQRn0, vmin);
164*4bdc9457SAndroid Build Coastguard Worker voutSTUVn0 = vmaxq_f32(voutSTUVn0, vmin);
165*4bdc9457SAndroid Build Coastguard Worker vout0123n1 = vmaxq_f32(vout0123n1, vmin);
166*4bdc9457SAndroid Build Coastguard Worker vout4567n1 = vmaxq_f32(vout4567n1, vmin);
167*4bdc9457SAndroid Build Coastguard Worker vout89ABn1 = vmaxq_f32(vout89ABn1, vmin);
168*4bdc9457SAndroid Build Coastguard Worker voutCDEFn1 = vmaxq_f32(voutCDEFn1, vmin);
169*4bdc9457SAndroid Build Coastguard Worker voutGHIJn1 = vmaxq_f32(voutGHIJn1, vmin);
170*4bdc9457SAndroid Build Coastguard Worker voutKLMNn1 = vmaxq_f32(voutKLMNn1, vmin);
171*4bdc9457SAndroid Build Coastguard Worker voutOPQRn1 = vmaxq_f32(voutOPQRn1, vmin);
172*4bdc9457SAndroid Build Coastguard Worker voutSTUVn1 = vmaxq_f32(voutSTUVn1, vmin);
173*4bdc9457SAndroid Build Coastguard Worker vout0123n2 = vmaxq_f32(vout0123n2, vmin);
174*4bdc9457SAndroid Build Coastguard Worker vout4567n2 = vmaxq_f32(vout4567n2, vmin);
175*4bdc9457SAndroid Build Coastguard Worker vout89ABn2 = vmaxq_f32(vout89ABn2, vmin);
176*4bdc9457SAndroid Build Coastguard Worker voutCDEFn2 = vmaxq_f32(voutCDEFn2, vmin);
177*4bdc9457SAndroid Build Coastguard Worker voutGHIJn2 = vmaxq_f32(voutGHIJn2, vmin);
178*4bdc9457SAndroid Build Coastguard Worker voutKLMNn2 = vmaxq_f32(voutKLMNn2, vmin);
179*4bdc9457SAndroid Build Coastguard Worker voutOPQRn2 = vmaxq_f32(voutOPQRn2, vmin);
180*4bdc9457SAndroid Build Coastguard Worker voutSTUVn2 = vmaxq_f32(voutSTUVn2, vmin);
181*4bdc9457SAndroid Build Coastguard Worker vout0123n3 = vmaxq_f32(vout0123n3, vmin);
182*4bdc9457SAndroid Build Coastguard Worker vout4567n3 = vmaxq_f32(vout4567n3, vmin);
183*4bdc9457SAndroid Build Coastguard Worker vout89ABn3 = vmaxq_f32(vout89ABn3, vmin);
184*4bdc9457SAndroid Build Coastguard Worker voutCDEFn3 = vmaxq_f32(voutCDEFn3, vmin);
185*4bdc9457SAndroid Build Coastguard Worker voutGHIJn3 = vmaxq_f32(voutGHIJn3, vmin);
186*4bdc9457SAndroid Build Coastguard Worker voutKLMNn3 = vmaxq_f32(voutKLMNn3, vmin);
187*4bdc9457SAndroid Build Coastguard Worker voutOPQRn3 = vmaxq_f32(voutOPQRn3, vmin);
188*4bdc9457SAndroid Build Coastguard Worker voutSTUVn3 = vmaxq_f32(voutSTUVn3, vmin);
189*4bdc9457SAndroid Build Coastguard Worker
190*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 0, vout0123n0);
191*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 4, vout4567n0);
192*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 8, vout89ABn0);
193*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 12, voutCDEFn0);
194*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 16, voutGHIJn0);
195*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 20, voutKLMNn0);
196*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 24, voutOPQRn0);
197*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 28, voutSTUVn0);
198*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output + output_stride);
199*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 0, vout0123n1);
200*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 4, vout4567n1);
201*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 8, vout89ABn1);
202*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 12, voutCDEFn1);
203*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 16, voutGHIJn1);
204*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 20, voutKLMNn1);
205*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 24, voutOPQRn1);
206*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 28, voutSTUVn1);
207*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output + output_stride);
208*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 0, vout0123n2);
209*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 4, vout4567n2);
210*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 8, vout89ABn2);
211*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 12, voutCDEFn2);
212*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 16, voutGHIJn2);
213*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 20, voutKLMNn2);
214*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 24, voutOPQRn2);
215*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 28, voutSTUVn2);
216*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output + output_stride);
217*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 0, vout0123n3);
218*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 4, vout4567n3);
219*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 8, vout89ABn3);
220*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 12, voutCDEFn3);
221*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 16, voutGHIJn3);
222*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 20, voutKLMNn3);
223*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 24, voutOPQRn3);
224*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 28, voutSTUVn3);
225*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output + output_stride);
226*4bdc9457SAndroid Build Coastguard Worker n -= 4;
227*4bdc9457SAndroid Build Coastguard Worker }
228*4bdc9457SAndroid Build Coastguard Worker
229*4bdc9457SAndroid Build Coastguard Worker // clean up loop, fall back to nr=1
230*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
231*4bdc9457SAndroid Build Coastguard Worker do {
232*4bdc9457SAndroid Build Coastguard Worker uint32_t nnz = *nnzmap++;
233*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
234*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc4567 = vacc0123;
235*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc89AB = vacc0123;
236*4bdc9457SAndroid Build Coastguard Worker float32x4_t vaccCDEF = vacc0123;
237*4bdc9457SAndroid Build Coastguard Worker float32x4_t vaccGHIJ = vacc0123;
238*4bdc9457SAndroid Build Coastguard Worker float32x4_t vaccKLMN = vacc0123;
239*4bdc9457SAndroid Build Coastguard Worker float32x4_t vaccOPQR = vacc0123;
240*4bdc9457SAndroid Build Coastguard Worker float32x4_t vaccSTUV = vacc0123;
241*4bdc9457SAndroid Build Coastguard Worker if XNN_LIKELY(nnz != 0) {
242*4bdc9457SAndroid Build Coastguard Worker do {
243*4bdc9457SAndroid Build Coastguard Worker const intptr_t diff = *dmap++;
244*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vi0123 = vld1q_f32(input);
245*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vi4567 = vld1q_f32(input + 4);
246*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vi89AB = vld1q_f32(input + 8);
247*4bdc9457SAndroid Build Coastguard Worker const float32x4_t viCDEF = vld1q_f32(input + 12);
248*4bdc9457SAndroid Build Coastguard Worker const float32x4_t viGHIJ = vld1q_f32(input + 16);
249*4bdc9457SAndroid Build Coastguard Worker const float32x4_t viKLMN = vld1q_f32(input + 20);
250*4bdc9457SAndroid Build Coastguard Worker const float32x4_t viOPQR = vld1q_f32(input + 24);
251*4bdc9457SAndroid Build Coastguard Worker const float32x4_t viSTUV = vld1q_f32(input + 28);
252*4bdc9457SAndroid Build Coastguard Worker input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
253*4bdc9457SAndroid Build Coastguard Worker __builtin_prefetch(input + 16);
254*4bdc9457SAndroid Build Coastguard Worker __builtin_prefetch(input + 32);
255*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vw = vld1q_dup_f32(w); w += 1;
256*4bdc9457SAndroid Build Coastguard Worker __builtin_prefetch(w + 32);
257*4bdc9457SAndroid Build Coastguard Worker vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
258*4bdc9457SAndroid Build Coastguard Worker vacc4567 = vfmaq_f32(vacc4567, vi4567, vw);
259*4bdc9457SAndroid Build Coastguard Worker vacc89AB = vfmaq_f32(vacc89AB, vi89AB, vw);
260*4bdc9457SAndroid Build Coastguard Worker vaccCDEF = vfmaq_f32(vaccCDEF, viCDEF, vw);
261*4bdc9457SAndroid Build Coastguard Worker vaccGHIJ = vfmaq_f32(vaccGHIJ, viGHIJ, vw);
262*4bdc9457SAndroid Build Coastguard Worker vaccKLMN = vfmaq_f32(vaccKLMN, viKLMN, vw);
263*4bdc9457SAndroid Build Coastguard Worker vaccOPQR = vfmaq_f32(vaccOPQR, viOPQR, vw);
264*4bdc9457SAndroid Build Coastguard Worker vaccSTUV = vfmaq_f32(vaccSTUV, viSTUV, vw);
265*4bdc9457SAndroid Build Coastguard Worker } while (--nnz != 0);
266*4bdc9457SAndroid Build Coastguard Worker }
267*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout0123 = vminq_f32(vacc0123, vmax);
268*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout4567 = vminq_f32(vacc4567, vmax);
269*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout89AB = vminq_f32(vacc89AB, vmax);
270*4bdc9457SAndroid Build Coastguard Worker float32x4_t voutCDEF = vminq_f32(vaccCDEF, vmax);
271*4bdc9457SAndroid Build Coastguard Worker float32x4_t voutGHIJ = vminq_f32(vaccGHIJ, vmax);
272*4bdc9457SAndroid Build Coastguard Worker float32x4_t voutKLMN = vminq_f32(vaccKLMN, vmax);
273*4bdc9457SAndroid Build Coastguard Worker float32x4_t voutOPQR = vminq_f32(vaccOPQR, vmax);
274*4bdc9457SAndroid Build Coastguard Worker float32x4_t voutSTUV = vminq_f32(vaccSTUV, vmax);
275*4bdc9457SAndroid Build Coastguard Worker
276*4bdc9457SAndroid Build Coastguard Worker vout0123 = vmaxq_f32(vout0123, vmin);
277*4bdc9457SAndroid Build Coastguard Worker vout4567 = vmaxq_f32(vout4567, vmin);
278*4bdc9457SAndroid Build Coastguard Worker vout89AB = vmaxq_f32(vout89AB, vmin);
279*4bdc9457SAndroid Build Coastguard Worker voutCDEF = vmaxq_f32(voutCDEF, vmin);
280*4bdc9457SAndroid Build Coastguard Worker voutGHIJ = vmaxq_f32(voutGHIJ, vmin);
281*4bdc9457SAndroid Build Coastguard Worker voutKLMN = vmaxq_f32(voutKLMN, vmin);
282*4bdc9457SAndroid Build Coastguard Worker voutOPQR = vmaxq_f32(voutOPQR, vmin);
283*4bdc9457SAndroid Build Coastguard Worker voutSTUV = vmaxq_f32(voutSTUV, vmin);
284*4bdc9457SAndroid Build Coastguard Worker
285*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 0, vout0123);
286*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 4, vout4567);
287*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 8, vout89AB);
288*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 12, voutCDEF);
289*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 16, voutGHIJ);
290*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 20, voutKLMN);
291*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 24, voutOPQR);
292*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 28, voutSTUV);
293*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output + output_stride);
294*4bdc9457SAndroid Build Coastguard Worker n -= 1;
295*4bdc9457SAndroid Build Coastguard Worker } while (n != 0);
296*4bdc9457SAndroid Build Coastguard Worker }
297*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output - output_decrement);
298*4bdc9457SAndroid Build Coastguard Worker input += 32;
299*4bdc9457SAndroid Build Coastguard Worker mc -= 32 * sizeof(float);
300*4bdc9457SAndroid Build Coastguard Worker }
301*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(mc != 0) {
302*4bdc9457SAndroid Build Coastguard Worker output_decrement += 16 * sizeof(float);
303*4bdc9457SAndroid Build Coastguard Worker if (mc & (16 * sizeof(float))) {
304*4bdc9457SAndroid Build Coastguard Worker const float*restrict w = weights;
305*4bdc9457SAndroid Build Coastguard Worker const int32_t* dmap = widx_dmap;
306*4bdc9457SAndroid Build Coastguard Worker const uint32_t* nnzmap = nidx_nnzmap;
307*4bdc9457SAndroid Build Coastguard Worker size_t n = nc;
308*4bdc9457SAndroid Build Coastguard Worker while (n >= 4) {
309*4bdc9457SAndroid Build Coastguard Worker uint32_t nnz = *nnzmap++;
310*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc0123n0 = vld1q_dup_f32(w); w += 1;
311*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc4567n0 = vacc0123n0;
312*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc89ABn0 = vacc0123n0;
313*4bdc9457SAndroid Build Coastguard Worker float32x4_t vaccCDEFn0 = vacc0123n0;
314*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc0123n1 = vld1q_dup_f32(w); w += 1;
315*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc4567n1 = vacc0123n1;
316*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc89ABn1 = vacc0123n1;
317*4bdc9457SAndroid Build Coastguard Worker float32x4_t vaccCDEFn1 = vacc0123n1;
318*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc0123n2 = vld1q_dup_f32(w); w += 1;
319*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc4567n2 = vacc0123n2;
320*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc89ABn2 = vacc0123n2;
321*4bdc9457SAndroid Build Coastguard Worker float32x4_t vaccCDEFn2 = vacc0123n2;
322*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc0123n3 = vld1q_dup_f32(w); w += 1;
323*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc4567n3 = vacc0123n3;
324*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc89ABn3 = vacc0123n3;
325*4bdc9457SAndroid Build Coastguard Worker float32x4_t vaccCDEFn3 = vacc0123n3;
326*4bdc9457SAndroid Build Coastguard Worker if XNN_LIKELY(nnz != 0) {
327*4bdc9457SAndroid Build Coastguard Worker do {
328*4bdc9457SAndroid Build Coastguard Worker const intptr_t diff = *dmap++;
329*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vi0123 = vld1q_f32(input);
330*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vi4567 = vld1q_f32(input + 4);
331*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vi89AB = vld1q_f32(input + 8);
332*4bdc9457SAndroid Build Coastguard Worker const float32x4_t viCDEF = vld1q_f32(input + 12);
333*4bdc9457SAndroid Build Coastguard Worker input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
334*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vw = vld1q_f32(w); w += 4;
335*4bdc9457SAndroid Build Coastguard Worker
336*4bdc9457SAndroid Build Coastguard Worker vacc0123n0 = vfmaq_laneq_f32(vacc0123n0, vi0123, vw, 0);
337*4bdc9457SAndroid Build Coastguard Worker vacc4567n0 = vfmaq_laneq_f32(vacc4567n0, vi4567, vw, 0);
338*4bdc9457SAndroid Build Coastguard Worker vacc89ABn0 = vfmaq_laneq_f32(vacc89ABn0, vi89AB, vw, 0);
339*4bdc9457SAndroid Build Coastguard Worker vaccCDEFn0 = vfmaq_laneq_f32(vaccCDEFn0, viCDEF, vw, 0);
340*4bdc9457SAndroid Build Coastguard Worker vacc0123n1 = vfmaq_laneq_f32(vacc0123n1, vi0123, vw, 1);
341*4bdc9457SAndroid Build Coastguard Worker vacc4567n1 = vfmaq_laneq_f32(vacc4567n1, vi4567, vw, 1);
342*4bdc9457SAndroid Build Coastguard Worker vacc89ABn1 = vfmaq_laneq_f32(vacc89ABn1, vi89AB, vw, 1);
343*4bdc9457SAndroid Build Coastguard Worker vaccCDEFn1 = vfmaq_laneq_f32(vaccCDEFn1, viCDEF, vw, 1);
344*4bdc9457SAndroid Build Coastguard Worker vacc0123n2 = vfmaq_laneq_f32(vacc0123n2, vi0123, vw, 2);
345*4bdc9457SAndroid Build Coastguard Worker vacc4567n2 = vfmaq_laneq_f32(vacc4567n2, vi4567, vw, 2);
346*4bdc9457SAndroid Build Coastguard Worker vacc89ABn2 = vfmaq_laneq_f32(vacc89ABn2, vi89AB, vw, 2);
347*4bdc9457SAndroid Build Coastguard Worker vaccCDEFn2 = vfmaq_laneq_f32(vaccCDEFn2, viCDEF, vw, 2);
348*4bdc9457SAndroid Build Coastguard Worker vacc0123n3 = vfmaq_laneq_f32(vacc0123n3, vi0123, vw, 3);
349*4bdc9457SAndroid Build Coastguard Worker vacc4567n3 = vfmaq_laneq_f32(vacc4567n3, vi4567, vw, 3);
350*4bdc9457SAndroid Build Coastguard Worker vacc89ABn3 = vfmaq_laneq_f32(vacc89ABn3, vi89AB, vw, 3);
351*4bdc9457SAndroid Build Coastguard Worker vaccCDEFn3 = vfmaq_laneq_f32(vaccCDEFn3, viCDEF, vw, 3);
352*4bdc9457SAndroid Build Coastguard Worker } while (--nnz != 0);
353*4bdc9457SAndroid Build Coastguard Worker }
354*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout0123n0 = vminq_f32(vacc0123n0, vmax);
355*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout4567n0 = vminq_f32(vacc4567n0, vmax);
356*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout89ABn0 = vminq_f32(vacc89ABn0, vmax);
357*4bdc9457SAndroid Build Coastguard Worker float32x4_t voutCDEFn0 = vminq_f32(vaccCDEFn0, vmax);
358*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout0123n1 = vminq_f32(vacc0123n1, vmax);
359*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout4567n1 = vminq_f32(vacc4567n1, vmax);
360*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout89ABn1 = vminq_f32(vacc89ABn1, vmax);
361*4bdc9457SAndroid Build Coastguard Worker float32x4_t voutCDEFn1 = vminq_f32(vaccCDEFn1, vmax);
362*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout0123n2 = vminq_f32(vacc0123n2, vmax);
363*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout4567n2 = vminq_f32(vacc4567n2, vmax);
364*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout89ABn2 = vminq_f32(vacc89ABn2, vmax);
365*4bdc9457SAndroid Build Coastguard Worker float32x4_t voutCDEFn2 = vminq_f32(vaccCDEFn2, vmax);
366*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout0123n3 = vminq_f32(vacc0123n3, vmax);
367*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout4567n3 = vminq_f32(vacc4567n3, vmax);
368*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout89ABn3 = vminq_f32(vacc89ABn3, vmax);
369*4bdc9457SAndroid Build Coastguard Worker float32x4_t voutCDEFn3 = vminq_f32(vaccCDEFn3, vmax);
370*4bdc9457SAndroid Build Coastguard Worker
371*4bdc9457SAndroid Build Coastguard Worker vout0123n0 = vmaxq_f32(vout0123n0, vmin);
372*4bdc9457SAndroid Build Coastguard Worker vout4567n0 = vmaxq_f32(vout4567n0, vmin);
373*4bdc9457SAndroid Build Coastguard Worker vout89ABn0 = vmaxq_f32(vout89ABn0, vmin);
374*4bdc9457SAndroid Build Coastguard Worker voutCDEFn0 = vmaxq_f32(voutCDEFn0, vmin);
375*4bdc9457SAndroid Build Coastguard Worker vout0123n1 = vmaxq_f32(vout0123n1, vmin);
376*4bdc9457SAndroid Build Coastguard Worker vout4567n1 = vmaxq_f32(vout4567n1, vmin);
377*4bdc9457SAndroid Build Coastguard Worker vout89ABn1 = vmaxq_f32(vout89ABn1, vmin);
378*4bdc9457SAndroid Build Coastguard Worker voutCDEFn1 = vmaxq_f32(voutCDEFn1, vmin);
379*4bdc9457SAndroid Build Coastguard Worker vout0123n2 = vmaxq_f32(vout0123n2, vmin);
380*4bdc9457SAndroid Build Coastguard Worker vout4567n2 = vmaxq_f32(vout4567n2, vmin);
381*4bdc9457SAndroid Build Coastguard Worker vout89ABn2 = vmaxq_f32(vout89ABn2, vmin);
382*4bdc9457SAndroid Build Coastguard Worker voutCDEFn2 = vmaxq_f32(voutCDEFn2, vmin);
383*4bdc9457SAndroid Build Coastguard Worker vout0123n3 = vmaxq_f32(vout0123n3, vmin);
384*4bdc9457SAndroid Build Coastguard Worker vout4567n3 = vmaxq_f32(vout4567n3, vmin);
385*4bdc9457SAndroid Build Coastguard Worker vout89ABn3 = vmaxq_f32(vout89ABn3, vmin);
386*4bdc9457SAndroid Build Coastguard Worker voutCDEFn3 = vmaxq_f32(voutCDEFn3, vmin);
387*4bdc9457SAndroid Build Coastguard Worker
388*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 0, vout0123n0);
389*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 4, vout4567n0);
390*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 8, vout89ABn0);
391*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 12, voutCDEFn0);
392*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output + output_stride);
393*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 0, vout0123n1);
394*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 4, vout4567n1);
395*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 8, vout89ABn1);
396*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 12, voutCDEFn1);
397*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output + output_stride);
398*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 0, vout0123n2);
399*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 4, vout4567n2);
400*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 8, vout89ABn2);
401*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 12, voutCDEFn2);
402*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output + output_stride);
403*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 0, vout0123n3);
404*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 4, vout4567n3);
405*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 8, vout89ABn3);
406*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 12, voutCDEFn3);
407*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output + output_stride);
408*4bdc9457SAndroid Build Coastguard Worker n -= 4;
409*4bdc9457SAndroid Build Coastguard Worker }
410*4bdc9457SAndroid Build Coastguard Worker
411*4bdc9457SAndroid Build Coastguard Worker // clean up loop, fall back to nr=1
412*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
413*4bdc9457SAndroid Build Coastguard Worker do {
414*4bdc9457SAndroid Build Coastguard Worker uint32_t nnz = *nnzmap++;
415*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
416*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc4567 = vacc0123;
417*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc89AB = vacc0123;
418*4bdc9457SAndroid Build Coastguard Worker float32x4_t vaccCDEF = vacc0123;
419*4bdc9457SAndroid Build Coastguard Worker if XNN_LIKELY(nnz != 0) {
420*4bdc9457SAndroid Build Coastguard Worker do {
421*4bdc9457SAndroid Build Coastguard Worker const intptr_t diff = *dmap++;
422*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vi0123 = vld1q_f32(input);
423*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vi4567 = vld1q_f32(input + 4);
424*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vi89AB = vld1q_f32(input + 8);
425*4bdc9457SAndroid Build Coastguard Worker const float32x4_t viCDEF = vld1q_f32(input + 12);
426*4bdc9457SAndroid Build Coastguard Worker input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
427*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vw = vld1q_dup_f32(w); w += 1;
428*4bdc9457SAndroid Build Coastguard Worker vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
429*4bdc9457SAndroid Build Coastguard Worker vacc4567 = vfmaq_f32(vacc4567, vi4567, vw);
430*4bdc9457SAndroid Build Coastguard Worker vacc89AB = vfmaq_f32(vacc89AB, vi89AB, vw);
431*4bdc9457SAndroid Build Coastguard Worker vaccCDEF = vfmaq_f32(vaccCDEF, viCDEF, vw);
432*4bdc9457SAndroid Build Coastguard Worker } while (--nnz != 0);
433*4bdc9457SAndroid Build Coastguard Worker }
434*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout0123 = vminq_f32(vacc0123, vmax);
435*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout4567 = vminq_f32(vacc4567, vmax);
436*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout89AB = vminq_f32(vacc89AB, vmax);
437*4bdc9457SAndroid Build Coastguard Worker float32x4_t voutCDEF = vminq_f32(vaccCDEF, vmax);
438*4bdc9457SAndroid Build Coastguard Worker
439*4bdc9457SAndroid Build Coastguard Worker vout0123 = vmaxq_f32(vout0123, vmin);
440*4bdc9457SAndroid Build Coastguard Worker vout4567 = vmaxq_f32(vout4567, vmin);
441*4bdc9457SAndroid Build Coastguard Worker vout89AB = vmaxq_f32(vout89AB, vmin);
442*4bdc9457SAndroid Build Coastguard Worker voutCDEF = vmaxq_f32(voutCDEF, vmin);
443*4bdc9457SAndroid Build Coastguard Worker
444*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 0, vout0123);
445*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 4, vout4567);
446*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 8, vout89AB);
447*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 12, voutCDEF);
448*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output + output_stride);
449*4bdc9457SAndroid Build Coastguard Worker n -= 1;
450*4bdc9457SAndroid Build Coastguard Worker } while (n != 0);
451*4bdc9457SAndroid Build Coastguard Worker }
452*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output - output_decrement);
453*4bdc9457SAndroid Build Coastguard Worker input += 16;
454*4bdc9457SAndroid Build Coastguard Worker }
455*4bdc9457SAndroid Build Coastguard Worker output_decrement += 8 * sizeof(float);
456*4bdc9457SAndroid Build Coastguard Worker if (mc & (8 * sizeof(float))) {
457*4bdc9457SAndroid Build Coastguard Worker const float*restrict w = weights;
458*4bdc9457SAndroid Build Coastguard Worker const int32_t* dmap = widx_dmap;
459*4bdc9457SAndroid Build Coastguard Worker const uint32_t* nnzmap = nidx_nnzmap;
460*4bdc9457SAndroid Build Coastguard Worker size_t n = nc;
461*4bdc9457SAndroid Build Coastguard Worker while (n >= 4) {
462*4bdc9457SAndroid Build Coastguard Worker uint32_t nnz = *nnzmap++;
463*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc0123n0 = vld1q_dup_f32(w); w += 1;
464*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc4567n0 = vacc0123n0;
465*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc0123n1 = vld1q_dup_f32(w); w += 1;
466*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc4567n1 = vacc0123n1;
467*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc0123n2 = vld1q_dup_f32(w); w += 1;
468*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc4567n2 = vacc0123n2;
469*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc0123n3 = vld1q_dup_f32(w); w += 1;
470*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc4567n3 = vacc0123n3;
471*4bdc9457SAndroid Build Coastguard Worker if XNN_LIKELY(nnz != 0) {
472*4bdc9457SAndroid Build Coastguard Worker do {
473*4bdc9457SAndroid Build Coastguard Worker const intptr_t diff = *dmap++;
474*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vi0123 = vld1q_f32(input);
475*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vi4567 = vld1q_f32(input + 4);
476*4bdc9457SAndroid Build Coastguard Worker input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
477*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vw = vld1q_f32(w); w += 4;
478*4bdc9457SAndroid Build Coastguard Worker
479*4bdc9457SAndroid Build Coastguard Worker vacc0123n0 = vfmaq_laneq_f32(vacc0123n0, vi0123, vw, 0);
480*4bdc9457SAndroid Build Coastguard Worker vacc4567n0 = vfmaq_laneq_f32(vacc4567n0, vi4567, vw, 0);
481*4bdc9457SAndroid Build Coastguard Worker vacc0123n1 = vfmaq_laneq_f32(vacc0123n1, vi0123, vw, 1);
482*4bdc9457SAndroid Build Coastguard Worker vacc4567n1 = vfmaq_laneq_f32(vacc4567n1, vi4567, vw, 1);
483*4bdc9457SAndroid Build Coastguard Worker vacc0123n2 = vfmaq_laneq_f32(vacc0123n2, vi0123, vw, 2);
484*4bdc9457SAndroid Build Coastguard Worker vacc4567n2 = vfmaq_laneq_f32(vacc4567n2, vi4567, vw, 2);
485*4bdc9457SAndroid Build Coastguard Worker vacc0123n3 = vfmaq_laneq_f32(vacc0123n3, vi0123, vw, 3);
486*4bdc9457SAndroid Build Coastguard Worker vacc4567n3 = vfmaq_laneq_f32(vacc4567n3, vi4567, vw, 3);
487*4bdc9457SAndroid Build Coastguard Worker } while (--nnz != 0);
488*4bdc9457SAndroid Build Coastguard Worker }
489*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout0123n0 = vminq_f32(vacc0123n0, vmax);
490*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout4567n0 = vminq_f32(vacc4567n0, vmax);
491*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout0123n1 = vminq_f32(vacc0123n1, vmax);
492*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout4567n1 = vminq_f32(vacc4567n1, vmax);
493*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout0123n2 = vminq_f32(vacc0123n2, vmax);
494*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout4567n2 = vminq_f32(vacc4567n2, vmax);
495*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout0123n3 = vminq_f32(vacc0123n3, vmax);
496*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout4567n3 = vminq_f32(vacc4567n3, vmax);
497*4bdc9457SAndroid Build Coastguard Worker
498*4bdc9457SAndroid Build Coastguard Worker vout0123n0 = vmaxq_f32(vout0123n0, vmin);
499*4bdc9457SAndroid Build Coastguard Worker vout4567n0 = vmaxq_f32(vout4567n0, vmin);
500*4bdc9457SAndroid Build Coastguard Worker vout0123n1 = vmaxq_f32(vout0123n1, vmin);
501*4bdc9457SAndroid Build Coastguard Worker vout4567n1 = vmaxq_f32(vout4567n1, vmin);
502*4bdc9457SAndroid Build Coastguard Worker vout0123n2 = vmaxq_f32(vout0123n2, vmin);
503*4bdc9457SAndroid Build Coastguard Worker vout4567n2 = vmaxq_f32(vout4567n2, vmin);
504*4bdc9457SAndroid Build Coastguard Worker vout0123n3 = vmaxq_f32(vout0123n3, vmin);
505*4bdc9457SAndroid Build Coastguard Worker vout4567n3 = vmaxq_f32(vout4567n3, vmin);
506*4bdc9457SAndroid Build Coastguard Worker
507*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 0, vout0123n0);
508*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 4, vout4567n0);
509*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output + output_stride);
510*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 0, vout0123n1);
511*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 4, vout4567n1);
512*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output + output_stride);
513*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 0, vout0123n2);
514*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 4, vout4567n2);
515*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output + output_stride);
516*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 0, vout0123n3);
517*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 4, vout4567n3);
518*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output + output_stride);
519*4bdc9457SAndroid Build Coastguard Worker n -= 4;
520*4bdc9457SAndroid Build Coastguard Worker }
521*4bdc9457SAndroid Build Coastguard Worker
522*4bdc9457SAndroid Build Coastguard Worker // clean up loop, fall back to nr=1
523*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
524*4bdc9457SAndroid Build Coastguard Worker do {
525*4bdc9457SAndroid Build Coastguard Worker uint32_t nnz = *nnzmap++;
526*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
527*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc4567 = vacc0123;
528*4bdc9457SAndroid Build Coastguard Worker if XNN_LIKELY(nnz != 0) {
529*4bdc9457SAndroid Build Coastguard Worker do {
530*4bdc9457SAndroid Build Coastguard Worker const intptr_t diff = *dmap++;
531*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vi0123 = vld1q_f32(input);
532*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vi4567 = vld1q_f32(input + 4);
533*4bdc9457SAndroid Build Coastguard Worker input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
534*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vw = vld1q_dup_f32(w); w += 1;
535*4bdc9457SAndroid Build Coastguard Worker vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
536*4bdc9457SAndroid Build Coastguard Worker vacc4567 = vfmaq_f32(vacc4567, vi4567, vw);
537*4bdc9457SAndroid Build Coastguard Worker } while (--nnz != 0);
538*4bdc9457SAndroid Build Coastguard Worker }
539*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout0123 = vminq_f32(vacc0123, vmax);
540*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout4567 = vminq_f32(vacc4567, vmax);
541*4bdc9457SAndroid Build Coastguard Worker
542*4bdc9457SAndroid Build Coastguard Worker vout0123 = vmaxq_f32(vout0123, vmin);
543*4bdc9457SAndroid Build Coastguard Worker vout4567 = vmaxq_f32(vout4567, vmin);
544*4bdc9457SAndroid Build Coastguard Worker
545*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 0, vout0123);
546*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 4, vout4567);
547*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output + output_stride);
548*4bdc9457SAndroid Build Coastguard Worker n -= 1;
549*4bdc9457SAndroid Build Coastguard Worker } while (n != 0);
550*4bdc9457SAndroid Build Coastguard Worker }
551*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output - output_decrement);
552*4bdc9457SAndroid Build Coastguard Worker input += 8;
553*4bdc9457SAndroid Build Coastguard Worker }
554*4bdc9457SAndroid Build Coastguard Worker output_decrement += 4 * sizeof(float);
555*4bdc9457SAndroid Build Coastguard Worker if (mc & (4 * sizeof(float))) {
556*4bdc9457SAndroid Build Coastguard Worker const float*restrict w = weights;
557*4bdc9457SAndroid Build Coastguard Worker const int32_t* dmap = widx_dmap;
558*4bdc9457SAndroid Build Coastguard Worker const uint32_t* nnzmap = nidx_nnzmap;
559*4bdc9457SAndroid Build Coastguard Worker size_t n = nc;
560*4bdc9457SAndroid Build Coastguard Worker while (n >= 4) {
561*4bdc9457SAndroid Build Coastguard Worker uint32_t nnz = *nnzmap++;
562*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc0123n0 = vld1q_dup_f32(w); w += 1;
563*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc0123n1 = vld1q_dup_f32(w); w += 1;
564*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc0123n2 = vld1q_dup_f32(w); w += 1;
565*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc0123n3 = vld1q_dup_f32(w); w += 1;
566*4bdc9457SAndroid Build Coastguard Worker if XNN_LIKELY(nnz != 0) {
567*4bdc9457SAndroid Build Coastguard Worker do {
568*4bdc9457SAndroid Build Coastguard Worker const intptr_t diff = *dmap++;
569*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vi0123 = vld1q_f32(input);
570*4bdc9457SAndroid Build Coastguard Worker input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
571*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vw = vld1q_f32(w); w += 4;
572*4bdc9457SAndroid Build Coastguard Worker
573*4bdc9457SAndroid Build Coastguard Worker vacc0123n0 = vfmaq_laneq_f32(vacc0123n0, vi0123, vw, 0);
574*4bdc9457SAndroid Build Coastguard Worker vacc0123n1 = vfmaq_laneq_f32(vacc0123n1, vi0123, vw, 1);
575*4bdc9457SAndroid Build Coastguard Worker vacc0123n2 = vfmaq_laneq_f32(vacc0123n2, vi0123, vw, 2);
576*4bdc9457SAndroid Build Coastguard Worker vacc0123n3 = vfmaq_laneq_f32(vacc0123n3, vi0123, vw, 3);
577*4bdc9457SAndroid Build Coastguard Worker } while (--nnz != 0);
578*4bdc9457SAndroid Build Coastguard Worker }
579*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout0123n0 = vminq_f32(vacc0123n0, vmax);
580*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout0123n1 = vminq_f32(vacc0123n1, vmax);
581*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout0123n2 = vminq_f32(vacc0123n2, vmax);
582*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout0123n3 = vminq_f32(vacc0123n3, vmax);
583*4bdc9457SAndroid Build Coastguard Worker
584*4bdc9457SAndroid Build Coastguard Worker vout0123n0 = vmaxq_f32(vout0123n0, vmin);
585*4bdc9457SAndroid Build Coastguard Worker vout0123n1 = vmaxq_f32(vout0123n1, vmin);
586*4bdc9457SAndroid Build Coastguard Worker vout0123n2 = vmaxq_f32(vout0123n2, vmin);
587*4bdc9457SAndroid Build Coastguard Worker vout0123n3 = vmaxq_f32(vout0123n3, vmin);
588*4bdc9457SAndroid Build Coastguard Worker
589*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 0, vout0123n0);
590*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output + output_stride);
591*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 0, vout0123n1);
592*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output + output_stride);
593*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 0, vout0123n2);
594*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output + output_stride);
595*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 0, vout0123n3);
596*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output + output_stride);
597*4bdc9457SAndroid Build Coastguard Worker n -= 4;
598*4bdc9457SAndroid Build Coastguard Worker }
599*4bdc9457SAndroid Build Coastguard Worker
600*4bdc9457SAndroid Build Coastguard Worker // clean up loop, fall back to nr=1
601*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
602*4bdc9457SAndroid Build Coastguard Worker do {
603*4bdc9457SAndroid Build Coastguard Worker uint32_t nnz = *nnzmap++;
604*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
605*4bdc9457SAndroid Build Coastguard Worker if XNN_LIKELY(nnz != 0) {
606*4bdc9457SAndroid Build Coastguard Worker do {
607*4bdc9457SAndroid Build Coastguard Worker const intptr_t diff = *dmap++;
608*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vi0123 = vld1q_f32(input);
609*4bdc9457SAndroid Build Coastguard Worker input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
610*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vw = vld1q_dup_f32(w); w += 1;
611*4bdc9457SAndroid Build Coastguard Worker vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
612*4bdc9457SAndroid Build Coastguard Worker } while (--nnz != 0);
613*4bdc9457SAndroid Build Coastguard Worker }
614*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout0123 = vminq_f32(vacc0123, vmax);
615*4bdc9457SAndroid Build Coastguard Worker
616*4bdc9457SAndroid Build Coastguard Worker vout0123 = vmaxq_f32(vout0123, vmin);
617*4bdc9457SAndroid Build Coastguard Worker
618*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + 0, vout0123);
619*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output + output_stride);
620*4bdc9457SAndroid Build Coastguard Worker n -= 1;
621*4bdc9457SAndroid Build Coastguard Worker } while (n != 0);
622*4bdc9457SAndroid Build Coastguard Worker }
623*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output - output_decrement);
624*4bdc9457SAndroid Build Coastguard Worker input += 4;
625*4bdc9457SAndroid Build Coastguard Worker }
626*4bdc9457SAndroid Build Coastguard Worker output_decrement += 2 * sizeof(float);
627*4bdc9457SAndroid Build Coastguard Worker if (mc & (2 * sizeof(float))) {
628*4bdc9457SAndroid Build Coastguard Worker const float*restrict w = weights;
629*4bdc9457SAndroid Build Coastguard Worker const int32_t* dmap = widx_dmap;
630*4bdc9457SAndroid Build Coastguard Worker const uint32_t* nnzmap = nidx_nnzmap;
631*4bdc9457SAndroid Build Coastguard Worker size_t n = nc;
632*4bdc9457SAndroid Build Coastguard Worker while (n >= 4) {
633*4bdc9457SAndroid Build Coastguard Worker uint32_t nnz = *nnzmap++;
634*4bdc9457SAndroid Build Coastguard Worker float32x2_t vacc01n0 = vld1_dup_f32(w); w += 1;
635*4bdc9457SAndroid Build Coastguard Worker float32x2_t vacc01n1 = vld1_dup_f32(w); w += 1;
636*4bdc9457SAndroid Build Coastguard Worker float32x2_t vacc01n2 = vld1_dup_f32(w); w += 1;
637*4bdc9457SAndroid Build Coastguard Worker float32x2_t vacc01n3 = vld1_dup_f32(w); w += 1;
638*4bdc9457SAndroid Build Coastguard Worker if XNN_LIKELY(nnz != 0) {
639*4bdc9457SAndroid Build Coastguard Worker do {
640*4bdc9457SAndroid Build Coastguard Worker const intptr_t diff = *dmap++;
641*4bdc9457SAndroid Build Coastguard Worker const float32x2_t vi01 = vld1_f32(input);
642*4bdc9457SAndroid Build Coastguard Worker input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
643*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vw = vld1q_f32(w); w += 4;
644*4bdc9457SAndroid Build Coastguard Worker
645*4bdc9457SAndroid Build Coastguard Worker vacc01n0 = vfma_laneq_f32(vacc01n0, vi01, vw, 0);
646*4bdc9457SAndroid Build Coastguard Worker vacc01n1 = vfma_laneq_f32(vacc01n1, vi01, vw, 1);
647*4bdc9457SAndroid Build Coastguard Worker vacc01n2 = vfma_laneq_f32(vacc01n2, vi01, vw, 2);
648*4bdc9457SAndroid Build Coastguard Worker vacc01n3 = vfma_laneq_f32(vacc01n3, vi01, vw, 3);
649*4bdc9457SAndroid Build Coastguard Worker } while (--nnz != 0);
650*4bdc9457SAndroid Build Coastguard Worker }
651*4bdc9457SAndroid Build Coastguard Worker float32x2_t vout01n0 = vmin_f32(vacc01n0, vget_low_f32(vmax));
652*4bdc9457SAndroid Build Coastguard Worker float32x2_t vout01n1 = vmin_f32(vacc01n1, vget_low_f32(vmax));
653*4bdc9457SAndroid Build Coastguard Worker float32x2_t vout01n2 = vmin_f32(vacc01n2, vget_low_f32(vmax));
654*4bdc9457SAndroid Build Coastguard Worker float32x2_t vout01n3 = vmin_f32(vacc01n3, vget_low_f32(vmax));
655*4bdc9457SAndroid Build Coastguard Worker
656*4bdc9457SAndroid Build Coastguard Worker vout01n0 = vmax_f32(vout01n0, vget_low_f32(vmin));
657*4bdc9457SAndroid Build Coastguard Worker vout01n1 = vmax_f32(vout01n1, vget_low_f32(vmin));
658*4bdc9457SAndroid Build Coastguard Worker vout01n2 = vmax_f32(vout01n2, vget_low_f32(vmin));
659*4bdc9457SAndroid Build Coastguard Worker vout01n3 = vmax_f32(vout01n3, vget_low_f32(vmin));
660*4bdc9457SAndroid Build Coastguard Worker
661*4bdc9457SAndroid Build Coastguard Worker vst1_f32(output + 0, vout01n0);
662*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output + output_stride);
663*4bdc9457SAndroid Build Coastguard Worker vst1_f32(output + 0, vout01n1);
664*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output + output_stride);
665*4bdc9457SAndroid Build Coastguard Worker vst1_f32(output + 0, vout01n2);
666*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output + output_stride);
667*4bdc9457SAndroid Build Coastguard Worker vst1_f32(output + 0, vout01n3);
668*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output + output_stride);
669*4bdc9457SAndroid Build Coastguard Worker n -= 4;
670*4bdc9457SAndroid Build Coastguard Worker }
671*4bdc9457SAndroid Build Coastguard Worker
672*4bdc9457SAndroid Build Coastguard Worker // clean up loop, fall back to nr=1
673*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
674*4bdc9457SAndroid Build Coastguard Worker do {
675*4bdc9457SAndroid Build Coastguard Worker uint32_t nnz = *nnzmap++;
676*4bdc9457SAndroid Build Coastguard Worker float32x2_t vacc01 = vld1_dup_f32(w); w += 1;
677*4bdc9457SAndroid Build Coastguard Worker if XNN_LIKELY(nnz != 0) {
678*4bdc9457SAndroid Build Coastguard Worker do {
679*4bdc9457SAndroid Build Coastguard Worker const intptr_t diff = *dmap++;
680*4bdc9457SAndroid Build Coastguard Worker const float32x2_t vi01 = vld1_f32(input);
681*4bdc9457SAndroid Build Coastguard Worker input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
682*4bdc9457SAndroid Build Coastguard Worker const float32x2_t vw = vld1_dup_f32(w); w += 1;
683*4bdc9457SAndroid Build Coastguard Worker vacc01 = vfma_f32(vacc01, vi01, vw);
684*4bdc9457SAndroid Build Coastguard Worker } while (--nnz != 0);
685*4bdc9457SAndroid Build Coastguard Worker }
686*4bdc9457SAndroid Build Coastguard Worker float32x2_t vout01 = vmin_f32(vacc01, vget_low_f32(vmax));
687*4bdc9457SAndroid Build Coastguard Worker vout01 = vmax_f32(vout01, vget_low_f32(vmin));
688*4bdc9457SAndroid Build Coastguard Worker
689*4bdc9457SAndroid Build Coastguard Worker vst1_f32(output, vout01);
690*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output + output_stride);
691*4bdc9457SAndroid Build Coastguard Worker n -= 1;
692*4bdc9457SAndroid Build Coastguard Worker } while (n != 0);
693*4bdc9457SAndroid Build Coastguard Worker }
694*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output - output_decrement);
695*4bdc9457SAndroid Build Coastguard Worker input += 2;
696*4bdc9457SAndroid Build Coastguard Worker }
697*4bdc9457SAndroid Build Coastguard Worker output_decrement += 1 * sizeof(float);
698*4bdc9457SAndroid Build Coastguard Worker if (mc & (1 * sizeof(float))) {
699*4bdc9457SAndroid Build Coastguard Worker const float*restrict w = weights;
700*4bdc9457SAndroid Build Coastguard Worker const int32_t* dmap = widx_dmap;
701*4bdc9457SAndroid Build Coastguard Worker const uint32_t* nnzmap = nidx_nnzmap;
702*4bdc9457SAndroid Build Coastguard Worker size_t n = nc;
703*4bdc9457SAndroid Build Coastguard Worker while (n >= 4) {
704*4bdc9457SAndroid Build Coastguard Worker uint32_t nnz = *nnzmap++;
705*4bdc9457SAndroid Build Coastguard Worker float32x2_t vacc0n0 = vld1_dup_f32(w); w += 1;
706*4bdc9457SAndroid Build Coastguard Worker float32x2_t vacc0n1 = vld1_dup_f32(w); w += 1;
707*4bdc9457SAndroid Build Coastguard Worker float32x2_t vacc0n2 = vld1_dup_f32(w); w += 1;
708*4bdc9457SAndroid Build Coastguard Worker float32x2_t vacc0n3 = vld1_dup_f32(w); w += 1;
709*4bdc9457SAndroid Build Coastguard Worker if XNN_LIKELY(nnz != 0) {
710*4bdc9457SAndroid Build Coastguard Worker do {
711*4bdc9457SAndroid Build Coastguard Worker const intptr_t diff = *dmap++;
712*4bdc9457SAndroid Build Coastguard Worker const float32x2_t vi0 = vld1_dup_f32(input);
713*4bdc9457SAndroid Build Coastguard Worker input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
714*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vw = vld1q_f32(w); w += 4;
715*4bdc9457SAndroid Build Coastguard Worker
716*4bdc9457SAndroid Build Coastguard Worker vacc0n0 = vfma_laneq_f32(vacc0n0, vi0, vw, 0);
717*4bdc9457SAndroid Build Coastguard Worker vacc0n1 = vfma_laneq_f32(vacc0n1, vi0, vw, 1);
718*4bdc9457SAndroid Build Coastguard Worker vacc0n2 = vfma_laneq_f32(vacc0n2, vi0, vw, 2);
719*4bdc9457SAndroid Build Coastguard Worker vacc0n3 = vfma_laneq_f32(vacc0n3, vi0, vw, 3);
720*4bdc9457SAndroid Build Coastguard Worker } while (--nnz != 0);
721*4bdc9457SAndroid Build Coastguard Worker }
722*4bdc9457SAndroid Build Coastguard Worker float32x2_t vout0n0 = vmin_f32(vacc0n0, vget_low_f32(vmax));
723*4bdc9457SAndroid Build Coastguard Worker float32x2_t vout0n1 = vmin_f32(vacc0n1, vget_low_f32(vmax));
724*4bdc9457SAndroid Build Coastguard Worker float32x2_t vout0n2 = vmin_f32(vacc0n2, vget_low_f32(vmax));
725*4bdc9457SAndroid Build Coastguard Worker float32x2_t vout0n3 = vmin_f32(vacc0n3, vget_low_f32(vmax));
726*4bdc9457SAndroid Build Coastguard Worker
727*4bdc9457SAndroid Build Coastguard Worker vout0n0 = vmax_f32(vout0n0, vget_low_f32(vmin));
728*4bdc9457SAndroid Build Coastguard Worker vout0n1 = vmax_f32(vout0n1, vget_low_f32(vmin));
729*4bdc9457SAndroid Build Coastguard Worker vout0n2 = vmax_f32(vout0n2, vget_low_f32(vmin));
730*4bdc9457SAndroid Build Coastguard Worker vout0n3 = vmax_f32(vout0n3, vget_low_f32(vmin));
731*4bdc9457SAndroid Build Coastguard Worker
732*4bdc9457SAndroid Build Coastguard Worker vst1_lane_f32(output + 0, vout0n0, 0);
733*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output + output_stride);
734*4bdc9457SAndroid Build Coastguard Worker vst1_lane_f32(output + 0, vout0n1, 0);
735*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output + output_stride);
736*4bdc9457SAndroid Build Coastguard Worker vst1_lane_f32(output + 0, vout0n2, 0);
737*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output + output_stride);
738*4bdc9457SAndroid Build Coastguard Worker vst1_lane_f32(output + 0, vout0n3, 0);
739*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output + output_stride);
740*4bdc9457SAndroid Build Coastguard Worker n -= 4;
741*4bdc9457SAndroid Build Coastguard Worker }
742*4bdc9457SAndroid Build Coastguard Worker
743*4bdc9457SAndroid Build Coastguard Worker // clean up loop, fall back to nr=1
744*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
745*4bdc9457SAndroid Build Coastguard Worker do {
746*4bdc9457SAndroid Build Coastguard Worker uint32_t nnz = *nnzmap++;
747*4bdc9457SAndroid Build Coastguard Worker float32x2_t vacc0 = vld1_dup_f32(w); w += 1;
748*4bdc9457SAndroid Build Coastguard Worker if XNN_LIKELY(nnz != 0) {
749*4bdc9457SAndroid Build Coastguard Worker do {
750*4bdc9457SAndroid Build Coastguard Worker const intptr_t diff = *dmap++;
751*4bdc9457SAndroid Build Coastguard Worker const float32x2_t vi0 = vld1_dup_f32(input);
752*4bdc9457SAndroid Build Coastguard Worker input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
753*4bdc9457SAndroid Build Coastguard Worker const float32x2_t vw = vld1_dup_f32(w); w += 1;
754*4bdc9457SAndroid Build Coastguard Worker vacc0 = vfma_f32(vacc0, vi0, vw);
755*4bdc9457SAndroid Build Coastguard Worker } while (--nnz != 0);
756*4bdc9457SAndroid Build Coastguard Worker }
757*4bdc9457SAndroid Build Coastguard Worker float32x2_t vout0 = vmin_f32(vacc0, vget_low_f32(vmax));
758*4bdc9457SAndroid Build Coastguard Worker vout0 = vmax_f32(vout0, vget_low_f32(vmin));
759*4bdc9457SAndroid Build Coastguard Worker
760*4bdc9457SAndroid Build Coastguard Worker vst1_lane_f32(output, vout0, 1);
761*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output + output_stride);
762*4bdc9457SAndroid Build Coastguard Worker n -= 1;
763*4bdc9457SAndroid Build Coastguard Worker } while (n != 0);
764*4bdc9457SAndroid Build Coastguard Worker }
765*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output - output_decrement);
766*4bdc9457SAndroid Build Coastguard Worker input += 1;
767*4bdc9457SAndroid Build Coastguard Worker }
768*4bdc9457SAndroid Build Coastguard Worker }
769*4bdc9457SAndroid Build Coastguard Worker }
770