xref: /aosp_15_r20/external/XNNPACK/src/f32-spmm/neon-blocked.c.in (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5*4bdc9457SAndroid Build Coastguard Worker
6*4bdc9457SAndroid Build Coastguard Worker$assert MR % 4 == 0
7*4bdc9457SAndroid Build Coastguard Worker$assert NR in [1, 2, 4]
8*4bdc9457SAndroid Build Coastguard Worker$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
9*4bdc9457SAndroid Build Coastguard Worker#include <assert.h>
10*4bdc9457SAndroid Build Coastguard Worker
11*4bdc9457SAndroid Build Coastguard Worker#include <arm_neon.h>
12*4bdc9457SAndroid Build Coastguard Worker
13*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/spmm.h>
14*4bdc9457SAndroid Build Coastguard Worker
15*4bdc9457SAndroid Build Coastguard Worker
16*4bdc9457SAndroid Build Coastguard Workervoid xnn_f32_spmm_minmax_ukernel_${MR}x${NR}__${"neonfma" if FMA else "neon"}(
17*4bdc9457SAndroid Build Coastguard Worker    size_t mc,
18*4bdc9457SAndroid Build Coastguard Worker    size_t nc,
19*4bdc9457SAndroid Build Coastguard Worker    const float*restrict input,
20*4bdc9457SAndroid Build Coastguard Worker    const float*restrict weights,
21*4bdc9457SAndroid Build Coastguard Worker    const int32_t*restrict widx_dmap,
22*4bdc9457SAndroid Build Coastguard Worker    const uint32_t*restrict nidx_nnzmap,
23*4bdc9457SAndroid Build Coastguard Worker    float*restrict output,
24*4bdc9457SAndroid Build Coastguard Worker    size_t output_stride,
25*4bdc9457SAndroid Build Coastguard Worker    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
26*4bdc9457SAndroid Build Coastguard Worker{
27*4bdc9457SAndroid Build Coastguard Worker  assert(mc != 0);
28*4bdc9457SAndroid Build Coastguard Worker  assert(mc % sizeof(float) == 0);
29*4bdc9457SAndroid Build Coastguard Worker  assert(nc != 0);
30*4bdc9457SAndroid Build Coastguard Worker
31*4bdc9457SAndroid Build Coastguard Worker  const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
32*4bdc9457SAndroid Build Coastguard Worker  const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
33*4bdc9457SAndroid Build Coastguard Worker  size_t output_decrement = output_stride * nc - ${MR} * sizeof(float);
34*4bdc9457SAndroid Build Coastguard Worker  while XNN_LIKELY(mc >= ${MR} * sizeof(float)) {
35*4bdc9457SAndroid Build Coastguard Worker    const float*restrict w = weights;
36*4bdc9457SAndroid Build Coastguard Worker    const int32_t* dmap = widx_dmap;
37*4bdc9457SAndroid Build Coastguard Worker    const uint32_t* nnzmap = nidx_nnzmap;
38*4bdc9457SAndroid Build Coastguard Worker    size_t n = nc;
39*4bdc9457SAndroid Build Coastguard Worker    while (n >= ${NR}) {
40*4bdc9457SAndroid Build Coastguard Worker      uint32_t nnz = *nnzmap++;
41*4bdc9457SAndroid Build Coastguard Worker      $for N in range(0, NR, 1):
42*4bdc9457SAndroid Build Coastguard Worker        float32x4_t vacc${ABC[0:4]}n${N} = vld1q_dup_f32(w); w += 1;
43*4bdc9457SAndroid Build Coastguard Worker        $for M in range(4, MR, 4):
44*4bdc9457SAndroid Build Coastguard Worker          float32x4_t vacc${ABC[M:M+4]}n${N} = vacc${ABC[0:4]}n${N};
45*4bdc9457SAndroid Build Coastguard Worker      if XNN_LIKELY(nnz != 0) {
46*4bdc9457SAndroid Build Coastguard Worker        do {
47*4bdc9457SAndroid Build Coastguard Worker          const intptr_t diff = *dmap++;
48*4bdc9457SAndroid Build Coastguard Worker          const float32x4_t vi${ABC[0:4]} = vld1q_f32(input);
49*4bdc9457SAndroid Build Coastguard Worker          $for M in range(4, MR, 4):
50*4bdc9457SAndroid Build Coastguard Worker            const float32x4_t vi${ABC[M:M+4]} = vld1q_f32(input + ${M});
51*4bdc9457SAndroid Build Coastguard Worker          input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
52*4bdc9457SAndroid Build Coastguard Worker          $for M in range(0, MR, 16):
53*4bdc9457SAndroid Build Coastguard Worker            __builtin_prefetch(input + ${M+16});
54*4bdc9457SAndroid Build Coastguard Worker          $if NR == 1:
55*4bdc9457SAndroid Build Coastguard Worker            const float32x4_t vw = vld1q_dup_f32(w); w += 1;
56*4bdc9457SAndroid Build Coastguard Worker          $elif NR == 2:
57*4bdc9457SAndroid Build Coastguard Worker            const float32x2_t vw = vld1_f32(w); w += 2;
58*4bdc9457SAndroid Build Coastguard Worker          $elif NR == 4:
59*4bdc9457SAndroid Build Coastguard Worker            const float32x4_t vw = vld1q_f32(w); w += 4;
60*4bdc9457SAndroid Build Coastguard Worker          __builtin_prefetch(w + 32);
61*4bdc9457SAndroid Build Coastguard Worker          $if NR == 1:
62*4bdc9457SAndroid Build Coastguard Worker            $for M in range(0, MR, 4):
63*4bdc9457SAndroid Build Coastguard Worker              vacc${ABC[M:M+4]}c0 = vfmaq_f32(vacc${ABC[M:M+4]}c0, vi${ABC[M:M+4]}, vw);
64*4bdc9457SAndroid Build Coastguard Worker          $else:
65*4bdc9457SAndroid Build Coastguard Worker            $for N in range(NR):
66*4bdc9457SAndroid Build Coastguard Worker              $for M in range(0, MR, 4):
67*4bdc9457SAndroid Build Coastguard Worker                vacc${ABC[M:M+4]}n${N} = vfmaq_lane${"q" if NR == 4 else ""}_f32(vacc${ABC[M:M+4]}n${N}, vi${ABC[M:M+4]}, vw, ${N});
68*4bdc9457SAndroid Build Coastguard Worker        } while (--nnz != 0);
69*4bdc9457SAndroid Build Coastguard Worker      }
70*4bdc9457SAndroid Build Coastguard Worker      $for N in range(0, NR, 1):
71*4bdc9457SAndroid Build Coastguard Worker        $for M in range(0, MR, 4):
72*4bdc9457SAndroid Build Coastguard Worker          float32x4_t vout${ABC[M:M+4]}n${N} = vminq_f32(vacc${ABC[M:M+4]}n${N}, vmax);
73*4bdc9457SAndroid Build Coastguard Worker
74*4bdc9457SAndroid Build Coastguard Worker      $for N in range(0, NR, 1):
75*4bdc9457SAndroid Build Coastguard Worker        $for M in range(0, MR, 4):
76*4bdc9457SAndroid Build Coastguard Worker          vout${ABC[M:M+4]}n${N} = vmaxq_f32(vout${ABC[M:M+4]}n${N}, vmin);
77*4bdc9457SAndroid Build Coastguard Worker
78*4bdc9457SAndroid Build Coastguard Worker      $for N in range(0, NR, 1):
79*4bdc9457SAndroid Build Coastguard Worker        $for M in range(0, MR, 4):
80*4bdc9457SAndroid Build Coastguard Worker          vst1q_f32(output + ${M}, vout${ABC[M:M+4]}n${N});
81*4bdc9457SAndroid Build Coastguard Worker        output = (float*restrict) ((uintptr_t) output + output_stride);
82*4bdc9457SAndroid Build Coastguard Worker      n -= ${NR};
83*4bdc9457SAndroid Build Coastguard Worker    }
84*4bdc9457SAndroid Build Coastguard Worker
85*4bdc9457SAndroid Build Coastguard Worker    // clean up loop, fall back to nr=1
86*4bdc9457SAndroid Build Coastguard Worker    if XNN_UNLIKELY(n != 0) {
87*4bdc9457SAndroid Build Coastguard Worker      do {
88*4bdc9457SAndroid Build Coastguard Worker        uint32_t nnz = *nnzmap++;
89*4bdc9457SAndroid Build Coastguard Worker        float32x4_t vacc${ABC[0:4]} = vld1q_dup_f32(w); w += 1;
90*4bdc9457SAndroid Build Coastguard Worker        $for M in range(4, MR, 4):
91*4bdc9457SAndroid Build Coastguard Worker          float32x4_t vacc${ABC[M:M+4]} = vacc${ABC[0:4]};
92*4bdc9457SAndroid Build Coastguard Worker        if XNN_LIKELY(nnz != 0) {
93*4bdc9457SAndroid Build Coastguard Worker          do {
94*4bdc9457SAndroid Build Coastguard Worker            const intptr_t diff = *dmap++;
95*4bdc9457SAndroid Build Coastguard Worker            const float32x4_t vi${ABC[0:4]} = vld1q_f32(input);
96*4bdc9457SAndroid Build Coastguard Worker            $for M in range(4, MR, 4):
97*4bdc9457SAndroid Build Coastguard Worker              const float32x4_t vi${ABC[M:M+4]} = vld1q_f32(input + ${M});
98*4bdc9457SAndroid Build Coastguard Worker            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
99*4bdc9457SAndroid Build Coastguard Worker            $for M in range(0, MR, 16):
100*4bdc9457SAndroid Build Coastguard Worker              __builtin_prefetch(input + ${M+16});
101*4bdc9457SAndroid Build Coastguard Worker            const float32x4_t vw = vld1q_dup_f32(w); w += 1;
102*4bdc9457SAndroid Build Coastguard Worker            __builtin_prefetch(w + 32);
103*4bdc9457SAndroid Build Coastguard Worker            $for M in range(0, MR, 4):
104*4bdc9457SAndroid Build Coastguard Worker              vacc${ABC[M:M+4]} = vfmaq_f32(vacc${ABC[M:M+4]}, vi${ABC[M:M+4]}, vw);
105*4bdc9457SAndroid Build Coastguard Worker          } while (--nnz != 0);
106*4bdc9457SAndroid Build Coastguard Worker        }
107*4bdc9457SAndroid Build Coastguard Worker        $for M in range(0, MR, 4):
108*4bdc9457SAndroid Build Coastguard Worker          float32x4_t vout${ABC[M:M+4]} = vminq_f32(vacc${ABC[M:M+4]}, vmax);
109*4bdc9457SAndroid Build Coastguard Worker
110*4bdc9457SAndroid Build Coastguard Worker        $for M in range(0, MR, 4):
111*4bdc9457SAndroid Build Coastguard Worker          vout${ABC[M:M+4]} = vmaxq_f32(vout${ABC[M:M+4]}, vmin);
112*4bdc9457SAndroid Build Coastguard Worker
113*4bdc9457SAndroid Build Coastguard Worker        $for M in range(0, MR, 4):
114*4bdc9457SAndroid Build Coastguard Worker          vst1q_f32(output + ${M}, vout${ABC[M:M+4]});
115*4bdc9457SAndroid Build Coastguard Worker        output = (float*restrict) ((uintptr_t) output + output_stride);
116*4bdc9457SAndroid Build Coastguard Worker        n -= 1;
117*4bdc9457SAndroid Build Coastguard Worker      } while (n != 0);
118*4bdc9457SAndroid Build Coastguard Worker    }
119*4bdc9457SAndroid Build Coastguard Worker    output = (float*restrict) ((uintptr_t) output - output_decrement);
120*4bdc9457SAndroid Build Coastguard Worker    input += ${MR};
121*4bdc9457SAndroid Build Coastguard Worker    mc -= ${MR} * sizeof(float);
122*4bdc9457SAndroid Build Coastguard Worker  }
123*4bdc9457SAndroid Build Coastguard Worker  if XNN_UNLIKELY(mc != 0) {
124*4bdc9457SAndroid Build Coastguard Worker    $for LOG2M in reversed(range((MR - 1).bit_length())):
125*4bdc9457SAndroid Build Coastguard Worker      $SUBMR = 1 << LOG2M
126*4bdc9457SAndroid Build Coastguard Worker      $if SUBMR * 2 >= MR:
127*4bdc9457SAndroid Build Coastguard Worker        output_decrement += ${MR - SUBMR} * sizeof(float);
128*4bdc9457SAndroid Build Coastguard Worker      $else:
129*4bdc9457SAndroid Build Coastguard Worker        output_decrement += ${SUBMR} * sizeof(float);
130*4bdc9457SAndroid Build Coastguard Worker      if (mc & (${SUBMR} * sizeof(float))) {
131*4bdc9457SAndroid Build Coastguard Worker        const float*restrict w = weights;
132*4bdc9457SAndroid Build Coastguard Worker        const int32_t* dmap = widx_dmap;
133*4bdc9457SAndroid Build Coastguard Worker        const uint32_t* nnzmap = nidx_nnzmap;
134*4bdc9457SAndroid Build Coastguard Worker        size_t n = nc;
135*4bdc9457SAndroid Build Coastguard Worker        while (n >= ${NR}) {
136*4bdc9457SAndroid Build Coastguard Worker          uint32_t nnz = *nnzmap++;
137*4bdc9457SAndroid Build Coastguard Worker          $for N in range(0, NR, 1):
138*4bdc9457SAndroid Build Coastguard Worker            $if SUBMR < 4:
139*4bdc9457SAndroid Build Coastguard Worker              float32x2_t vacc${ABC[0:SUBMR]}n${N} = vld1_dup_f32(w); w += 1;
140*4bdc9457SAndroid Build Coastguard Worker            $else:
141*4bdc9457SAndroid Build Coastguard Worker              float32x4_t vacc${ABC[0:4]}n${N} = vld1q_dup_f32(w); w += 1;
142*4bdc9457SAndroid Build Coastguard Worker            $for M in range(4, SUBMR, 4):
143*4bdc9457SAndroid Build Coastguard Worker              float32x4_t vacc${ABC[M:M+4]}n${N} = vacc${ABC[0:4]}n${N};
144*4bdc9457SAndroid Build Coastguard Worker          if XNN_LIKELY(nnz != 0) {
145*4bdc9457SAndroid Build Coastguard Worker            do {
146*4bdc9457SAndroid Build Coastguard Worker              const intptr_t diff = *dmap++;
147*4bdc9457SAndroid Build Coastguard Worker              $if SUBMR == 1:
148*4bdc9457SAndroid Build Coastguard Worker                const float32x2_t vi${ABC[0]} = vld1_dup_f32(input);
149*4bdc9457SAndroid Build Coastguard Worker              $elif SUBMR == 2:
150*4bdc9457SAndroid Build Coastguard Worker                const float32x2_t vi${ABC[0:2]} = vld1_f32(input);
151*4bdc9457SAndroid Build Coastguard Worker              $else:
152*4bdc9457SAndroid Build Coastguard Worker                const float32x4_t vi${ABC[0:4]} = vld1q_f32(input);
153*4bdc9457SAndroid Build Coastguard Worker              $for M in range(4, SUBMR, 4):
154*4bdc9457SAndroid Build Coastguard Worker                const float32x4_t vi${ABC[M:M+4]} = vld1q_f32(input + ${M});
155*4bdc9457SAndroid Build Coastguard Worker              input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
156*4bdc9457SAndroid Build Coastguard Worker              $if NR == 1:
157*4bdc9457SAndroid Build Coastguard Worker                $if SUBMR < 4:
158*4bdc9457SAndroid Build Coastguard Worker                  const float32x2_t vw = vld1_dup_f32(w); w += 1;
159*4bdc9457SAndroid Build Coastguard Worker                $else:
160*4bdc9457SAndroid Build Coastguard Worker                  const float32x4_t vw = vld1q_dup_f32(w); w += 1;
161*4bdc9457SAndroid Build Coastguard Worker              $elif NR == 2:
162*4bdc9457SAndroid Build Coastguard Worker                const float32x2_t vw = vld1_f32(w); w += 2;
163*4bdc9457SAndroid Build Coastguard Worker              $elif NR == 4:
164*4bdc9457SAndroid Build Coastguard Worker                const float32x4_t vw = vld1q_f32(w); w += 4;
165*4bdc9457SAndroid Build Coastguard Worker
166*4bdc9457SAndroid Build Coastguard Worker              $if NR == 1:
167*4bdc9457SAndroid Build Coastguard Worker                $if SUBMR < 4:
168*4bdc9457SAndroid Build Coastguard Worker                    vacc${ABC[0:SUBMR]}c0 = vfmaq_f32(vacc${ABC[0:SUBMR]}c0, vi${ABC[0:SUBMR]}, vw);
169*4bdc9457SAndroid Build Coastguard Worker                $else:
170*4bdc9457SAndroid Build Coastguard Worker                  $for M in range(0, SUBMR, 4):
171*4bdc9457SAndroid Build Coastguard Worker                    vacc${ABC[M:M+4]}c0 = vfmaq_f32(vacc${ABC[M:M+4]}c0, vi${ABC[M:M+4]}, vw);
172*4bdc9457SAndroid Build Coastguard Worker              $else:
173*4bdc9457SAndroid Build Coastguard Worker                $for N in range(NR):
174*4bdc9457SAndroid Build Coastguard Worker                  $if SUBMR < 4:
175*4bdc9457SAndroid Build Coastguard Worker                    vacc${ABC[0:SUBMR]}n${N} = vfma_lane${"q" if NR == 4 else ""}_f32(vacc${ABC[0:SUBMR]}n${N}, vi${ABC[0:SUBMR]}, vw, ${N});
176*4bdc9457SAndroid Build Coastguard Worker                  $else:
177*4bdc9457SAndroid Build Coastguard Worker                    $for M in range(0, SUBMR, 4):
178*4bdc9457SAndroid Build Coastguard Worker                      vacc${ABC[M:M+4]}n${N} = vfmaq_lane${"q" if NR == 4 else ""}_f32(vacc${ABC[M:M+4]}n${N}, vi${ABC[M:M+4]}, vw, ${N});
179*4bdc9457SAndroid Build Coastguard Worker            } while (--nnz != 0);
180*4bdc9457SAndroid Build Coastguard Worker          }
181*4bdc9457SAndroid Build Coastguard Worker          $for N in range(0, NR, 1):
182*4bdc9457SAndroid Build Coastguard Worker            $if SUBMR < 4:
183*4bdc9457SAndroid Build Coastguard Worker              float32x2_t vout${ABC[0:SUBMR]}n${N} = vmin_f32(vacc${ABC[0:SUBMR]}n${N}, vget_low_f32(vmax));
184*4bdc9457SAndroid Build Coastguard Worker            $else:
185*4bdc9457SAndroid Build Coastguard Worker              $for M in range(0, SUBMR, 4):
186*4bdc9457SAndroid Build Coastguard Worker                float32x4_t vout${ABC[M:M+4]}n${N} = vminq_f32(vacc${ABC[M:M+4]}n${N}, vmax);
187*4bdc9457SAndroid Build Coastguard Worker
188*4bdc9457SAndroid Build Coastguard Worker          $for N in range(0, NR, 1):
189*4bdc9457SAndroid Build Coastguard Worker            $if SUBMR < 4:
190*4bdc9457SAndroid Build Coastguard Worker              vout${ABC[0:SUBMR]}n${N} = vmax_f32(vout${ABC[0:SUBMR]}n${N}, vget_low_f32(vmin));
191*4bdc9457SAndroid Build Coastguard Worker            $else:
192*4bdc9457SAndroid Build Coastguard Worker              $for M in range(0, SUBMR, 4):
193*4bdc9457SAndroid Build Coastguard Worker                vout${ABC[M:M+4]}n${N} = vmaxq_f32(vout${ABC[M:M+4]}n${N}, vmin);
194*4bdc9457SAndroid Build Coastguard Worker
195*4bdc9457SAndroid Build Coastguard Worker          $for N in range(NR):
196*4bdc9457SAndroid Build Coastguard Worker            $if SUBMR == 1:
197*4bdc9457SAndroid Build Coastguard Worker              vst1_lane_f32(output + ${M}, vout${ABC[0:SUBMR]}n${N}, 0);
198*4bdc9457SAndroid Build Coastguard Worker            $elif SUBMR == 2:
199*4bdc9457SAndroid Build Coastguard Worker              vst1_f32(output + ${M}, vout${ABC[0:SUBMR]}n${N});
200*4bdc9457SAndroid Build Coastguard Worker            $else:
201*4bdc9457SAndroid Build Coastguard Worker              $for M in range(0, SUBMR, 4):
202*4bdc9457SAndroid Build Coastguard Worker                vst1q_f32(output + ${M}, vout${ABC[M:M+4]}n${N});
203*4bdc9457SAndroid Build Coastguard Worker            output = (float*restrict) ((uintptr_t) output + output_stride);
204*4bdc9457SAndroid Build Coastguard Worker          n -= ${NR};
205*4bdc9457SAndroid Build Coastguard Worker        }
206*4bdc9457SAndroid Build Coastguard Worker
207*4bdc9457SAndroid Build Coastguard Worker        // clean up loop, fall back to nr=1
208*4bdc9457SAndroid Build Coastguard Worker        if XNN_UNLIKELY(n != 0) {
209*4bdc9457SAndroid Build Coastguard Worker          do {
210*4bdc9457SAndroid Build Coastguard Worker            uint32_t nnz = *nnzmap++;
211*4bdc9457SAndroid Build Coastguard Worker            $if SUBMR < 4:
212*4bdc9457SAndroid Build Coastguard Worker              float32x2_t vacc${ABC[0:SUBMR]} = vld1_dup_f32(w); w += 1;
213*4bdc9457SAndroid Build Coastguard Worker            $else:
214*4bdc9457SAndroid Build Coastguard Worker              float32x4_t vacc${ABC[0:4]} = vld1q_dup_f32(w); w += 1;
215*4bdc9457SAndroid Build Coastguard Worker            $for M in range(4, SUBMR, 4):
216*4bdc9457SAndroid Build Coastguard Worker              float32x4_t vacc${ABC[M:M+4]} = vacc${ABC[0:4]};
217*4bdc9457SAndroid Build Coastguard Worker            if XNN_LIKELY(nnz != 0) {
218*4bdc9457SAndroid Build Coastguard Worker              do {
219*4bdc9457SAndroid Build Coastguard Worker                const intptr_t diff = *dmap++;
220*4bdc9457SAndroid Build Coastguard Worker                $if SUBMR == 1:
221*4bdc9457SAndroid Build Coastguard Worker                  const float32x2_t vi${ABC[0:1]} = vld1_dup_f32(input);
222*4bdc9457SAndroid Build Coastguard Worker                $elif SUBMR == 2:
223*4bdc9457SAndroid Build Coastguard Worker                  const float32x2_t vi${ABC[0:2]} = vld1_f32(input);
224*4bdc9457SAndroid Build Coastguard Worker                $else:
225*4bdc9457SAndroid Build Coastguard Worker                  const float32x4_t vi${ABC[0:4]} = vld1q_f32(input);
226*4bdc9457SAndroid Build Coastguard Worker                $for M in range(4, SUBMR, 4):
227*4bdc9457SAndroid Build Coastguard Worker                  const float32x4_t vi${ABC[M:M+4]} = vld1q_f32(input + ${M});
228*4bdc9457SAndroid Build Coastguard Worker                input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
229*4bdc9457SAndroid Build Coastguard Worker                $if SUBMR < 4:
230*4bdc9457SAndroid Build Coastguard Worker                  const float32x2_t vw = vld1_dup_f32(w); w += 1;
231*4bdc9457SAndroid Build Coastguard Worker                  vacc${ABC[0:SUBMR]} = vfma_f32(vacc${ABC[0:SUBMR]}, vi${ABC[0:SUBMR]}, vw);
232*4bdc9457SAndroid Build Coastguard Worker                $else:
233*4bdc9457SAndroid Build Coastguard Worker                  const float32x4_t vw = vld1q_dup_f32(w); w += 1;
234*4bdc9457SAndroid Build Coastguard Worker                  $for M in range(0, SUBMR, 4):
235*4bdc9457SAndroid Build Coastguard Worker                    vacc${ABC[M:M+4]} = vfmaq_f32(vacc${ABC[M:M+4]}, vi${ABC[M:M+4]}, vw);
236*4bdc9457SAndroid Build Coastguard Worker              } while (--nnz != 0);
237*4bdc9457SAndroid Build Coastguard Worker            }
238*4bdc9457SAndroid Build Coastguard Worker            $if SUBMR < 4:
239*4bdc9457SAndroid Build Coastguard Worker              float32x2_t vout${ABC[0:SUBMR]} = vmin_f32(vacc${ABC[0:SUBMR]}, vget_low_f32(vmax));
240*4bdc9457SAndroid Build Coastguard Worker              vout${ABC[0:SUBMR]} = vmax_f32(vout${ABC[0:SUBMR]}, vget_low_f32(vmin));
241*4bdc9457SAndroid Build Coastguard Worker            $else:
242*4bdc9457SAndroid Build Coastguard Worker              $for M in range(0, SUBMR, 4):
243*4bdc9457SAndroid Build Coastguard Worker                float32x4_t vout${ABC[M:M+4]} = vminq_f32(vacc${ABC[M:M+4]}, vmax);
244*4bdc9457SAndroid Build Coastguard Worker
245*4bdc9457SAndroid Build Coastguard Worker              $for M in range(0, SUBMR, 4):
246*4bdc9457SAndroid Build Coastguard Worker                vout${ABC[M:M+4]} = vmaxq_f32(vout${ABC[M:M+4]}, vmin);
247*4bdc9457SAndroid Build Coastguard Worker
248*4bdc9457SAndroid Build Coastguard Worker            $if SUBMR == 1:
249*4bdc9457SAndroid Build Coastguard Worker              vst1_lane_f32(output, vout${ABC[0:1]}, 1);
250*4bdc9457SAndroid Build Coastguard Worker            $elif SUBMR == 2:
251*4bdc9457SAndroid Build Coastguard Worker              vst1_f32(output, vout${ABC[0:2]});
252*4bdc9457SAndroid Build Coastguard Worker            $else:
253*4bdc9457SAndroid Build Coastguard Worker              $for M in range(0, SUBMR, 4):
254*4bdc9457SAndroid Build Coastguard Worker                vst1q_f32(output + ${M}, vout${ABC[M:M+4]});
255*4bdc9457SAndroid Build Coastguard Worker            output = (float*restrict) ((uintptr_t) output + output_stride);
256*4bdc9457SAndroid Build Coastguard Worker            n -= 1;
257*4bdc9457SAndroid Build Coastguard Worker          } while (n != 0);
258*4bdc9457SAndroid Build Coastguard Worker        }
259*4bdc9457SAndroid Build Coastguard Worker        output = (float*restrict) ((uintptr_t) output - output_decrement);
260*4bdc9457SAndroid Build Coastguard Worker        input += ${SUBMR};
261*4bdc9457SAndroid Build Coastguard Worker      }
262*4bdc9457SAndroid Build Coastguard Worker    }
263*4bdc9457SAndroid Build Coastguard Worker}
264