xref: /aosp_15_r20/external/XNNPACK/src/qs8-vmul/neon.c.in (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Copyright 2021 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
// NOTE(review): this is an xngen template (.c.in); lines starting with `$` run
// at generation time, everything else is emitted into the generated C file.
// Validate the generator parameters before emitting any code.
6$assert DATATYPE in ["QS8", "QU8"]
7$assert REQUANTIZATION in ["FP32", "RNDNU"]
8$assert BATCH_TILE % (16 if LD128 else 8) == 0
9$assert BATCH_TILE >= 8
// Alphabet used to build lane-range suffixes for generated variable names
// (e.g. vacc0123 covers lanes 0-3, vacc89AB covers lanes 8-11).
10$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
11#include <assert.h>
12
13#include <arm_neon.h>
14
15$if REQUANTIZATION == "FP32" and ARMV8:
16  #include <xnnpack/intrinsics-polyfill.h>
17#include <xnnpack/vmul.h>
18
19
// Name of the params-union member matching the requantization scheme and ISA.
20$PARAMS_STRUCT = REQUANTIZATION.lower() + "_" + ("neonv8" if ARMV8 else "neon")
// Map generic element types and operations onto the signed (QS8) or unsigned
// (QU8) NEON types/intrinsics so the body below is datatype-agnostic.
21$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE]
22$XINT8X8_T = {"QS8": "int8x8_t", "QU8": "uint8x8_t"}[DATATYPE]
23$XINT8X16_T = {"QS8": "int8x16_t", "QU8": "uint8x16_t"}[DATATYPE]
24$VLD1_X8 = {"QS8": "vld1_s8", "QU8": "vld1_u8"}[DATATYPE]
25$VLD1Q_X8 = {"QS8": "vld1q_s8", "QU8": "vld1q_u8"}[DATATYPE]
26$VLD1_DUP_X8 = {"QS8": "vld1_dup_s8", "QU8": "vld1_dup_u8"}[DATATYPE]
27$VLD1Q_DUP_X8 = {"QS8": "vld1q_dup_s8", "QU8": "vld1q_dup_u8"}[DATATYPE]
28$VST1_LANE_X8 = {"QS8": "vst1_lane_s8", "QU8": "vst1_lane_u8"}[DATATYPE]
29$VST1_X8 = {"QS8": "vst1_s8", "QU8": "vst1_u8"}[DATATYPE]
30$VST1Q_X8 = {"QS8": "vst1q_s8", "QU8": "vst1q_u8"}[DATATYPE]
31$VMIN_X8 = {"QS8": "vmin_s8", "QU8": "vmin_u8"}[DATATYPE]
32$VMAX_X8 = {"QS8": "vmax_s8", "QU8": "vmax_u8"}[DATATYPE]
33$VMINQ_X8 = {"QS8": "vminq_s8", "QU8": "vminq_u8"}[DATATYPE]
34$VMAXQ_X8 = {"QS8": "vmaxq_s8", "QU8": "vmaxq_u8"}[DATATYPE]
// Saturating 16->8 bit narrowing: signed->signed for QS8, signed->unsigned for QU8.
35$VQMOVXN_S16 = {"QS8": "vqmovn_s16", "QU8": "vqmovun_s16"}[DATATYPE]
36$VQMOVXN_HIGH_S16 = {"QS8": "vqmovn_high_s16", "QU8": "vqmovun_high_s16"}[DATATYPE]
37$VEXT_X8 = {"QS8": "vext_s8", "QU8": "vext_u8"}[DATATYPE]
38$VGET_LOW_X8 = {"QS8": "vget_low_s8", "QU8": "vget_low_u8"}[DATATYPE]
39$VCOMBINE_X8 = {"QS8": "vcombine_s8", "QU8": "vcombine_u8"}[DATATYPE]
40$VREINTERPRET_U32_X8 = {"QS8": "vreinterpret_u32_s8", "QU8": "vreinterpret_u32_u8"}[DATATYPE]
41$VREINTERPRET_U16_X8 = {"QS8": "vreinterpret_u16_s8", "QU8": "vreinterpret_u16_u8"}[DATATYPE]
// Template for QS8/QU8 element-wise multiplication (vmul) NEON microkernels
// with min/max activation clamping.
//
// Template parameters:
//   DATATYPE       - "QS8" (signed int8) or "QU8" (unsigned uint8) elements.
//   REQUANTIZATION - "FP32" (float scaling) or "RNDNU" (saturating fixed-point
//                    pre-shift / multiply / rounding post-shift) scheme.
//   ARMV8          - when true, use ARMv8 instructions (e.g. vcvtnq_s32_f32).
//   LD128          - when true, load 128-bit vectors per step instead of 64-bit.
//   BATCH_TILE     - number of elements processed per main-loop iteration.
//
// Multiplies n bytes of input_a by input_b element-wise: both inputs are
// widened to 16 bits with their zero points subtracted, multiplied into
// 32-bit products, requantized, narrowed back to 8 bits with saturation, and
// clamped to the activation range.
42void xnn_${DATATYPE.lower()}_vmul_minmax_${REQUANTIZATION.lower()}_ukernel__${"neonv8" if ARMV8 else "neon"}_${"ld128" if LD128 else "ld64"}_x${BATCH_TILE}(
43    size_t n,
44    const ${XINT8_T}* input_a,
45    const ${XINT8_T}* input_b,
46    ${XINT8_T}* output,
47    const union xnn_${DATATYPE.lower()}_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
48{
  // Broadcast the per-tensor input zero points into every lane.
49  $if LD128:
50    #if XNN_ARCH_ARM64
51      const ${XINT8X16_T} va_zero_point = ${VLD1Q_DUP_X8}(params->${PARAMS_STRUCT}.a_zero_point);
52      const ${XINT8X16_T} vb_zero_point = ${VLD1Q_DUP_X8}(params->${PARAMS_STRUCT}.b_zero_point);
53    #else
54      const ${XINT8X8_T} va_zero_point = ${VLD1_DUP_X8}(params->${PARAMS_STRUCT}.a_zero_point);
55      const ${XINT8X8_T} vb_zero_point = ${VLD1_DUP_X8}(params->${PARAMS_STRUCT}.b_zero_point);
56    #endif
57  $else:
58    const ${XINT8X8_T} va_zero_point = ${VLD1_DUP_X8}(params->${PARAMS_STRUCT}.a_zero_point);
59    const ${XINT8X8_T} vb_zero_point = ${VLD1_DUP_X8}(params->${PARAMS_STRUCT}.b_zero_point);
  // Broadcast the requantization parameters for the selected scheme.
60  $if REQUANTIZATION == "FP32":
61    const float32x4_t vscale = vld1q_dup_f32(&params->${PARAMS_STRUCT}.scale);
62    $if ARMV8:
63      const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->fp32_neonv8.output_zero_point);
64    $else:
65      const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
66      const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point);
67  $elif REQUANTIZATION == "RNDNU":
68    const int32x4_t vleft_pre_shift = vld1q_dup_s32(&params->rndnu_neon.left_pre_shift);
69    const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
70    const int32x4_t vleft_post_shift = vld1q_dup_s32(&params->rndnu_neon.left_post_shift);
71    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
  // Broadcast the output clamping bounds.
72  $if BATCH_TILE > 8:
73    const ${XINT8X16_T} voutput_min = ${VLD1Q_DUP_X8}(&params->${PARAMS_STRUCT}.output_min);
74    const ${XINT8X16_T} voutput_max = ${VLD1Q_DUP_X8}(&params->${PARAMS_STRUCT}.output_max);
75  $else:
76    const ${XINT8X8_T} voutput_min = ${VLD1_DUP_X8}(&params->${PARAMS_STRUCT}.output_min);
77    const ${XINT8X8_T} voutput_max = ${VLD1_DUP_X8}(&params->${PARAMS_STRUCT}.output_max);
78
  // Main loop: process full tiles of BATCH_TILE elements.
79  for (; n >= ${BATCH_TILE} * sizeof(${XINT8_T}); n -= ${BATCH_TILE} * sizeof(${XINT8_T})) {
80    $if LD128:
81      $for N in range(0, BATCH_TILE, 16):
82        const ${XINT8X16_T} va${ABC[N:N+16]} = ${VLD1Q_X8}(input_a); input_a += 16;
83        const ${XINT8X16_T} vb${ABC[N:N+16]} = ${VLD1Q_X8}(input_b); input_b += 16;
84
85      #if XNN_ARCH_ARM64
86        $for N in range(0, BATCH_TILE, 16):
87          $if DATATYPE == "QU8":
88            const int16x8_t vxa${ABC[N:N+8]} = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(va${ABC[N:N+16]}), vget_low_u8(va_zero_point)));
89            const int16x8_t vxa${ABC[N+8:N+16]} = vreinterpretq_s16_u16(vsubl_high_u8(va${ABC[N:N+16]}, va_zero_point));
90            const int16x8_t vxb${ABC[N:N+8]} = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(vb${ABC[N:N+16]}), vget_low_u8(vb_zero_point)));
91            const int16x8_t vxb${ABC[N+8:N+16]} = vreinterpretq_s16_u16(vsubl_high_u8(vb${ABC[N:N+16]}, vb_zero_point));
92          $else:
93            const int16x8_t vxa${ABC[N:N+8]} = vsubl_s8(vget_low_s8(va${ABC[N:N+16]}), vget_low_s8(va_zero_point));
94            const int16x8_t vxa${ABC[N+8:N+16]} = vsubl_high_s8(va${ABC[N:N+16]}, va_zero_point);
95            const int16x8_t vxb${ABC[N:N+8]} = vsubl_s8(vget_low_s8(vb${ABC[N:N+16]}), vget_low_s8(vb_zero_point));
96            const int16x8_t vxb${ABC[N+8:N+16]} = vsubl_high_s8(vb${ABC[N:N+16]}, vb_zero_point);
97      #else  // !XNN_ARCH_ARM64
98        $for N in range(0, BATCH_TILE, 16):
99          $if DATATYPE == "QU8":
100            const int16x8_t vxa${ABC[N:N+8]} = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(va${ABC[N:N+16]}), va_zero_point));
101            const int16x8_t vxa${ABC[N+8:N+16]} = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(va${ABC[N:N+16]}), va_zero_point));
102            const int16x8_t vxb${ABC[N:N+8]} = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(vb${ABC[N:N+16]}), vb_zero_point));
103            const int16x8_t vxb${ABC[N+8:N+16]} = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(vb${ABC[N:N+16]}), vb_zero_point));
104          $else:
105            const int16x8_t vxa${ABC[N:N+8]} = vsubl_s8(vget_low_s8(va${ABC[N:N+16]}), va_zero_point);
106            const int16x8_t vxa${ABC[N+8:N+16]} = vsubl_s8(vget_high_s8(va${ABC[N:N+16]}), va_zero_point);
107            const int16x8_t vxb${ABC[N:N+8]} = vsubl_s8(vget_low_s8(vb${ABC[N:N+16]}), vb_zero_point);
108            const int16x8_t vxb${ABC[N+8:N+16]} = vsubl_s8(vget_high_s8(vb${ABC[N:N+16]}), vb_zero_point);
109      #endif  // XNN_ARCH_ARM64
110    $else:
111      $for N in range(0, BATCH_TILE, 8):
112        const ${XINT8X8_T} va${ABC[N:N+8]} = ${VLD1_X8}(input_a); input_a += 8;
113        const ${XINT8X8_T} vb${ABC[N:N+8]} = ${VLD1_X8}(input_b); input_b += 8;
114
115      $for N in range(0, BATCH_TILE, 8):
116        $if DATATYPE == "QU8":
117          const int16x8_t vxa${ABC[N:N+8]} = vreinterpretq_s16_u16(vsubl_u8(va${ABC[N:N+8]}, va_zero_point));
118          const int16x8_t vxb${ABC[N:N+8]} = vreinterpretq_s16_u16(vsubl_u8(vb${ABC[N:N+8]}, vb_zero_point));
119        $else:
120          const int16x8_t vxa${ABC[N:N+8]} = vsubl_s8(va${ABC[N:N+8]}, va_zero_point);
121          const int16x8_t vxb${ABC[N:N+8]} = vsubl_s8(vb${ABC[N:N+8]}, vb_zero_point);
122
    // Multiply the widened 16-bit inputs into 32-bit accumulators.
123    $for N in range(0, BATCH_TILE, 8):
124      int32x4_t vacc${ABC[N:N+4]} = vmull_s16(vget_low_s16(vxa${ABC[N:N+8]}), vget_low_s16(vxb${ABC[N:N+8]}));
125      int32x4_t vacc${ABC[N+4:N+8]} = vmull_s16(vget_high_s16(vxa${ABC[N:N+8]}), vget_high_s16(vxb${ABC[N:N+8]}));
126
    // Requantize the 32-bit products towards the 8-bit output range.
127    $if REQUANTIZATION == "FP32":
128      $for N in range(0, BATCH_TILE, 4):
129        float32x4_t vfpacc${ABC[N:N+4]} = vcvtq_f32_s32(vacc${ABC[N:N+4]});
130
131      $for N in range(0, BATCH_TILE, 4):
132        vfpacc${ABC[N:N+4]} = vmulq_f32(vfpacc${ABC[N:N+4]}, vscale);
133
134      $if ARMV8:
135        $for N in range(0, BATCH_TILE, 4):
136          vacc${ABC[N:N+4]} = vcvtnq_s32_f32(vfpacc${ABC[N:N+4]});
137      $else:
138        $for N in range(0, BATCH_TILE, 4):
139          vacc${ABC[N:N+4]} = vreinterpretq_s32_f32(vaddq_f32(vfpacc${ABC[N:N+4]}, vmagic_bias));
140
141        $for N in range(0, BATCH_TILE, 4):
142          vacc${ABC[N:N+4]} = vqsubq_s32(vacc${ABC[N:N+4]}, vmagic_bias_less_output_zero_point);
143    $elif REQUANTIZATION == "RNDNU":
144      $for N in range(0, BATCH_TILE, 4):
145        vacc${ABC[N:N+4]} = vqshlq_s32(vacc${ABC[N:N+4]}, vleft_pre_shift);
146
147      $for N in range(0, BATCH_TILE, 4):
148        vacc${ABC[N:N+4]} = vqdmulhq_s32(vacc${ABC[N:N+4]}, vmultiplier);
149
150      $for N in range(0, BATCH_TILE, 4):
151        vacc${ABC[N:N+4]} = vrshlq_s32(vacc${ABC[N:N+4]}, vleft_post_shift);
152
    // Saturating narrow: 32-bit accumulators -> 16-bit.
153    #if XNN_ARCH_ARM64
154      $for N in range(0, BATCH_TILE, 8):
155        int16x8_t vacc${ABC[N:N+8]} = vqmovn_high_s32(vqmovn_s32(vacc${ABC[N:N+4]}), vacc${ABC[N+4:N+8]});
156    #else
157      $for N in range(0, BATCH_TILE, 8):
158        int16x8_t vacc${ABC[N:N+8]} = vcombine_s16(vqmovn_s32(vacc${ABC[N:N+4]}), vqmovn_s32(vacc${ABC[N+4:N+8]}));
159    #endif
160
    // Add the output zero point (for FP32 without ARMV8 it is already folded
    // into the magic-bias subtraction above).
161    $if REQUANTIZATION != "FP32" or ARMV8:
162      $for N in range(0, BATCH_TILE, 8):
163        vacc${ABC[N:N+8]} = vqaddq_s16(vacc${ABC[N:N+8]}, voutput_zero_point);
164
    // Saturating narrow: 16-bit -> 8-bit outputs.
165    #if XNN_ARCH_ARM64
166      $for N in range(0, BATCH_TILE, 16):
167        $if N + 8 < BATCH_TILE:
168          ${XINT8X16_T} vout${ABC[N:N+16]} = ${VQMOVXN_HIGH_S16}(${VQMOVXN_S16}(vacc${ABC[N:N+8]}), vacc${ABC[N+8:N+16]});
169        $else:
170          ${XINT8X8_T} vout${ABC[N:N+8]} = ${VQMOVXN_S16}(vacc${ABC[N:N+8]});
171    #else
172      $for N in range(0, BATCH_TILE, 16):
173        $if N + 8 < BATCH_TILE:
174          ${XINT8X16_T} vout${ABC[N:N+16]} = ${VCOMBINE_X8}(${VQMOVXN_S16}(vacc${ABC[N:N+8]}), ${VQMOVXN_S16}(vacc${ABC[N+8:N+16]}));
175        $else:
176          ${XINT8X8_T} vout${ABC[N:N+8]} = ${VQMOVXN_S16}(vacc${ABC[N:N+8]});
177    #endif
178
    // Clamp outputs to the [output_min, output_max] activation range.
179    $for N in range(0, BATCH_TILE, 16):
180      $if N + 8 < BATCH_TILE:
181        vout${ABC[N:N+16]} = ${VMAXQ_X8}(vout${ABC[N:N+16]}, voutput_min);
182      $elif BATCH_TILE > 8:
183        vout${ABC[N:N+8]} = ${VMAX_X8}(vout${ABC[N:N+8]}, ${VGET_LOW_X8}(voutput_min));
184      $else:
185        vout${ABC[N:N+8]} = ${VMAX_X8}(vout${ABC[N:N+8]}, voutput_min);
186
187    $for N in range(0, BATCH_TILE, 16):
188      $if N + 8 < BATCH_TILE:
189        vout${ABC[N:N+16]} = ${VMINQ_X8}(vout${ABC[N:N+16]}, voutput_max);
190      $elif BATCH_TILE > 8:
191        vout${ABC[N:N+8]} = ${VMIN_X8}(vout${ABC[N:N+8]}, ${VGET_LOW_X8}(voutput_max));
192      $else:
193        vout${ABC[N:N+8]} = ${VMIN_X8}(vout${ABC[N:N+8]}, voutput_max);
194
    // Store the full tile of outputs.
195    $for N in range(0, BATCH_TILE, 16):
196      $if N + 8 < BATCH_TILE:
197        ${VST1Q_X8}(output, vout${ABC[N:N+16]}); output += 16;
198      $else:
199        ${VST1_X8}(output, vout${ABC[N:N+8]}); output += 8;
200  }
  // Remainder: up to BATCH_TILE-1 leftover elements. When BATCH_TILE > 8 the
  // body loops in 8-element chunks; the final sub-8 chunk is stored with 4-,
  // 2- and 1-element tail stores. The full 8-byte loads may read past the end
  // of the inputs, which XNN_OOB_READS on the signature declares as allowed.
201  if XNN_UNLIKELY(n != 0) {
202    ${"do " if BATCH_TILE > 8 else ""}{
203      $if BATCH_TILE > 8:
204        const ${XINT8X8_T} va${ABC[0:8]} = ${VLD1_X8}(input_a); input_a += 8;
205        const ${XINT8X8_T} vb${ABC[0:8]} = ${VLD1_X8}(input_b); input_b += 8;
206      $else:
207        const ${XINT8X8_T} va${ABC[0:8]} = ${VLD1_X8}(input_a);
208        const ${XINT8X8_T} vb${ABC[0:8]} = ${VLD1_X8}(input_b);
209
210      $if LD128:
211        $if DATATYPE == "QU8":
212          #if XNN_ARCH_ARM64
213            const int16x8_t vxa${ABC[0:8]} = vreinterpretq_s16_u16(vsubl_u8(va${ABC[0:8]}, vget_low_u8(va_zero_point)));
214            const int16x8_t vxb${ABC[0:8]} = vreinterpretq_s16_u16(vsubl_u8(vb${ABC[0:8]}, vget_low_u8(vb_zero_point)));
215          #else  // !XNN_ARCH_ARM64
216            const int16x8_t vxa${ABC[0:8]} = vreinterpretq_s16_u16(vsubl_u8(va${ABC[0:8]}, va_zero_point));
217            const int16x8_t vxb${ABC[0:8]} = vreinterpretq_s16_u16(vsubl_u8(vb${ABC[0:8]}, vb_zero_point));
218          #endif  // XNN_ARCH_ARM64
219        $else:
220          #if XNN_ARCH_ARM64
221            const int16x8_t vxa${ABC[0:8]} = vsubl_s8(va${ABC[0:8]}, vget_low_s8(va_zero_point));
222            const int16x8_t vxb${ABC[0:8]} = vsubl_s8(vb${ABC[0:8]}, vget_low_s8(vb_zero_point));
223          #else  // !XNN_ARCH_ARM64
224            const int16x8_t vxa${ABC[0:8]} = vsubl_s8(va${ABC[0:8]}, va_zero_point);
225            const int16x8_t vxb${ABC[0:8]} = vsubl_s8(vb${ABC[0:8]}, vb_zero_point);
226          #endif  // XNN_ARCH_ARM64
227      $else:
228        $if DATATYPE == "QU8":
229          const int16x8_t vxa${ABC[0:8]} = vreinterpretq_s16_u16(vsubl_u8(va${ABC[0:8]}, va_zero_point));
230          const int16x8_t vxb${ABC[0:8]} = vreinterpretq_s16_u16(vsubl_u8(vb${ABC[0:8]}, vb_zero_point));
231        $else:
232          const int16x8_t vxa${ABC[0:8]} = vsubl_s8(va${ABC[0:8]}, va_zero_point);
233          const int16x8_t vxb${ABC[0:8]} = vsubl_s8(vb${ABC[0:8]}, vb_zero_point);
234
235      int32x4_t vacc${ABC[0:4]} = vmull_s16(vget_low_s16(vxa${ABC[0:8]}), vget_low_s16(vxb${ABC[0:8]}));
236      int32x4_t vacc${ABC[4:8]} = vmull_s16(vget_high_s16(vxa${ABC[0:8]}), vget_high_s16(vxb${ABC[0:8]}));
237
      // Requantize (same scheme as the main loop, on one 8-element group).
238      $if REQUANTIZATION == "FP32":
239        float32x4_t vfpacc${ABC[0:4]} = vcvtq_f32_s32(vacc${ABC[0:4]});
240        float32x4_t vfpacc${ABC[4:8]} = vcvtq_f32_s32(vacc${ABC[4:8]});
241
242        vfpacc${ABC[0:4]} = vmulq_f32(vfpacc${ABC[0:4]}, vscale);
243        vfpacc${ABC[4:8]} = vmulq_f32(vfpacc${ABC[4:8]}, vscale);
244
245        $if ARMV8:
246          vacc${ABC[0:4]} = vcvtnq_s32_f32(vfpacc${ABC[0:4]});
247          vacc${ABC[4:8]} = vcvtnq_s32_f32(vfpacc${ABC[4:8]});
248        $else:
249          vacc${ABC[0:4]} = vreinterpretq_s32_f32(vaddq_f32(vfpacc${ABC[0:4]}, vmagic_bias));
250          vacc${ABC[4:8]} = vreinterpretq_s32_f32(vaddq_f32(vfpacc${ABC[4:8]}, vmagic_bias));
251
252          vacc${ABC[0:4]} = vqsubq_s32(vacc${ABC[0:4]}, vmagic_bias_less_output_zero_point);
253          vacc${ABC[4:8]} = vqsubq_s32(vacc${ABC[4:8]}, vmagic_bias_less_output_zero_point);
254      $elif REQUANTIZATION == "RNDNU":
255        vacc${ABC[0:4]} = vqshlq_s32(vacc${ABC[0:4]}, vleft_pre_shift);
256        vacc${ABC[4:8]} = vqshlq_s32(vacc${ABC[4:8]}, vleft_pre_shift);
257
258        vacc${ABC[0:4]} = vqdmulhq_s32(vacc${ABC[0:4]}, vmultiplier);
259        vacc${ABC[4:8]} = vqdmulhq_s32(vacc${ABC[4:8]}, vmultiplier);
260
261        vacc${ABC[0:4]} = vrshlq_s32(vacc${ABC[0:4]}, vleft_post_shift);
262        vacc${ABC[4:8]} = vrshlq_s32(vacc${ABC[4:8]}, vleft_post_shift);
263
264      #if XNN_ARCH_ARM64
265        int16x8_t vacc${ABC[0:8]} = vqmovn_high_s32(vqmovn_s32(vacc${ABC[0:4]}), vacc${ABC[4:8]});
266      #else
267        int16x8_t vacc${ABC[0:8]} = vcombine_s16(vqmovn_s32(vacc${ABC[0:4]}), vqmovn_s32(vacc${ABC[4:8]}));
268      #endif
269
270      $if REQUANTIZATION != "FP32" or ARMV8:
271        vacc${ABC[0:8]} = vqaddq_s16(vacc${ABC[0:8]}, voutput_zero_point);
272
273      ${XINT8X8_T} vout${ABC[0:8]} = ${VQMOVXN_S16}(vacc${ABC[0:8]});
274
      // Clamp, then store a full 8-element chunk or the sub-8 tail.
275      $if BATCH_TILE > 8:
276        vout${ABC[0:8]} = ${VMAX_X8}(vout${ABC[0:8]}, ${VGET_LOW_X8}(voutput_min));
277        vout${ABC[0:8]} = ${VMIN_X8}(vout${ABC[0:8]}, ${VGET_LOW_X8}(voutput_max));
278        if XNN_LIKELY(n >= (8 * sizeof(${XINT8_T}))) {
279          ${VST1_X8}(output, vout${ABC[0:8]}); output += 8;
280          n -= 8 * sizeof(${XINT8_T});
281        } else {
282          if (n & (4 * sizeof(${XINT8_T}))) {
283            vst1_lane_u32((void*) output, ${VREINTERPRET_U32_X8}(vout${ABC[0:8]}), 0); output += 4;
284            vout${ABC[0:8]} = ${VEXT_X8}(vout${ABC[0:8]}, vout${ABC[0:8]}, 4);
285          }
286          if (n & (2 * sizeof(${XINT8_T}))) {
287            vst1_lane_u16((void*) output, ${VREINTERPRET_U16_X8}(vout${ABC[0:8]}), 0); output += 2;
288            vout${ABC[0:8]} = ${VEXT_X8}(vout${ABC[0:8]}, vout${ABC[0:8]}, 2);
289          }
290          if (n & (1 * sizeof(${XINT8_T}))) {
291            ${VST1_LANE_X8}(output, vout${ABC[0:8]}, 0);
292          }
293          n = 0;
294        }
295      $else:
296        vout${ABC[0:8]} = ${VMAX_X8}(vout${ABC[0:8]}, voutput_min);
297        vout${ABC[0:8]} = ${VMIN_X8}(vout${ABC[0:8]}, voutput_max);
298        if (n & (4 * sizeof(${XINT8_T}))) {
299          vst1_lane_u32((void*) output, ${VREINTERPRET_U32_X8}(vout${ABC[0:8]}), 0); output += 4;
300          vout${ABC[0:8]} = ${VEXT_X8}(vout${ABC[0:8]}, vout${ABC[0:8]}, 4);
301        }
302        if (n & (2 * sizeof(${XINT8_T}))) {
303          vst1_lane_u16((void*) output, ${VREINTERPRET_U16_X8}(vout${ABC[0:8]}), 0); output += 2;
304          vout${ABC[0:8]} = ${VEXT_X8}(vout${ABC[0:8]}, vout${ABC[0:8]}, 2);
305        }
306        if (n & (1 * sizeof(${XINT8_T}))) {
307          ${VST1_LANE_X8}(output, vout${ABC[0:8]}, 0);
308        }
309    }${" while (n != 0);" if BATCH_TILE > 8 else ""}
310  }
311}
312