1*4bdc9457SAndroid Build Coastguard Worker// Copyright 2019 Google LLC 2*4bdc9457SAndroid Build Coastguard Worker// 3*4bdc9457SAndroid Build Coastguard Worker// This source code is licensed under the BSD-style license found in the 4*4bdc9457SAndroid Build Coastguard Worker// LICENSE file in the root directory of this source tree. 5*4bdc9457SAndroid Build Coastguard Worker 6*4bdc9457SAndroid Build Coastguard Worker$assert MR % 4 == 0 7*4bdc9457SAndroid Build Coastguard Worker$assert NR in [1, 2, 4] 8*4bdc9457SAndroid Build Coastguard Worker$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 9*4bdc9457SAndroid Build Coastguard Worker#include <assert.h> 10*4bdc9457SAndroid Build Coastguard Worker 11*4bdc9457SAndroid Build Coastguard Worker#include <arm_neon.h> 12*4bdc9457SAndroid Build Coastguard Worker 13*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/spmm.h> 14*4bdc9457SAndroid Build Coastguard Worker 15*4bdc9457SAndroid Build Coastguard Worker 16*4bdc9457SAndroid Build Coastguard Workervoid xnn_f32_spmm_minmax_ukernel_${MR}x${NR}__${"neonfma" if FMA else "neon"}( 17*4bdc9457SAndroid Build Coastguard Worker size_t mc, 18*4bdc9457SAndroid Build Coastguard Worker size_t nc, 19*4bdc9457SAndroid Build Coastguard Worker const float*restrict input, 20*4bdc9457SAndroid Build Coastguard Worker const float*restrict weights, 21*4bdc9457SAndroid Build Coastguard Worker const int32_t*restrict widx_dmap, 22*4bdc9457SAndroid Build Coastguard Worker const uint32_t*restrict nidx_nnzmap, 23*4bdc9457SAndroid Build Coastguard Worker float*restrict output, 24*4bdc9457SAndroid Build Coastguard Worker size_t output_stride, 25*4bdc9457SAndroid Build Coastguard Worker const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) 26*4bdc9457SAndroid Build Coastguard Worker{ 27*4bdc9457SAndroid Build Coastguard Worker assert(mc != 0); 28*4bdc9457SAndroid Build Coastguard Worker assert(mc % sizeof(float) == 0); 29*4bdc9457SAndroid Build Coastguard Worker assert(nc != 0); 30*4bdc9457SAndroid Build Coastguard Worker 31*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vmin = vld1q_dup_f32(¶ms->scalar.min); 32*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vmax = vld1q_dup_f32(¶ms->scalar.max); 33*4bdc9457SAndroid Build Coastguard Worker size_t output_decrement = output_stride * nc - ${MR} * sizeof(float); 34*4bdc9457SAndroid Build Coastguard Worker while XNN_LIKELY(mc >= ${MR} * sizeof(float)) { 35*4bdc9457SAndroid Build Coastguard Worker const float*restrict w = weights; 36*4bdc9457SAndroid Build Coastguard Worker const int32_t* dmap = widx_dmap; 37*4bdc9457SAndroid Build Coastguard Worker const uint32_t* nnzmap = nidx_nnzmap; 38*4bdc9457SAndroid Build Coastguard Worker size_t n = nc; 39*4bdc9457SAndroid Build Coastguard Worker while (n >= ${NR}) { 40*4bdc9457SAndroid Build Coastguard Worker uint32_t nnz = *nnzmap++; 41*4bdc9457SAndroid Build Coastguard Worker $for N in range(0, NR, 1): 42*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc${ABC[0:4]}n${N} = vld1q_dup_f32(w); w += 1; 43*4bdc9457SAndroid Build Coastguard Worker $for M in range(4, MR, 4): 44*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc${ABC[M:M+4]}n${N} = vacc${ABC[0:4]}n${N}; 45*4bdc9457SAndroid Build Coastguard Worker if XNN_LIKELY(nnz != 0) { 46*4bdc9457SAndroid Build Coastguard Worker do { 47*4bdc9457SAndroid Build Coastguard Worker const intptr_t diff = *dmap++; 48*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vi${ABC[0:4]} = vld1q_f32(input); 49*4bdc9457SAndroid Build Coastguard Worker $for M in range(4, MR, 4): 50*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vi${ABC[M:M+4]} = vld1q_f32(input + ${M}); 51*4bdc9457SAndroid Build Coastguard Worker input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff); 52*4bdc9457SAndroid Build Coastguard Worker $for M in range(0, MR, 16): 53*4bdc9457SAndroid Build Coastguard Worker __builtin_prefetch(input + ${M+16}); 54*4bdc9457SAndroid Build Coastguard Worker $if NR == 1: 55*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vw = vld1q_dup_f32(w); w += 1; 56*4bdc9457SAndroid Build Coastguard Worker $elif NR == 2: 57*4bdc9457SAndroid Build Coastguard Worker const float32x2_t vw = vld1_f32(w); w += 2; 58*4bdc9457SAndroid Build Coastguard Worker $elif NR == 4: 59*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vw = vld1q_f32(w); w += 4; 60*4bdc9457SAndroid Build Coastguard Worker __builtin_prefetch(w + 32); 61*4bdc9457SAndroid Build Coastguard Worker $if NR == 1: 62*4bdc9457SAndroid Build Coastguard Worker $for M in range(0, MR, 4): 63*4bdc9457SAndroid Build Coastguard Worker vacc${ABC[M:M+4]}c0 = vfmaq_f32(vacc${ABC[M:M+4]}c0, vi${ABC[M:M+4]}, vw); 64*4bdc9457SAndroid Build Coastguard Worker $else: 65*4bdc9457SAndroid Build Coastguard Worker $for N in range(NR): 66*4bdc9457SAndroid Build Coastguard Worker $for M in range(0, MR, 4): 67*4bdc9457SAndroid Build Coastguard Worker vacc${ABC[M:M+4]}n${N} = vfmaq_lane${"q" if NR == 4 else ""}_f32(vacc${ABC[M:M+4]}n${N}, vi${ABC[M:M+4]}, vw, ${N}); 68*4bdc9457SAndroid Build Coastguard Worker } while (--nnz != 0); 69*4bdc9457SAndroid Build Coastguard Worker } 70*4bdc9457SAndroid Build Coastguard Worker $for N in range(0, NR, 1): 71*4bdc9457SAndroid Build Coastguard Worker $for M in range(0, MR, 4): 72*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout${ABC[M:M+4]}n${N} = vminq_f32(vacc${ABC[M:M+4]}n${N}, vmax); 73*4bdc9457SAndroid Build Coastguard Worker 74*4bdc9457SAndroid Build Coastguard Worker $for N in range(0, NR, 1): 75*4bdc9457SAndroid Build Coastguard Worker $for M in range(0, MR, 4): 76*4bdc9457SAndroid Build Coastguard Worker vout${ABC[M:M+4]}n${N} = vmaxq_f32(vout${ABC[M:M+4]}n${N}, vmin); 77*4bdc9457SAndroid Build Coastguard Worker 78*4bdc9457SAndroid Build Coastguard Worker $for N in range(0, NR, 1): 79*4bdc9457SAndroid Build Coastguard Worker $for M in range(0, MR, 4): 80*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + ${M}, vout${ABC[M:M+4]}n${N}); 81*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output + output_stride); 82*4bdc9457SAndroid Build Coastguard Worker n -= ${NR}; 83*4bdc9457SAndroid Build Coastguard Worker } 84*4bdc9457SAndroid Build Coastguard Worker 85*4bdc9457SAndroid Build Coastguard Worker // clean up loop, fall back to nr=1 86*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) { 87*4bdc9457SAndroid Build Coastguard Worker do { 88*4bdc9457SAndroid Build Coastguard Worker uint32_t nnz = *nnzmap++; 89*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc${ABC[0:4]} = vld1q_dup_f32(w); w += 1; 90*4bdc9457SAndroid Build Coastguard Worker $for M in range(4, MR, 4): 91*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc${ABC[M:M+4]} = vacc${ABC[0:4]}; 92*4bdc9457SAndroid Build Coastguard Worker if XNN_LIKELY(nnz != 0) { 93*4bdc9457SAndroid Build Coastguard Worker do { 94*4bdc9457SAndroid Build Coastguard Worker const intptr_t diff = *dmap++; 95*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vi${ABC[0:4]} = vld1q_f32(input); 96*4bdc9457SAndroid Build Coastguard Worker $for M in range(4, MR, 4): 97*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vi${ABC[M:M+4]} = vld1q_f32(input + ${M}); 98*4bdc9457SAndroid Build Coastguard Worker input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff); 99*4bdc9457SAndroid Build Coastguard Worker $for M in range(0, MR, 16): 100*4bdc9457SAndroid Build Coastguard Worker __builtin_prefetch(input + ${M+16}); 101*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vw = vld1q_dup_f32(w); w += 1; 102*4bdc9457SAndroid Build Coastguard Worker __builtin_prefetch(w + 32); 103*4bdc9457SAndroid Build Coastguard Worker $for M in range(0, MR, 4): 104*4bdc9457SAndroid Build Coastguard Worker vacc${ABC[M:M+4]} = vfmaq_f32(vacc${ABC[M:M+4]}, vi${ABC[M:M+4]}, vw); 105*4bdc9457SAndroid Build Coastguard Worker } while (--nnz != 0); 106*4bdc9457SAndroid Build Coastguard Worker } 107*4bdc9457SAndroid Build Coastguard Worker $for M in range(0, MR, 4): 108*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout${ABC[M:M+4]} = vminq_f32(vacc${ABC[M:M+4]}, vmax); 109*4bdc9457SAndroid Build Coastguard Worker 110*4bdc9457SAndroid Build Coastguard Worker $for M in range(0, MR, 4): 111*4bdc9457SAndroid Build Coastguard Worker vout${ABC[M:M+4]} = vmaxq_f32(vout${ABC[M:M+4]}, vmin); 112*4bdc9457SAndroid Build Coastguard Worker 113*4bdc9457SAndroid Build Coastguard Worker $for M in range(0, MR, 4): 114*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + ${M}, vout${ABC[M:M+4]}); 115*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output + output_stride); 116*4bdc9457SAndroid Build Coastguard Worker n -= 1; 117*4bdc9457SAndroid Build Coastguard Worker } while (n != 0); 118*4bdc9457SAndroid Build Coastguard Worker } 119*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output - output_decrement); 120*4bdc9457SAndroid Build Coastguard Worker input += ${MR}; 121*4bdc9457SAndroid Build Coastguard Worker mc -= ${MR} * sizeof(float); 122*4bdc9457SAndroid Build Coastguard Worker } 123*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(mc != 0) { 124*4bdc9457SAndroid Build Coastguard Worker $for LOG2M in reversed(range((MR - 1).bit_length())): 125*4bdc9457SAndroid Build Coastguard Worker $SUBMR = 1 << LOG2M 126*4bdc9457SAndroid Build Coastguard Worker $if SUBMR * 2 >= MR: 127*4bdc9457SAndroid Build Coastguard Worker output_decrement += ${MR - SUBMR} * sizeof(float); 128*4bdc9457SAndroid Build Coastguard Worker $else: 129*4bdc9457SAndroid Build Coastguard Worker output_decrement += ${SUBMR} * sizeof(float); 130*4bdc9457SAndroid Build Coastguard Worker if (mc & (${SUBMR} * sizeof(float))) { 131*4bdc9457SAndroid Build Coastguard Worker const float*restrict w = weights; 132*4bdc9457SAndroid Build Coastguard Worker const int32_t* dmap = widx_dmap; 133*4bdc9457SAndroid Build Coastguard Worker const uint32_t* nnzmap = nidx_nnzmap; 134*4bdc9457SAndroid Build Coastguard Worker size_t n = nc; 135*4bdc9457SAndroid Build Coastguard Worker while (n >= ${NR}) { 136*4bdc9457SAndroid Build Coastguard Worker uint32_t nnz = *nnzmap++; 137*4bdc9457SAndroid Build Coastguard Worker $for N in range(0, NR, 1): 138*4bdc9457SAndroid Build Coastguard Worker $if SUBMR < 4: 139*4bdc9457SAndroid Build Coastguard Worker float32x2_t vacc${ABC[0:SUBMR]}n${N} = vld1_dup_f32(w); w += 1; 140*4bdc9457SAndroid Build Coastguard Worker $else: 141*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc${ABC[0:4]}n${N} = vld1q_dup_f32(w); w += 1; 142*4bdc9457SAndroid Build Coastguard Worker $for M in range(4, SUBMR, 4): 143*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc${ABC[M:M+4]}n${N} = vacc${ABC[0:4]}n${N}; 144*4bdc9457SAndroid Build Coastguard Worker if XNN_LIKELY(nnz != 0) { 145*4bdc9457SAndroid Build Coastguard Worker do { 146*4bdc9457SAndroid Build Coastguard Worker const intptr_t diff = *dmap++; 147*4bdc9457SAndroid Build Coastguard Worker $if SUBMR == 1: 148*4bdc9457SAndroid Build Coastguard Worker const float32x2_t vi${ABC[0]} = vld1_dup_f32(input); 149*4bdc9457SAndroid Build Coastguard Worker $elif SUBMR == 2: 150*4bdc9457SAndroid Build Coastguard Worker const float32x2_t vi${ABC[0:2]} = vld1_f32(input); 151*4bdc9457SAndroid Build Coastguard Worker $else: 152*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vi${ABC[0:4]} = vld1q_f32(input); 153*4bdc9457SAndroid Build Coastguard Worker $for M in range(4, SUBMR, 4): 154*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vi${ABC[M:M+4]} = vld1q_f32(input + ${M}); 155*4bdc9457SAndroid Build Coastguard Worker input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff); 156*4bdc9457SAndroid Build Coastguard Worker $if NR == 1: 157*4bdc9457SAndroid Build Coastguard Worker $if SUBMR < 4: 158*4bdc9457SAndroid Build Coastguard Worker const float32x2_t vw = vld1_dup_f32(w); w += 1; 159*4bdc9457SAndroid Build Coastguard Worker $else: 160*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vw = vld1q_dup_f32(w); w += 1; 161*4bdc9457SAndroid Build Coastguard Worker $elif NR == 2: 162*4bdc9457SAndroid Build Coastguard Worker const float32x2_t vw = vld1_f32(w); w += 2; 163*4bdc9457SAndroid Build Coastguard Worker $elif NR == 4: 164*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vw = vld1q_f32(w); w += 4; 165*4bdc9457SAndroid Build Coastguard Worker 166*4bdc9457SAndroid Build Coastguard Worker $if NR == 1: 167*4bdc9457SAndroid Build Coastguard Worker $if SUBMR < 4: 168*4bdc9457SAndroid Build Coastguard Worker vacc${ABC[0:SUBMR]}c0 = vfmaq_f32(vacc${ABC[0:SUBMR]}c0, vi${ABC[0:SUBMR]}, vw); 169*4bdc9457SAndroid Build Coastguard Worker $else: 170*4bdc9457SAndroid Build Coastguard Worker $for M in range(0, SUBMR, 4): 171*4bdc9457SAndroid Build Coastguard Worker vacc${ABC[M:M+4]}c0 = vfmaq_f32(vacc${ABC[M:M+4]}c0, vi${ABC[M:M+4]}, vw); 172*4bdc9457SAndroid Build Coastguard Worker $else: 173*4bdc9457SAndroid Build Coastguard Worker $for N in range(NR): 174*4bdc9457SAndroid Build Coastguard Worker $if SUBMR < 4: 175*4bdc9457SAndroid Build Coastguard Worker vacc${ABC[0:SUBMR]}n${N} = vfma_lane${"q" if NR == 4 else ""}_f32(vacc${ABC[0:SUBMR]}n${N}, vi${ABC[0:SUBMR]}, vw, ${N}); 176*4bdc9457SAndroid Build Coastguard Worker $else: 177*4bdc9457SAndroid Build Coastguard Worker $for M in range(0, SUBMR, 4): 178*4bdc9457SAndroid Build Coastguard Worker vacc${ABC[M:M+4]}n${N} = vfmaq_lane${"q" if NR == 4 else ""}_f32(vacc${ABC[M:M+4]}n${N}, vi${ABC[M:M+4]}, vw, ${N}); 179*4bdc9457SAndroid Build Coastguard Worker } while (--nnz != 0); 180*4bdc9457SAndroid Build Coastguard Worker } 181*4bdc9457SAndroid Build Coastguard Worker $for N in range(0, NR, 1): 182*4bdc9457SAndroid Build Coastguard Worker $if SUBMR < 4: 183*4bdc9457SAndroid Build Coastguard Worker float32x2_t vout${ABC[0:SUBMR]}n${N} = vmin_f32(vacc${ABC[0:SUBMR]}n${N}, vget_low_f32(vmax)); 184*4bdc9457SAndroid Build Coastguard Worker $else: 185*4bdc9457SAndroid Build Coastguard Worker $for M in range(0, SUBMR, 4): 186*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout${ABC[M:M+4]}n${N} = vminq_f32(vacc${ABC[M:M+4]}n${N}, vmax); 187*4bdc9457SAndroid Build Coastguard Worker 188*4bdc9457SAndroid Build Coastguard Worker $for N in range(0, NR, 1): 189*4bdc9457SAndroid Build Coastguard Worker $if SUBMR < 4: 190*4bdc9457SAndroid Build Coastguard Worker vout${ABC[0:SUBMR]}n${N} = vmax_f32(vout${ABC[0:SUBMR]}n${N}, vget_low_f32(vmin)); 191*4bdc9457SAndroid Build Coastguard Worker $else: 192*4bdc9457SAndroid Build Coastguard Worker $for M in range(0, SUBMR, 4): 193*4bdc9457SAndroid Build Coastguard Worker vout${ABC[M:M+4]}n${N} = vmaxq_f32(vout${ABC[M:M+4]}n${N}, vmin); 194*4bdc9457SAndroid Build Coastguard Worker 195*4bdc9457SAndroid Build Coastguard Worker $for N in range(NR): 196*4bdc9457SAndroid Build Coastguard Worker $if SUBMR == 1: 197*4bdc9457SAndroid Build Coastguard Worker vst1_lane_f32(output + ${M}, vout${ABC[0:SUBMR]}n${N}, 0); 198*4bdc9457SAndroid Build Coastguard Worker $elif SUBMR == 2: 199*4bdc9457SAndroid Build Coastguard Worker vst1_f32(output + ${M}, vout${ABC[0:SUBMR]}n${N}); 200*4bdc9457SAndroid Build Coastguard Worker $else: 201*4bdc9457SAndroid Build Coastguard Worker $for M in range(0, SUBMR, 4): 202*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + ${M}, vout${ABC[M:M+4]}n${N}); 203*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output + output_stride); 204*4bdc9457SAndroid Build Coastguard Worker n -= ${NR}; 205*4bdc9457SAndroid Build Coastguard Worker } 206*4bdc9457SAndroid Build Coastguard Worker 207*4bdc9457SAndroid Build Coastguard Worker // clean up loop, fall back to nr=1 208*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) { 209*4bdc9457SAndroid Build Coastguard Worker do { 210*4bdc9457SAndroid Build Coastguard Worker uint32_t nnz = *nnzmap++; 211*4bdc9457SAndroid Build Coastguard Worker $if SUBMR < 4: 212*4bdc9457SAndroid Build Coastguard Worker float32x2_t vacc${ABC[0:SUBMR]} = vld1_dup_f32(w); w += 1; 213*4bdc9457SAndroid Build Coastguard Worker $else: 214*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc${ABC[0:4]} = vld1q_dup_f32(w); w += 1; 215*4bdc9457SAndroid Build Coastguard Worker $for M in range(4, SUBMR, 4): 216*4bdc9457SAndroid Build Coastguard Worker float32x4_t vacc${ABC[M:M+4]} = vacc${ABC[0:4]}; 217*4bdc9457SAndroid Build Coastguard Worker if XNN_LIKELY(nnz != 0) { 218*4bdc9457SAndroid Build Coastguard Worker do { 219*4bdc9457SAndroid Build Coastguard Worker const intptr_t diff = *dmap++; 220*4bdc9457SAndroid Build Coastguard Worker $if SUBMR == 1: 221*4bdc9457SAndroid Build Coastguard Worker const float32x2_t vi${ABC[0:1]} = vld1_dup_f32(input); 222*4bdc9457SAndroid Build Coastguard Worker $elif SUBMR == 2: 223*4bdc9457SAndroid Build Coastguard Worker const float32x2_t vi${ABC[0:2]} = vld1_f32(input); 224*4bdc9457SAndroid Build Coastguard Worker $else: 225*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vi${ABC[0:4]} = vld1q_f32(input); 226*4bdc9457SAndroid Build Coastguard Worker $for M in range(4, SUBMR, 4): 227*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vi${ABC[M:M+4]} = vld1q_f32(input + ${M}); 228*4bdc9457SAndroid Build Coastguard Worker input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff); 229*4bdc9457SAndroid Build Coastguard Worker $if SUBMR < 4: 230*4bdc9457SAndroid Build Coastguard Worker const float32x2_t vw = vld1_dup_f32(w); w += 1; 231*4bdc9457SAndroid Build Coastguard Worker vacc${ABC[0:SUBMR]} = vfma_f32(vacc${ABC[0:SUBMR]}, vi${ABC[0:SUBMR]}, vw); 232*4bdc9457SAndroid Build Coastguard Worker $else: 233*4bdc9457SAndroid Build Coastguard Worker const float32x4_t vw = vld1q_dup_f32(w); w += 1; 234*4bdc9457SAndroid Build Coastguard Worker $for M in range(0, SUBMR, 4): 235*4bdc9457SAndroid Build Coastguard Worker vacc${ABC[M:M+4]} = vfmaq_f32(vacc${ABC[M:M+4]}, vi${ABC[M:M+4]}, vw); 236*4bdc9457SAndroid Build Coastguard Worker } while (--nnz != 0); 237*4bdc9457SAndroid Build Coastguard Worker } 238*4bdc9457SAndroid Build Coastguard Worker $if SUBMR < 4: 239*4bdc9457SAndroid Build Coastguard Worker float32x2_t vout${ABC[0:SUBMR]} = vmin_f32(vacc${ABC[0:SUBMR]}, vget_low_f32(vmax)); 240*4bdc9457SAndroid Build Coastguard Worker vout${ABC[0:SUBMR]} = vmax_f32(vout${ABC[0:SUBMR]}, vget_low_f32(vmin)); 241*4bdc9457SAndroid Build Coastguard Worker $else: 242*4bdc9457SAndroid Build Coastguard Worker $for M in range(0, SUBMR, 4): 243*4bdc9457SAndroid Build Coastguard Worker float32x4_t vout${ABC[M:M+4]} = vminq_f32(vacc${ABC[M:M+4]}, vmax); 244*4bdc9457SAndroid Build Coastguard Worker 245*4bdc9457SAndroid Build Coastguard Worker $for M in range(0, SUBMR, 4): 246*4bdc9457SAndroid Build Coastguard Worker vout${ABC[M:M+4]} = vmaxq_f32(vout${ABC[M:M+4]}, vmin); 247*4bdc9457SAndroid Build Coastguard Worker 248*4bdc9457SAndroid Build Coastguard Worker $if SUBMR == 1: 249*4bdc9457SAndroid Build Coastguard Worker vst1_lane_f32(output, vout${ABC[0:1]}, 1); 250*4bdc9457SAndroid Build Coastguard Worker $elif SUBMR == 2: 251*4bdc9457SAndroid Build Coastguard Worker vst1_f32(output, vout${ABC[0:2]}); 252*4bdc9457SAndroid Build Coastguard Worker $else: 253*4bdc9457SAndroid Build Coastguard Worker $for M in range(0, SUBMR, 4): 254*4bdc9457SAndroid Build Coastguard Worker vst1q_f32(output + ${M}, vout${ABC[M:M+4]}); 255*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output + output_stride); 256*4bdc9457SAndroid Build Coastguard Worker n -= 1; 257*4bdc9457SAndroid Build Coastguard Worker } while (n != 0); 258*4bdc9457SAndroid Build Coastguard Worker } 259*4bdc9457SAndroid Build Coastguard Worker output = (float*restrict) ((uintptr_t) output - output_decrement); 260*4bdc9457SAndroid Build Coastguard Worker input += ${SUBMR}; 261*4bdc9457SAndroid Build Coastguard Worker } 262*4bdc9457SAndroid Build Coastguard Worker } 263*4bdc9457SAndroid Build Coastguard Worker} 264