1*4bdc9457SAndroid Build Coastguard Worker// Copyright 2020 Google LLC 2*4bdc9457SAndroid Build Coastguard Worker// 3*4bdc9457SAndroid Build Coastguard Worker// This source code is licensed under the BSD-style license found in the 4*4bdc9457SAndroid Build Coastguard Worker// LICENSE file in the root directory of this source tree. 5*4bdc9457SAndroid Build Coastguard Worker 6*4bdc9457SAndroid Build Coastguard Worker$assert ROW_TILE >= 1 7*4bdc9457SAndroid Build Coastguard Worker$assert ACCUMULATORS >= 1 8*4bdc9457SAndroid Build Coastguard Worker#include <assert.h> 9*4bdc9457SAndroid Build Coastguard Worker 10*4bdc9457SAndroid Build Coastguard Worker#include <arm_neon.h> 11*4bdc9457SAndroid Build Coastguard Worker 12*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/dwconv.h> 13*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/math.h> 14*4bdc9457SAndroid Build Coastguard Worker 15*4bdc9457SAndroid Build Coastguard Worker 16*4bdc9457SAndroid Build Coastguard Workervoid xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_${ROW_TILE}x4${"_acc%d" % ACCUMULATORS if ACCUMULATORS > 1 else ""}( 17*4bdc9457SAndroid Build Coastguard Worker size_t input_height, 18*4bdc9457SAndroid Build Coastguard Worker size_t input_width, 19*4bdc9457SAndroid Build Coastguard Worker const void* input, 20*4bdc9457SAndroid Build Coastguard Worker const void* weights, 21*4bdc9457SAndroid Build Coastguard Worker const void* zero, 22*4bdc9457SAndroid Build Coastguard Worker void* output, 23*4bdc9457SAndroid Build Coastguard Worker uint32_t padding_top, 24*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS 25*4bdc9457SAndroid Build Coastguard Worker{ 26*4bdc9457SAndroid Build Coastguard Worker assert(input_height != 0); 27*4bdc9457SAndroid Build Coastguard Worker assert(input_width != 0); 28*4bdc9457SAndroid Build Coastguard Worker assert(input_width % sizeof(__fp16) == 0); 29*4bdc9457SAndroid Build Coastguard Worker assert(padding_top >= 0); 30*4bdc9457SAndroid Build Coastguard Worker assert(padding_top <= 1); 31*4bdc9457SAndroid Build Coastguard Worker 32*4bdc9457SAndroid Build Coastguard Worker const uint16x4_t vmask_even = vld1_u16(params->neonfp16arith.mask_even); 33*4bdc9457SAndroid Build Coastguard Worker const uint16x4_t vmask_odd = vld1_u16(params->neonfp16arith.mask_odd); 34*4bdc9457SAndroid Build Coastguard Worker const float16x4_t vmax = vld1_dup_f16(¶ms->neonfp16arith.max); 35*4bdc9457SAndroid Build Coastguard Worker const float16x4_t vmin = vld1_dup_f16(¶ms->neonfp16arith.min); 36*4bdc9457SAndroid Build Coastguard Worker 37*4bdc9457SAndroid Build Coastguard Worker const __fp16* w0 = (const __fp16*)weights; 38*4bdc9457SAndroid Build Coastguard Worker const float16x8_t vw01234567 = vld1q_f16(w0); 39*4bdc9457SAndroid Build Coastguard Worker const float16x4_t vw89 = vreinterpret_f16_u32(vld1_lane_u32((const void*)(w0 + 8), vmov_n_u32(0), 0)); 40*4bdc9457SAndroid Build Coastguard Worker 41*4bdc9457SAndroid Build Coastguard Worker const size_t input_decrement = round_down_po2(input_width, 4 /* SIMD output width */ * 2 /* subsampling */ * sizeof(__fp16)); 42*4bdc9457SAndroid Build Coastguard Worker $if ROW_TILE > 1: 43*4bdc9457SAndroid Build Coastguard Worker const size_t output_width = round_down_po2((input_width + (2 /* padding */ - 3 /* kernel size */ + 2 /* subsampling */) * sizeof(__fp16)) / 2, sizeof(__fp16)); 44*4bdc9457SAndroid Build Coastguard Worker 45*4bdc9457SAndroid Build Coastguard Worker const __fp16* i0 = (const __fp16*) ((uintptr_t) input - ((-padding_top) & input_width)); 46*4bdc9457SAndroid Build Coastguard Worker const __fp16* i1 = (const __fp16*) ((uintptr_t) i0 + input_width); 47*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(padding_top != 0) { 48*4bdc9457SAndroid Build Coastguard Worker i0 = zero; 49*4bdc9457SAndroid Build Coastguard Worker } 50*4bdc9457SAndroid Build Coastguard Worker $for M in range(2, 1 + 2 * ROW_TILE): 51*4bdc9457SAndroid Build Coastguard Worker const __fp16* i${M} = (const __fp16*) ((uintptr_t) i${M-1} + input_width); 52*4bdc9457SAndroid Build Coastguard Worker 53*4bdc9457SAndroid Build Coastguard Worker __fp16* o0 = output; 54*4bdc9457SAndroid Build Coastguard Worker $for M in range(1, ROW_TILE): 55*4bdc9457SAndroid Build Coastguard Worker __fp16* o${M} = (__fp16*) ((uintptr_t) o${M-1} + output_width); 56*4bdc9457SAndroid Build Coastguard Worker 57*4bdc9457SAndroid Build Coastguard Worker size_t padded_input_height = input_height + padding_top + 1 /* padding bottom */; 58*4bdc9457SAndroid Build Coastguard Worker size_t output_height = (padded_input_height - 3 /* kernel size */ + 2 /* subsampling */) / 2; 59*4bdc9457SAndroid Build Coastguard Worker do { 60*4bdc9457SAndroid Build Coastguard Worker $for M in range(2, 1 + 2 * ROW_TILE): 61*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(padded_input_height < ${2 + M}) { 62*4bdc9457SAndroid Build Coastguard Worker i${M} = zero; 63*4bdc9457SAndroid Build Coastguard Worker $if M % 2 == 1: 64*4bdc9457SAndroid Build Coastguard Worker o${(M - 1) // 2} = o${(M - 1) // 2 - 1}; 65*4bdc9457SAndroid Build Coastguard Worker } 66*4bdc9457SAndroid Build Coastguard Worker 67*4bdc9457SAndroid Build Coastguard Worker $for M in range(1 + 2 * ROW_TILE): 68*4bdc9457SAndroid Build Coastguard Worker float16x4_t vi${M}x1357 = vmov_n_f16(0); 69*4bdc9457SAndroid Build Coastguard Worker 70*4bdc9457SAndroid Build Coastguard Worker size_t w = input_width; 71*4bdc9457SAndroid Build Coastguard Worker for (; w >= 8 * sizeof(__fp16); w -= 8 * sizeof(__fp16)) { 72*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 73*4bdc9457SAndroid Build Coastguard Worker float16x4_t vo${M}p0 = vdup_laneq_f16(vw01234567, 0); 74*4bdc9457SAndroid Build Coastguard Worker 75*4bdc9457SAndroid Build Coastguard Worker $for M in range(1 + 2 * ROW_TILE): 76*4bdc9457SAndroid Build Coastguard Worker const float16x4x2_t vi${M}x8ACE9BDF = vld2_f16(i${M}); i${M} += 8; 77*4bdc9457SAndroid Build Coastguard Worker 78*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 79*4bdc9457SAndroid Build Coastguard Worker $if ACCUMULATORS > 1: 80*4bdc9457SAndroid Build Coastguard Worker float16x4_t vo${M}p1 = vmul_laneq_f16(vi${2*M}x8ACE9BDF.val[0], vw01234567, 2); 81*4bdc9457SAndroid Build Coastguard Worker $else: 82*4bdc9457SAndroid Build Coastguard Worker vo${M}p0 = vfma_laneq_f16(vo${M}p0, vi${2*M}x8ACE9BDF.val[0], vw01234567, 2); 83*4bdc9457SAndroid Build Coastguard Worker 84*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 85*4bdc9457SAndroid Build Coastguard Worker $if ACCUMULATORS > 2: 86*4bdc9457SAndroid Build Coastguard Worker float16x4_t vo${M}p2 = vmul_laneq_f16(vi${2*M+1}x8ACE9BDF.val[0], vw01234567, 5); 87*4bdc9457SAndroid Build Coastguard Worker $else: 88*4bdc9457SAndroid Build Coastguard Worker vo${M}p0 = vfma_laneq_f16(vo${M}p0, vi${2*M+1}x8ACE9BDF.val[0], vw01234567, 5); 89*4bdc9457SAndroid Build Coastguard Worker 90*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 91*4bdc9457SAndroid Build Coastguard Worker $if ACCUMULATORS > 3: 92*4bdc9457SAndroid Build Coastguard Worker float16x4_t vo${M}p3 = vmul_lane_f16(vi${2*M+2}x8ACE9BDF.val[0], vw89, 0); 93*4bdc9457SAndroid Build Coastguard Worker $else: 94*4bdc9457SAndroid Build Coastguard Worker vo${M}p${4 % ACCUMULATORS} = vfma_lane_f16(vo${M}p${4 % ACCUMULATORS}, vi${2*M+2}x8ACE9BDF.val[0], vw89, 0); 95*4bdc9457SAndroid Build Coastguard Worker 96*4bdc9457SAndroid Build Coastguard Worker $for M in range(1 + 2 * ROW_TILE): 97*4bdc9457SAndroid Build Coastguard Worker const float16x4_t vi${M}x7BDF = vext_f16(vi${M}x1357, vi${M}x8ACE9BDF.val[1], 3); 98*4bdc9457SAndroid Build Coastguard Worker vi${M}x1357 = vi${M}x8ACE9BDF.val[1]; 99*4bdc9457SAndroid Build Coastguard Worker 100*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 101*4bdc9457SAndroid Build Coastguard Worker $if ACCUMULATORS > 4: 102*4bdc9457SAndroid Build Coastguard Worker float16x4_t vo${M}p4 = vmul_laneq_f16(vi${2*M}x7BDF, vw01234567, 1); 103*4bdc9457SAndroid Build Coastguard Worker $else: 104*4bdc9457SAndroid Build Coastguard Worker vo${M}p${5 % ACCUMULATORS} = vfma_laneq_f16(vo${M}p${5 % ACCUMULATORS}, vi${2*M}x7BDF, vw01234567, 1); 105*4bdc9457SAndroid Build Coastguard Worker 106*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 107*4bdc9457SAndroid Build Coastguard Worker $if ACCUMULATORS > 5: 108*4bdc9457SAndroid Build Coastguard Worker float16x4_t vo${M}p5 = vmul_laneq_f16(vi${2*M+1}x7BDF, vw01234567, 4); 109*4bdc9457SAndroid Build Coastguard Worker $else: 110*4bdc9457SAndroid Build Coastguard Worker vo${M}p${6 % ACCUMULATORS} = vfma_laneq_f16(vo${M}p${6 % ACCUMULATORS}, vi${2*M+1}x7BDF, vw01234567, 4); 111*4bdc9457SAndroid Build Coastguard Worker 112*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 113*4bdc9457SAndroid Build Coastguard Worker $if ACCUMULATORS > 6: 114*4bdc9457SAndroid Build Coastguard Worker float16x4_t vo${M}p6 = vmul_laneq_f16(vi${2*M+2}x7BDF, vw01234567, 5); 115*4bdc9457SAndroid Build Coastguard Worker $else: 116*4bdc9457SAndroid Build Coastguard Worker vo${M}p${7 % ACCUMULATORS} = vfma_laneq_f16(vo${M}p${7 % ACCUMULATORS}, vi${2*M+2}x7BDF, vw01234567, 7); 117*4bdc9457SAndroid Build Coastguard Worker 118*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 119*4bdc9457SAndroid Build Coastguard Worker vo${M}p${8 % ACCUMULATORS} = vfma_laneq_f16(vo${M}p${8 % ACCUMULATORS}, vi${2*M}x8ACE9BDF.val[1], vw01234567, 3); 120*4bdc9457SAndroid Build Coastguard Worker 121*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 122*4bdc9457SAndroid Build Coastguard Worker vo${M}p${9 % ACCUMULATORS} = vfma_laneq_f16(vo${M}p${9 % ACCUMULATORS}, vi${2*M+1}x8ACE9BDF.val[1], vw01234567, 6); 123*4bdc9457SAndroid Build Coastguard Worker 124*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 125*4bdc9457SAndroid Build Coastguard Worker vo${M}p${10 % ACCUMULATORS} = vfma_lane_f16(vo${M}p${10 % ACCUMULATORS}, vi${2*M+2}x8ACE9BDF.val[1], vw89, 1); 126*4bdc9457SAndroid Build Coastguard Worker 127*4bdc9457SAndroid Build Coastguard Worker $if ACCUMULATORS > 1: 128*4bdc9457SAndroid Build Coastguard Worker $ACC_SLICE = 1 129*4bdc9457SAndroid Build Coastguard Worker $while ACC_SLICE < ACCUMULATORS: 130*4bdc9457SAndroid Build Coastguard Worker $for A in range(0, ACCUMULATORS, ACC_SLICE * 2): 131*4bdc9457SAndroid Build Coastguard Worker $if A + ACC_SLICE < ACCUMULATORS: 132*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 133*4bdc9457SAndroid Build Coastguard Worker vo${M}p${A} = vadd_f16(vo${M}p${A}, vo${M}p${A + ACC_SLICE}); 134*4bdc9457SAndroid Build Coastguard Worker $ACC_SLICE *= 2 135*4bdc9457SAndroid Build Coastguard Worker 136*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 137*4bdc9457SAndroid Build Coastguard Worker float16x4_t vo${M} = vmax_f16(vo${M}p0, vmin); 138*4bdc9457SAndroid Build Coastguard Worker 139*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 140*4bdc9457SAndroid Build Coastguard Worker vo${M} = vmin_f16(vo${M}, vmax); 141*4bdc9457SAndroid Build Coastguard Worker 142*4bdc9457SAndroid Build Coastguard Worker $for M in reversed(range(ROW_TILE)): 143*4bdc9457SAndroid Build Coastguard Worker vst1_f16(o${M}, vo${M}); o${M} += 4; 144*4bdc9457SAndroid Build Coastguard Worker } 145*4bdc9457SAndroid Build Coastguard Worker // Last block has 0-7 pixels to process. 146*4bdc9457SAndroid Build Coastguard Worker assert(w < 8 * sizeof(__fp16)); 147*4bdc9457SAndroid Build Coastguard Worker if XNN_LIKELY(w != 0) { 148*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 149*4bdc9457SAndroid Build Coastguard Worker float16x4_t vo${M}p0 = vdup_laneq_f16(vw01234567, 0); 150*4bdc9457SAndroid Build Coastguard Worker 151*4bdc9457SAndroid Build Coastguard Worker $for M in range(1 + 2 * ROW_TILE): 152*4bdc9457SAndroid Build Coastguard Worker const float16x4x2_t vi${M}x8ACE9BDF = vld2_f16(i${M}); 153*4bdc9457SAndroid Build Coastguard Worker 154*4bdc9457SAndroid Build Coastguard Worker $for M in range(1 + 2 * ROW_TILE): 155*4bdc9457SAndroid Build Coastguard Worker const float16x4_t vi${M}x8ACE = vreinterpret_f16_u16(vand_u16(vmask_even, vreinterpret_u16_f16(vi${M}x8ACE9BDF.val[0]))); 156*4bdc9457SAndroid Build Coastguard Worker const float16x4_t vi${M}x9BDF = vreinterpret_f16_u16(vand_u16(vmask_odd, vreinterpret_u16_f16(vi${M}x8ACE9BDF.val[1]))); 157*4bdc9457SAndroid Build Coastguard Worker 158*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 159*4bdc9457SAndroid Build Coastguard Worker $if ACCUMULATORS > 1: 160*4bdc9457SAndroid Build Coastguard Worker float16x4_t vo${M}p1 = vmul_laneq_f16(vi${2*M}x8ACE, vw01234567, 2); 161*4bdc9457SAndroid Build Coastguard Worker $else: 162*4bdc9457SAndroid Build Coastguard Worker vo${M}p0 = vfma_laneq_f16(vo${M}p0, vi${2*M}x8ACE, vw01234567, 2); 163*4bdc9457SAndroid Build Coastguard Worker 164*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 165*4bdc9457SAndroid Build Coastguard Worker $if ACCUMULATORS > 2: 166*4bdc9457SAndroid Build Coastguard Worker float16x4_t vo${M}p2 = vmul_laneq_f16(vi${2*M+1}x8ACE, vw01234567, 5); 167*4bdc9457SAndroid Build Coastguard Worker $else: 168*4bdc9457SAndroid Build Coastguard Worker vo${M}p0 = vfma_laneq_f16(vo${M}p0, vi${2*M+1}x8ACE, vw01234567, 5); 169*4bdc9457SAndroid Build Coastguard Worker 170*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 171*4bdc9457SAndroid Build Coastguard Worker $if ACCUMULATORS > 3: 172*4bdc9457SAndroid Build Coastguard Worker float16x4_t vo${M}p3 = vmul_lane_f16(vi${2*M+2}x8ACE, vw89, 0); 173*4bdc9457SAndroid Build Coastguard Worker $else: 174*4bdc9457SAndroid Build Coastguard Worker vo${M}p${4 % ACCUMULATORS} = vfma_lane_f16(vo${M}p${4 % ACCUMULATORS}, vi${2*M+2}x8ACE, vw89, 0); 175*4bdc9457SAndroid Build Coastguard Worker 176*4bdc9457SAndroid Build Coastguard Worker $for M in range(1 + 2 * ROW_TILE): 177*4bdc9457SAndroid Build Coastguard Worker const float16x4_t vi${M}x7BDF = vext_f16(vi${M}x1357, vi${M}x9BDF, 3); 178*4bdc9457SAndroid Build Coastguard Worker 179*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 180*4bdc9457SAndroid Build Coastguard Worker $if ACCUMULATORS > 4: 181*4bdc9457SAndroid Build Coastguard Worker float16x4_t vo${M}p4 = vmul_laneq_f16(vi${2*M}x7BDF, vw01234567, 1); 182*4bdc9457SAndroid Build Coastguard Worker $else: 183*4bdc9457SAndroid Build Coastguard Worker vo${M}p${5 % ACCUMULATORS} = vfma_laneq_f16(vo${M}p${5 % ACCUMULATORS}, vi${2*M}x7BDF, vw01234567, 1); 184*4bdc9457SAndroid Build Coastguard Worker 185*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 186*4bdc9457SAndroid Build Coastguard Worker $if ACCUMULATORS > 5: 187*4bdc9457SAndroid Build Coastguard Worker float16x4_t vo${M}p5 = vmul_laneq_f16(vi${2*M+1}x7BDF, vw01234567, 4); 188*4bdc9457SAndroid Build Coastguard Worker $else: 189*4bdc9457SAndroid Build Coastguard Worker vo${M}p${6 % ACCUMULATORS} = vfma_laneq_f16(vo${M}p${6 % ACCUMULATORS}, vi${2*M+1}x7BDF, vw01234567, 4); 190*4bdc9457SAndroid Build Coastguard Worker 191*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 192*4bdc9457SAndroid Build Coastguard Worker $if ACCUMULATORS > 6: 193*4bdc9457SAndroid Build Coastguard Worker float16x4_t vo${M}p6 = vmul_laneq_f16(vi${2*M+2}x7BDF, vw01234567, 5); 194*4bdc9457SAndroid Build Coastguard Worker $else: 195*4bdc9457SAndroid Build Coastguard Worker vo${M}p${7 % ACCUMULATORS} = vfma_laneq_f16(vo${M}p${7 % ACCUMULATORS}, vi${2*M+2}x7BDF, vw01234567, 7); 196*4bdc9457SAndroid Build Coastguard Worker 197*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 198*4bdc9457SAndroid Build Coastguard Worker vo${M}p${8 % ACCUMULATORS} = vfma_laneq_f16(vo${M}p${8 % ACCUMULATORS}, vi${2*M}x9BDF, vw01234567, 3); 199*4bdc9457SAndroid Build Coastguard Worker 200*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 201*4bdc9457SAndroid Build Coastguard Worker vo${M}p${9 % ACCUMULATORS} = vfma_laneq_f16(vo${M}p${9 % ACCUMULATORS}, vi${2*M+1}x9BDF, vw01234567, 6); 202*4bdc9457SAndroid Build Coastguard Worker 203*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 204*4bdc9457SAndroid Build Coastguard Worker vo${M}p${10 % ACCUMULATORS} = vfma_lane_f16(vo${M}p${10 % ACCUMULATORS}, vi${2*M+2}x9BDF, vw89, 1); 205*4bdc9457SAndroid Build Coastguard Worker 206*4bdc9457SAndroid Build Coastguard Worker $if ACCUMULATORS > 1: 207*4bdc9457SAndroid Build Coastguard Worker $ACC_SLICE = 1 208*4bdc9457SAndroid Build Coastguard Worker $while ACC_SLICE < ACCUMULATORS: 209*4bdc9457SAndroid Build Coastguard Worker $for A in range(0, ACCUMULATORS, ACC_SLICE * 2): 210*4bdc9457SAndroid Build Coastguard Worker $if A + ACC_SLICE < ACCUMULATORS: 211*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 212*4bdc9457SAndroid Build Coastguard Worker vo${M}p${A} = vadd_f16(vo${M}p${A}, vo${M}p${A + ACC_SLICE}); 213*4bdc9457SAndroid Build Coastguard Worker $ACC_SLICE *= 2 214*4bdc9457SAndroid Build Coastguard Worker 215*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 216*4bdc9457SAndroid Build Coastguard Worker float16x4_t vo${M} = vmax_f16(vo${M}p0, vmin); 217*4bdc9457SAndroid Build Coastguard Worker 218*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 219*4bdc9457SAndroid Build Coastguard Worker vo${M} = vmin_f16(vo${M}, vmax); 220*4bdc9457SAndroid Build Coastguard Worker 221*4bdc9457SAndroid Build Coastguard Worker w += 1 * sizeof(__fp16); 222*4bdc9457SAndroid Build Coastguard Worker 223*4bdc9457SAndroid Build Coastguard Worker if XNN_LIKELY(w == 8 * sizeof(__fp16)) { 224*4bdc9457SAndroid Build Coastguard Worker $for M in reversed(range(ROW_TILE)): 225*4bdc9457SAndroid Build Coastguard Worker vst1_f16(o${M}, vo${M}); o${M} += 4; 226*4bdc9457SAndroid Build Coastguard Worker } else { 227*4bdc9457SAndroid Build Coastguard Worker if (w & (4 * sizeof(__fp16))) { 228*4bdc9457SAndroid Build Coastguard Worker $for M in reversed(range(ROW_TILE)): 229*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u32((void*) o${M}, vreinterpret_u32_f16(vo${M}), 0); o${M} += 2; 230*4bdc9457SAndroid Build Coastguard Worker 231*4bdc9457SAndroid Build Coastguard Worker $for M in range(ROW_TILE): 232*4bdc9457SAndroid Build Coastguard Worker vo${M} = vext_f16(vo${M}, vo${M}, 2); 233*4bdc9457SAndroid Build Coastguard Worker } 234*4bdc9457SAndroid Build Coastguard Worker if (w & (2 * sizeof(__fp16))) { 235*4bdc9457SAndroid Build Coastguard Worker $for M in reversed(range(ROW_TILE)): 236*4bdc9457SAndroid Build Coastguard Worker vst1_lane_f16(o${M}, vo${M}, 0); o${M} += 1; 237*4bdc9457SAndroid Build Coastguard Worker } 238*4bdc9457SAndroid Build Coastguard Worker } 239*4bdc9457SAndroid Build Coastguard Worker 240*4bdc9457SAndroid Build Coastguard Worker } 241*4bdc9457SAndroid Build Coastguard Worker 242*4bdc9457SAndroid Build Coastguard Worker i0 = (const __fp16*) ((uintptr_t) i${2 * ROW_TILE} - input_decrement); 243*4bdc9457SAndroid Build Coastguard Worker $for M in range(1, 1 + 2 * ROW_TILE): 244*4bdc9457SAndroid Build Coastguard Worker i${M} = (const __fp16*) ((uintptr_t) i${M-1} + input_width); 245*4bdc9457SAndroid Build Coastguard Worker 246*4bdc9457SAndroid Build Coastguard Worker $if ROW_TILE > 1: 247*4bdc9457SAndroid Build Coastguard Worker o0 = o${ROW_TILE - 1}; 248*4bdc9457SAndroid Build Coastguard Worker $for M in range(1, ROW_TILE): 249*4bdc9457SAndroid Build Coastguard Worker o${M} = (__fp16*) ((uintptr_t) o${M-1} + output_width); 250*4bdc9457SAndroid Build Coastguard Worker 251*4bdc9457SAndroid Build Coastguard Worker $if ROW_TILE > 1: 252*4bdc9457SAndroid Build Coastguard Worker output_height = doz(output_height, ${ROW_TILE}); 253*4bdc9457SAndroid Build Coastguard Worker padded_input_height = doz(padded_input_height, ${ROW_TILE * 2}); 254*4bdc9457SAndroid Build Coastguard Worker $else: 255*4bdc9457SAndroid Build Coastguard Worker output_height -= 1; 256*4bdc9457SAndroid Build Coastguard Worker padded_input_height -= 2; 257*4bdc9457SAndroid Build Coastguard Worker } while (output_height != 0); 258*4bdc9457SAndroid Build Coastguard Worker} 259