1*4bdc9457SAndroid Build Coastguard Worker// Copyright 2022 Google LLC 2*4bdc9457SAndroid Build Coastguard Worker// 3*4bdc9457SAndroid Build Coastguard Worker// This source code is licensed under the BSD-style license found in the 4*4bdc9457SAndroid Build Coastguard Worker// LICENSE file in the root directory of this source tree. 5*4bdc9457SAndroid Build Coastguard Worker 6*4bdc9457SAndroid Build Coastguard Worker$assert PIXEL_TILE >= 1 7*4bdc9457SAndroid Build Coastguard Worker$assert PIXEL_TILE % 4 == 0 8*4bdc9457SAndroid Build Coastguard Worker$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 9*4bdc9457SAndroid Build Coastguard Worker#include <assert.h> 10*4bdc9457SAndroid Build Coastguard Worker 11*4bdc9457SAndroid Build Coastguard Worker#include <arm_neon.h> 12*4bdc9457SAndroid Build Coastguard Worker 13*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/ibilinear.h> 14*4bdc9457SAndroid Build Coastguard Worker 15*4bdc9457SAndroid Build Coastguard Worker 16*4bdc9457SAndroid Build Coastguard Workervoid xnn_f16_ibilinear_chw_ukernel__neonfp16arith_p${PIXEL_TILE}( 17*4bdc9457SAndroid Build Coastguard Worker size_t output_pixels, 18*4bdc9457SAndroid Build Coastguard Worker size_t channels, 19*4bdc9457SAndroid Build Coastguard Worker const void**restrict input, 20*4bdc9457SAndroid Build Coastguard Worker size_t input_offset, 21*4bdc9457SAndroid Build Coastguard Worker const void*restrict weights, 22*4bdc9457SAndroid Build Coastguard Worker void*restrict output, 23*4bdc9457SAndroid Build Coastguard Worker size_t input_increment) XNN_OOB_READS 24*4bdc9457SAndroid Build Coastguard Worker{ 25*4bdc9457SAndroid Build Coastguard Worker assert(output_pixels != 0); 26*4bdc9457SAndroid Build Coastguard Worker assert(channels != 0); 27*4bdc9457SAndroid Build Coastguard Worker assert(input_increment % sizeof(__fp16) == 0); 28*4bdc9457SAndroid Build Coastguard Worker 29*4bdc9457SAndroid Build Coastguard Worker __fp16* o = (__fp16*) output; 30*4bdc9457SAndroid Build Coastguard Worker do { 31*4bdc9457SAndroid Build Coastguard Worker const __fp16** i = (const __fp16**)input; 32*4bdc9457SAndroid Build Coastguard Worker const __fp16* w = weights; 33*4bdc9457SAndroid Build Coastguard Worker size_t p = output_pixels; 34*4bdc9457SAndroid Build Coastguard Worker 35*4bdc9457SAndroid Build Coastguard Worker $if PIXEL_TILE > 4: 36*4bdc9457SAndroid Build Coastguard Worker for (; p >= ${PIXEL_TILE}; p -= ${PIXEL_TILE}) { 37*4bdc9457SAndroid Build Coastguard Worker $for P in range(PIXEL_TILE): 38*4bdc9457SAndroid Build Coastguard Worker const __fp16* itl${ABC[P]} = (const __fp16*) ((uintptr_t) i[${2 * P}] + input_offset); 39*4bdc9457SAndroid Build Coastguard Worker const __fp16* ibl${ABC[P]} = (const __fp16*) ((uintptr_t) i[${2 * P + 1}] + input_offset); 40*4bdc9457SAndroid Build Coastguard Worker i += 2 * ${PIXEL_TILE}; 41*4bdc9457SAndroid Build Coastguard Worker 42*4bdc9457SAndroid Build Coastguard Worker $for P in range(0, PIXEL_TILE, 4): 43*4bdc9457SAndroid Build Coastguard Worker const float16x4x2_t vw${ABC[P:P+4]} = vld2_f16(w + ${2 * P}); 44*4bdc9457SAndroid Build Coastguard Worker w += 2 * ${PIXEL_TILE}; 45*4bdc9457SAndroid Build Coastguard Worker 46*4bdc9457SAndroid Build Coastguard Worker $for P in range(0, PIXEL_TILE, 4): 47*4bdc9457SAndroid Build Coastguard Worker float16x8_t vtltr${ABC[P:P+4]} = vmovq_n_f16(0); // vmov for uninitialized var warning 48*4bdc9457SAndroid Build Coastguard Worker float16x8_t vblbr${ABC[P:P+4]} = vmovq_n_f16(0); 49*4bdc9457SAndroid Build Coastguard Worker $for L in range(0, 4): 50*4bdc9457SAndroid Build Coastguard Worker vtltr${ABC[P:P+4]} = vreinterpretq_f16_u32(vld1q_lane_u32((const void*) itl${ABC[P+L]}, vreinterpretq_u32_f16(vtltr${ABC[P:P+4]}), ${L})); 51*4bdc9457SAndroid Build Coastguard Worker vblbr${ABC[P:P+4]} = vreinterpretq_f16_u32(vld1q_lane_u32((const void*) ibl${ABC[P+L]}, vreinterpretq_u32_f16(vblbr${ABC[P:P+4]}), ${L})); 52*4bdc9457SAndroid Build Coastguard Worker 53*4bdc9457SAndroid Build Coastguard Worker $for P in range(0, PIXEL_TILE, 8): 54*4bdc9457SAndroid Build Coastguard Worker const float16x8_t valphah${ABC[P:P+8]} = vcombine_f16(vw${ABC[P:P+4]}.val[0], vw${ABC[P+4:P+8]}.val[0]); 55*4bdc9457SAndroid Build Coastguard Worker const float16x8_t valphav${ABC[P:P+8]} = vcombine_f16(vw${ABC[P:P+4]}.val[1], vw${ABC[P+4:P+8]}.val[1]); 56*4bdc9457SAndroid Build Coastguard Worker 57*4bdc9457SAndroid Build Coastguard Worker $for P in range(0, PIXEL_TILE, 4): 58*4bdc9457SAndroid Build Coastguard Worker const float16x8_t vldrd${ABC[P:P+4]} = vsubq_f16(vblbr${ABC[P:P+4]}, vtltr${ABC[P:P+4]}); 59*4bdc9457SAndroid Build Coastguard Worker 60*4bdc9457SAndroid Build Coastguard Worker $for P in range(0, PIXEL_TILE, 8): 61*4bdc9457SAndroid Build Coastguard Worker const float16x8x2_t vld_t${ABC[P:P+8]} = vuzpq_f16(vldrd${ABC[P:P+4]}, vldrd${ABC[P+4:P+8]}); 62*4bdc9457SAndroid Build Coastguard Worker const float16x8_t vld${ABC[P:P+8]} = vld_t${ABC[P:P+8]}.val[0]; 63*4bdc9457SAndroid Build Coastguard Worker const float16x8_t vrd${ABC[P:P+8]} = vld_t${ABC[P:P+8]}.val[1]; 64*4bdc9457SAndroid Build Coastguard Worker 65*4bdc9457SAndroid Build Coastguard Worker $for P in range(0, PIXEL_TILE, 8): 66*4bdc9457SAndroid Build Coastguard Worker const float16x8x2_t vtl_t${ABC[P:P+8]} = vuzpq_f16(vtltr${ABC[P:P+4]}, vtltr${ABC[P+4:P+8]}); 67*4bdc9457SAndroid Build Coastguard Worker const float16x8_t vtl${ABC[P:P+8]} = vtl_t${ABC[P:P+8]}.val[0]; 68*4bdc9457SAndroid Build Coastguard Worker const float16x8_t vtr${ABC[P:P+8]} = vtl_t${ABC[P:P+8]}.val[1]; 69*4bdc9457SAndroid Build Coastguard Worker 70*4bdc9457SAndroid Build Coastguard Worker $for P in range(0, PIXEL_TILE, 8): 71*4bdc9457SAndroid Build Coastguard Worker const float16x8_t vl${ABC[P:P+8]} = vfmaq_f16(vtl${ABC[P:P+8]}, vld${ABC[P:P+8]}, valphav${ABC[P:P+8]}); 72*4bdc9457SAndroid Build Coastguard Worker const float16x8_t vr${ABC[P:P+8]} = vfmaq_f16(vtr${ABC[P:P+8]}, vrd${ABC[P:P+8]}, valphav${ABC[P:P+8]}); 73*4bdc9457SAndroid Build Coastguard Worker 74*4bdc9457SAndroid Build Coastguard Worker $for P in range(0, PIXEL_TILE, 8): 75*4bdc9457SAndroid Build Coastguard Worker const float16x8_t vd${ABC[P:P+8]} = vsubq_f16(vr${ABC[P:P+8]}, vl${ABC[P:P+8]}); 76*4bdc9457SAndroid Build Coastguard Worker $for P in range(0, PIXEL_TILE, 8): 77*4bdc9457SAndroid Build Coastguard Worker const float16x8_t vo${ABC[P:P+8]} = vfmaq_f16(vl${ABC[P:P+8]}, vd${ABC[P:P+8]}, valphah${ABC[P:P+8]}); 78*4bdc9457SAndroid Build Coastguard Worker 79*4bdc9457SAndroid Build Coastguard Worker $for P in range(0, PIXEL_TILE, 8): 80*4bdc9457SAndroid Build Coastguard Worker vst1q_f16(o + ${P}, vo${ABC[P:P+8]}); 81*4bdc9457SAndroid Build Coastguard Worker o += ${PIXEL_TILE}; 82*4bdc9457SAndroid Build Coastguard Worker } 83*4bdc9457SAndroid Build Coastguard Worker 84*4bdc9457SAndroid Build Coastguard Worker for (; p >= 4; p -= 4) { 85*4bdc9457SAndroid Build Coastguard Worker $for P in range(4): 86*4bdc9457SAndroid Build Coastguard Worker const __fp16* itl${ABC[P]} = (const __fp16*) ((uintptr_t) i[${2 * P}] + input_offset); 87*4bdc9457SAndroid Build Coastguard Worker const __fp16* ibl${ABC[P]} = (const __fp16*) ((uintptr_t) i[${2 * P + 1}] + input_offset); 88*4bdc9457SAndroid Build Coastguard Worker i += 8; 89*4bdc9457SAndroid Build Coastguard Worker 90*4bdc9457SAndroid Build Coastguard Worker const float16x4x2_t vw = vld2_f16(w); 91*4bdc9457SAndroid Build Coastguard Worker w += 8; 92*4bdc9457SAndroid Build Coastguard Worker 93*4bdc9457SAndroid Build Coastguard Worker float16x8_t vtltr = vmovq_n_f16(0); // vmov for uninitialized var warning 94*4bdc9457SAndroid Build Coastguard Worker float16x8_t vblbr = vmovq_n_f16(0); 95*4bdc9457SAndroid Build Coastguard Worker $for P in range(0, 4): 96*4bdc9457SAndroid Build Coastguard Worker vtltr = vreinterpretq_f16_u32(vld1q_lane_u32((const void*) itl${ABC[P]}, vreinterpretq_u32_f16(vtltr), ${P})); 97*4bdc9457SAndroid Build Coastguard Worker vblbr = vreinterpretq_f16_u32(vld1q_lane_u32((const void*) ibl${ABC[P]}, vreinterpretq_u32_f16(vblbr), ${P})); 98*4bdc9457SAndroid Build Coastguard Worker 99*4bdc9457SAndroid Build Coastguard Worker const float16x4_t valphah = vw.val[0]; 100*4bdc9457SAndroid Build Coastguard Worker const float16x4_t valphav = vw.val[1]; 101*4bdc9457SAndroid Build Coastguard Worker 102*4bdc9457SAndroid Build Coastguard Worker const float16x8_t vldrd = vsubq_f16(vblbr, vtltr); 103*4bdc9457SAndroid Build Coastguard Worker 104*4bdc9457SAndroid Build Coastguard Worker const float16x4x2_t vld_t = vuzp_f16(vget_low_f16(vldrd), vget_high_f16(vldrd)); 105*4bdc9457SAndroid Build Coastguard Worker const float16x4_t vld = vld_t.val[0]; 106*4bdc9457SAndroid Build Coastguard Worker const float16x4_t vrd = vld_t.val[1]; 107*4bdc9457SAndroid Build Coastguard Worker 108*4bdc9457SAndroid Build Coastguard Worker const float16x4x2_t vtl_t = vuzp_f16(vget_low_f16(vtltr), vget_high_f16(vtltr)); 109*4bdc9457SAndroid Build Coastguard Worker const float16x4_t vtl = vtl_t.val[0]; 110*4bdc9457SAndroid Build Coastguard Worker const float16x4_t vtr = vtl_t.val[1]; 111*4bdc9457SAndroid Build Coastguard Worker 112*4bdc9457SAndroid Build Coastguard Worker const float16x4_t vl = vfma_f16(vtl, vld, valphav); 113*4bdc9457SAndroid Build Coastguard Worker const float16x4_t vr = vfma_f16(vtr, vrd, valphav); 114*4bdc9457SAndroid Build Coastguard Worker 115*4bdc9457SAndroid Build Coastguard Worker const float16x4_t vd = vsub_f16(vr, vl); 116*4bdc9457SAndroid Build Coastguard Worker const float16x4_t vo = vfma_f16(vl, vd, valphah); 117*4bdc9457SAndroid Build Coastguard Worker 118*4bdc9457SAndroid Build Coastguard Worker vst1_f16(o, vo); 119*4bdc9457SAndroid Build Coastguard Worker o += 4; 120*4bdc9457SAndroid Build Coastguard Worker } 121*4bdc9457SAndroid Build Coastguard Worker 122*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(p != 0) { 123*4bdc9457SAndroid Build Coastguard Worker if (p & 2) { 124*4bdc9457SAndroid Build Coastguard Worker $for P in range(2): 125*4bdc9457SAndroid Build Coastguard Worker const __fp16* itl${ABC[P]} = (const __fp16*) ((uintptr_t) i[${2 * P}] + input_offset); 126*4bdc9457SAndroid Build Coastguard Worker const __fp16* ibl${ABC[P]} = (const __fp16*) ((uintptr_t) i[${2 * P + 1}] + input_offset); 127*4bdc9457SAndroid Build Coastguard Worker i += 4; 128*4bdc9457SAndroid Build Coastguard Worker 129*4bdc9457SAndroid Build Coastguard Worker const float16x4_t vw = vld1_f16(w); 130*4bdc9457SAndroid Build Coastguard Worker w += 4; 131*4bdc9457SAndroid Build Coastguard Worker 132*4bdc9457SAndroid Build Coastguard Worker const float16x4x2_t vwhv = vuzp_f16(vw, vw); 133*4bdc9457SAndroid Build Coastguard Worker const float16x4_t valphah = vwhv.val[0]; 134*4bdc9457SAndroid Build Coastguard Worker const float16x4_t valphav = vwhv.val[1]; 135*4bdc9457SAndroid Build Coastguard Worker 136*4bdc9457SAndroid Build Coastguard Worker float16x4_t vtltr = vmov_n_f16(0); // vmov for uninitialized var warning 137*4bdc9457SAndroid Build Coastguard Worker float16x4_t vblbr = vmov_n_f16(0); 138*4bdc9457SAndroid Build Coastguard Worker 139*4bdc9457SAndroid Build Coastguard Worker $for P in range(0, 2): 140*4bdc9457SAndroid Build Coastguard Worker vtltr = vreinterpret_f16_u32(vld1_lane_u32((const void*) itl${ABC[P]}, vreinterpret_u32_f16(vtltr), ${P})); 141*4bdc9457SAndroid Build Coastguard Worker vblbr = vreinterpret_f16_u32(vld1_lane_u32((const void*) ibl${ABC[P]}, vreinterpret_u32_f16(vblbr), ${P})); 142*4bdc9457SAndroid Build Coastguard Worker 143*4bdc9457SAndroid Build Coastguard Worker const float16x4_t vldrd = vsub_f16(vblbr, vtltr); 144*4bdc9457SAndroid Build Coastguard Worker 145*4bdc9457SAndroid Build Coastguard Worker const float16x4x2_t vld_t = vuzp_f16(vldrd, vldrd); 146*4bdc9457SAndroid Build Coastguard Worker const float16x4_t vld = vld_t.val[0]; 147*4bdc9457SAndroid Build Coastguard Worker const float16x4_t vrd = vld_t.val[1]; 148*4bdc9457SAndroid Build Coastguard Worker 149*4bdc9457SAndroid Build Coastguard Worker const float16x4x2_t vtl_t = vuzp_f16(vtltr, vtltr); 150*4bdc9457SAndroid Build Coastguard Worker const float16x4_t vtl = vtl_t.val[0]; 151*4bdc9457SAndroid Build Coastguard Worker const float16x4_t vtr = vtl_t.val[1]; 152*4bdc9457SAndroid Build Coastguard Worker 153*4bdc9457SAndroid Build Coastguard Worker const float16x4_t vl = vfma_f16(vtl, vld, valphav); 154*4bdc9457SAndroid Build Coastguard Worker const float16x4_t vr = vfma_f16(vtr, vrd, valphav); 155*4bdc9457SAndroid Build Coastguard Worker 156*4bdc9457SAndroid Build Coastguard Worker const float16x4_t vd = vsub_f16(vr, vl); 157*4bdc9457SAndroid Build Coastguard Worker const float16x4_t vo = vfma_f16(vl, vd, valphah); 158*4bdc9457SAndroid Build Coastguard Worker 159*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u32((void*) o, vreinterpret_u32_f16(vo), 0); 160*4bdc9457SAndroid Build Coastguard Worker o += 2; 161*4bdc9457SAndroid Build Coastguard Worker } 162*4bdc9457SAndroid Build Coastguard Worker 163*4bdc9457SAndroid Build Coastguard Worker if (p & 1) { 164*4bdc9457SAndroid Build Coastguard Worker // We are computing the following formula: 165*4bdc9457SAndroid Build Coastguard Worker // result = (1 - alpha_h) * (1 - alpha_v) * top_left + 166*4bdc9457SAndroid Build Coastguard Worker // alpha_h * (1 - alpha_v) * top_right + 167*4bdc9457SAndroid Build Coastguard Worker // (1 - alpha_h) * alpha_v * bottom_left + 168*4bdc9457SAndroid Build Coastguard Worker // alpha_h * alpha_v * bottom_right. 169*4bdc9457SAndroid Build Coastguard Worker // 170*4bdc9457SAndroid Build Coastguard Worker // Rearranging gives 171*4bdc9457SAndroid Build Coastguard Worker // result = left + alpha_h * (right - left), 172*4bdc9457SAndroid Build Coastguard Worker // where 173*4bdc9457SAndroid Build Coastguard Worker // left = top_left + alpha_v * (bottom_left - top_left), 174*4bdc9457SAndroid Build Coastguard Worker // right = top_right + alpha_v * (bottom_right - top_right). 175*4bdc9457SAndroid Build Coastguard Worker 176*4bdc9457SAndroid Build Coastguard Worker const __fp16* itl = (const __fp16*) ((uintptr_t) i[0] + input_offset); 177*4bdc9457SAndroid Build Coastguard Worker const __fp16* ibl = (const __fp16*) ((uintptr_t) i[1] + input_offset); 178*4bdc9457SAndroid Build Coastguard Worker i += 2; 179*4bdc9457SAndroid Build Coastguard Worker 180*4bdc9457SAndroid Build Coastguard Worker float16x4_t vw = vmov_n_f16(0); 181*4bdc9457SAndroid Build Coastguard Worker vw = vreinterpret_f16_u32(vld1_lane_u32((const void*) w, vreinterpret_u32_f16(vw), 0)); 182*4bdc9457SAndroid Build Coastguard Worker w += 2; 183*4bdc9457SAndroid Build Coastguard Worker 184*4bdc9457SAndroid Build Coastguard Worker const float16x4x2_t vwhv = vuzp_f16(vw, vw); 185*4bdc9457SAndroid Build Coastguard Worker const float16x4_t valphah = vwhv.val[0]; 186*4bdc9457SAndroid Build Coastguard Worker const float16x4_t valphav = vwhv.val[1]; 187*4bdc9457SAndroid Build Coastguard Worker 188*4bdc9457SAndroid Build Coastguard Worker float16x4_t vtltr = vmov_n_f16(0); // vmov for uninitialized var warning 189*4bdc9457SAndroid Build Coastguard Worker float16x4_t vblbr = vmov_n_f16(0); 190*4bdc9457SAndroid Build Coastguard Worker 191*4bdc9457SAndroid Build Coastguard Worker vtltr = vreinterpret_f16_u32(vld1_lane_u32((const void*) itl, vreinterpret_u32_f16(vtltr), 0)); 192*4bdc9457SAndroid Build Coastguard Worker vblbr = vreinterpret_f16_u32(vld1_lane_u32((const void*) ibl, vreinterpret_u32_f16(vblbr), 0)); 193*4bdc9457SAndroid Build Coastguard Worker 194*4bdc9457SAndroid Build Coastguard Worker const float16x4_t vldrd = vsub_f16(vblbr, vtltr); 195*4bdc9457SAndroid Build Coastguard Worker 196*4bdc9457SAndroid Build Coastguard Worker const float16x4x2_t vld_t = vuzp_f16(vldrd, vldrd); 197*4bdc9457SAndroid Build Coastguard Worker const float16x4_t vld = vld_t.val[0]; 198*4bdc9457SAndroid Build Coastguard Worker const float16x4_t vrd = vld_t.val[1]; 199*4bdc9457SAndroid Build Coastguard Worker 200*4bdc9457SAndroid Build Coastguard Worker const float16x4x2_t vtl_t = vuzp_f16(vtltr, vtltr); 201*4bdc9457SAndroid Build Coastguard Worker const float16x4_t vtl = vtl_t.val[0]; 202*4bdc9457SAndroid Build Coastguard Worker const float16x4_t vtr = vtl_t.val[1]; 203*4bdc9457SAndroid Build Coastguard Worker 204*4bdc9457SAndroid Build Coastguard Worker const float16x4_t vl = vfma_f16(vtl, vld, valphav); 205*4bdc9457SAndroid Build Coastguard Worker const float16x4_t vr = vfma_f16(vtr, vrd, valphav); 206*4bdc9457SAndroid Build Coastguard Worker 207*4bdc9457SAndroid Build Coastguard Worker const float16x4_t vd = vsub_f16(vr, vl); 208*4bdc9457SAndroid Build Coastguard Worker const float16x4_t vo = vfma_f16(vl, vd, valphah); 209*4bdc9457SAndroid Build Coastguard Worker 210*4bdc9457SAndroid Build Coastguard Worker vst1_lane_f16(o, vo, 0); 211*4bdc9457SAndroid Build Coastguard Worker o += 1; 212*4bdc9457SAndroid Build Coastguard Worker } 213*4bdc9457SAndroid Build Coastguard Worker } 214*4bdc9457SAndroid Build Coastguard Worker 215*4bdc9457SAndroid Build Coastguard Worker input_offset += input_increment; 216*4bdc9457SAndroid Build Coastguard Worker } while (--channels != 0); 217*4bdc9457SAndroid Build Coastguard Worker} 218