1// Copyright 2020 Google LLC 2// 3// This source code is licensed under the BSD-style license found in the 4// LICENSE file in the root directory of this source tree. 5 6$assert CHANNEL_TILE % 8 == 0 7$assert KERNEL_TILE >= 2 8$assert ACCUMULATORS >= 1 9$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 10#include <assert.h> 11 12#include <arm_neon.h> 13 14#include <xnnpack/dwconv.h> 15 16 17void xnn_f16_dwconv_minmax_ukernel_up${CHANNEL_TILE}x${KERNEL_TILE}__neonfp16arith${"" if ACCUMULATORS == 1 else "_acc%d" % ACCUMULATORS}( 18 size_t channels, 19 size_t output_width, 20 const void** input, 21 const void* weights, 22 void* output_ptr, 23 size_t input_stride, 24 size_t output_increment, 25 size_t input_offset, 26 const void* zero, 27 const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS 28{ 29 assert(channels != 0); 30 assert(output_width != 0); 31 32 __fp16* output = (__fp16*) output_ptr; 33 const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16(¶ms->neon.max)); 34 const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16(¶ms->neon.min)); 35 do { 36 $for K in range(KERNEL_TILE): 37 const __fp16* i${K} = (const __fp16*) input[${K}]; 38 assert(i${K} != NULL); 39 if XNN_UNPREDICTABLE(i${K} != (const __fp16*) zero) { 40 i${K} = (const __fp16*) ((uintptr_t) i${K} + input_offset); 41 } 42 43 input = (const void**) ((uintptr_t) input + input_stride); 44 45 size_t c = channels; 46 const __fp16* w = (const __fp16*) weights; 47 for (; c >= ${CHANNEL_TILE}; c -= ${CHANNEL_TILE}) { 48 $for C in range(0, CHANNEL_TILE, 8): 49 float16x8_t vacc${ABC[C:C+8]}p0 = vld1q_f16(w); w += 8; 50 51 $for K in range(KERNEL_TILE): 52 53 $for C in range(0, CHANNEL_TILE, 8): 54 const float16x8_t vi${K}x${ABC[C:C+8]} = vld1q_f16(i${K}); i${K} += 8; 55 $for C in range(0, CHANNEL_TILE, 8): 56 const float16x8_t vk${K}x${ABC[C:C+8]} = vld1q_f16(w); w += 8; 57 $for C in range(0, CHANNEL_TILE, 8): 58 $if 1 <= K < ACCUMULATORS: 59 float16x8_t vacc${ABC[C:C+8]}p${K} = vmulq_f16(vi${K}x${ABC[C:C+8]}, vk${K}x${ABC[C:C+8]}); 60 $else: 61 vacc${ABC[C:C+8]}p${K % ACCUMULATORS} = vfmaq_f16(vacc${ABC[C:C+8]}p${K % ACCUMULATORS}, vi${K}x${ABC[C:C+8]}, vk${K}x${ABC[C:C+8]}); 62 63 $if ACCUMULATORS > 1: 64 // Add up all accumulators to vacc${ABC[0:CHANNEL_TILE]}p0 65 $ACC_STEP = 1 66 $while ACC_STEP < ACCUMULATORS: 67 $for A in range(0, ACCUMULATORS, ACC_STEP * 2): 68 $if A + ACC_STEP < ACCUMULATORS: 69 $for C in range(0, CHANNEL_TILE, 8): 70 vacc${ABC[C:C+8]}p${A} = vaddq_f16(vacc${ABC[C:C+8]}p${A}, vacc${ABC[C:C+8]}p${A + ACC_STEP}); 71 $ACC_STEP *= 2 72 73 $for C in range(0, CHANNEL_TILE, 8): 74 float16x8_t vacc${ABC[C:C+8]} = vmaxq_f16(vacc${ABC[C:C+8]}p0, vmin); 75 $for C in range(0, CHANNEL_TILE, 8): 76 vacc${ABC[C:C+8]} = vminq_f16(vacc${ABC[C:C+8]}, vmax); 77 78 $for C in range(0, CHANNEL_TILE, 8): 79 vst1q_f16(output, vacc${ABC[C:C+8]}); output += 8; 80 } 81 $if CHANNEL_TILE > 8: 82 for (; c >= 8; c -= 8) { 83 float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8; 84 85 $for K in range(KERNEL_TILE): 86 87 const float16x8_t vi${K}x01234567 = vld1q_f16(i${K}); i${K} += 8; 88 const float16x8_t vk${K}x01234567 = vld1q_f16(w + ${(K + 1) * CHANNEL_TILE - 8}); 89 $if 1 <= K < ACCUMULATORS: 90 float16x8_t vacc01234567p${K} = vmulq_f16(vi${K}x01234567, vk${K}x01234567); 91 $else: 92 vacc01234567p${K % ACCUMULATORS} = vfmaq_f16(vacc01234567p${K % ACCUMULATORS}, vi${K}x01234567, vk${K}x01234567); 93 94 $if ACCUMULATORS > 1: 95 // Add up all accumulators to vacc01234567p0 96 $ACC_STEP = 1 97 $while ACC_STEP < ACCUMULATORS: 98 $for A in range(0, ACCUMULATORS, ACC_STEP * 2): 99 $if A + ACC_STEP < ACCUMULATORS: 100 vacc01234567p${A} = vaddq_f16(vacc01234567p${A}, vacc01234567p${A + ACC_STEP}); 101 $ACC_STEP *= 2 102 103 float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin); 104 vacc01234567 = vminq_f16(vacc01234567, vmax); 105 106 vst1q_f16(output, vacc01234567); output += 8; 107 } 108 if XNN_UNLIKELY(c != 0) { 109 $if CHANNEL_TILE == 8: 110 float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8; 111 $else: 112 float16x8_t vacc01234567p0 = vld1q_f16(w); 113 114 $for K in range(KERNEL_TILE): 115 116 const float16x8_t vi${K}x01234567 = vld1q_f16(i${K}); 117 $if CHANNEL_TILE == 8: 118 const float16x8_t vk${K}x01234567 = vld1q_f16(w); w += 8; 119 $else: 120 const float16x8_t vk${K}x01234567 = vld1q_f16(w + ${(K + 1) * CHANNEL_TILE}); 121 $if 1 <= K < ACCUMULATORS: 122 float16x8_t vacc01234567p${K} = vmulq_f16(vi${K}x01234567, vk${K}x01234567); 123 $else: 124 vacc01234567p${K % ACCUMULATORS} = vfmaq_f16(vacc01234567p${K % ACCUMULATORS}, vi${K}x01234567, vk${K}x01234567); 125 126 $if ACCUMULATORS > 1: 127 // Add up all accumulators to vacc01234567p0 128 $ACC_STEP = 1 129 $while ACC_STEP < ACCUMULATORS: 130 $for A in range(0, ACCUMULATORS, ACC_STEP * 2): 131 $if A + ACC_STEP < ACCUMULATORS: 132 vacc01234567p${A} = vaddq_f16(vacc01234567p${A}, vacc01234567p${A + ACC_STEP}); 133 $ACC_STEP *= 2 134 135 float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin); 136 vacc01234567 = vminq_f16(vacc01234567, vmax); 137 138 float16x4_t vacc0123 = vget_low_f16(vacc01234567); 139 if (c & 4) { 140 vst1_f16(output, vacc0123); output += 4; 141 vacc0123 = vget_high_f16(vacc01234567); 142 } 143 if (c & 2) { 144 vst1_lane_u32((void*) output, vreinterpret_u32_f16(vacc0123), 0); output += 2; 145 vacc0123 = vext_f16(vacc0123, vacc0123, 2); 146 } 147 if (c & 1) { 148 vst1_lane_f16(output, vacc0123, 0); output += 1; 149 } 150 } 151 152 output = (__fp16*) ((uintptr_t) output + output_increment); 153 } while (--output_width != 0); 154} 155