1*4bdc9457SAndroid Build Coastguard Worker// Copyright 2022 Google LLC 2*4bdc9457SAndroid Build Coastguard Worker// 3*4bdc9457SAndroid Build Coastguard Worker// This source code is licensed under the BSD-style license found in the 4*4bdc9457SAndroid Build Coastguard Worker// LICENSE file in the root directory of this source tree. 5*4bdc9457SAndroid Build Coastguard Worker 6*4bdc9457SAndroid Build Coastguard Worker$assert BATCH_TILE % 8 == 0 7*4bdc9457SAndroid Build Coastguard Worker$assert BATCH_TILE >= 8 8*4bdc9457SAndroid Build Coastguard Worker$SIMD_TILE = BATCH_TILE // 8 9*4bdc9457SAndroid Build Coastguard Worker#include <assert.h> 10*4bdc9457SAndroid Build Coastguard Worker#include <stddef.h> 11*4bdc9457SAndroid Build Coastguard Worker#include <stdint.h> 12*4bdc9457SAndroid Build Coastguard Worker 13*4bdc9457SAndroid Build Coastguard Worker#include <arm_neon.h> 14*4bdc9457SAndroid Build Coastguard Worker 15*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/math.h> 16*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/rmaxabs.h> 17*4bdc9457SAndroid Build Coastguard Worker 18*4bdc9457SAndroid Build Coastguard Worker 19*4bdc9457SAndroid Build Coastguard Workervoid xnn_s16_rmaxabs_ukernel__neon_x${BATCH_TILE}( 20*4bdc9457SAndroid Build Coastguard Worker size_t batch, 21*4bdc9457SAndroid Build Coastguard Worker const int16_t* input, 22*4bdc9457SAndroid Build Coastguard Worker uint16_t* output) { 23*4bdc9457SAndroid Build Coastguard Worker 24*4bdc9457SAndroid Build Coastguard Worker assert(batch > 0); 25*4bdc9457SAndroid Build Coastguard Worker assert(input != NULL); 26*4bdc9457SAndroid Build Coastguard Worker assert(output != NULL); 27*4bdc9457SAndroid Build Coastguard Worker 28*4bdc9457SAndroid Build Coastguard Worker const uint16x8_t vzero = vdupq_n_u16(0); 29*4bdc9457SAndroid Build Coastguard Worker $for N in range(SIMD_TILE): 30*4bdc9457SAndroid Build Coastguard Worker uint16x8_t vmax${N} = vzero; 31*4bdc9457SAndroid Build Coastguard Worker 32*4bdc9457SAndroid Build Coastguard Worker $if BATCH_TILE > 8: 33*4bdc9457SAndroid Build Coastguard Worker for (; batch >= ${BATCH_TILE}; batch -= ${BATCH_TILE}) { 34*4bdc9457SAndroid Build Coastguard Worker $for N in range(SIMD_TILE): 35*4bdc9457SAndroid Build Coastguard Worker const int16x8_t vi${N} = vld1q_s16(input); input += 8; 36*4bdc9457SAndroid Build Coastguard Worker 37*4bdc9457SAndroid Build Coastguard Worker $for N in range(SIMD_TILE): 38*4bdc9457SAndroid Build Coastguard Worker const uint16x8_t vabs${N} = vreinterpretq_u16_s16(vabsq_s16(vi${N})); 39*4bdc9457SAndroid Build Coastguard Worker 40*4bdc9457SAndroid Build Coastguard Worker $for N in range(SIMD_TILE): 41*4bdc9457SAndroid Build Coastguard Worker vmax${N} = vmaxq_u16(vmax${N}, vabs${N}); 42*4bdc9457SAndroid Build Coastguard Worker } 43*4bdc9457SAndroid Build Coastguard Worker 44*4bdc9457SAndroid Build Coastguard Worker $SIMD_SLICE = 1 45*4bdc9457SAndroid Build Coastguard Worker $while SIMD_SLICE < SIMD_TILE: 46*4bdc9457SAndroid Build Coastguard Worker $for S in range(0, SIMD_TILE, SIMD_SLICE * 2): 47*4bdc9457SAndroid Build Coastguard Worker $if S + SIMD_SLICE < SIMD_TILE: 48*4bdc9457SAndroid Build Coastguard Worker vmax${S} = vmaxq_u16(vmax${S}, vmax${S + SIMD_SLICE}); 49*4bdc9457SAndroid Build Coastguard Worker $SIMD_SLICE *= 2 50*4bdc9457SAndroid Build Coastguard Worker 51*4bdc9457SAndroid Build Coastguard Worker // Remainder of full vectors 52*4bdc9457SAndroid Build Coastguard Worker for (; batch >= 8; batch -= 8) { 53*4bdc9457SAndroid Build Coastguard Worker const int16x8_t vi = vld1q_s16(input); input += 8; 54*4bdc9457SAndroid Build Coastguard Worker const uint16x8_t vabs = vreinterpretq_u16_s16(vabsq_s16(vi)); 55*4bdc9457SAndroid Build Coastguard Worker vmax0 = vmaxq_u16(vmax0, vabs); 56*4bdc9457SAndroid Build Coastguard Worker } 57*4bdc9457SAndroid Build Coastguard Worker 58*4bdc9457SAndroid Build Coastguard Worker // Remainder 59*4bdc9457SAndroid Build Coastguard Worker if (batch != 0) { 60*4bdc9457SAndroid Build Coastguard Worker do { 61*4bdc9457SAndroid Build Coastguard Worker const int16x8_t vi = vld1q_dup_s16(input); input += 1; 62*4bdc9457SAndroid Build Coastguard Worker const uint16x8_t vabs = vreinterpretq_u16_s16(vabsq_s16(vi)); 63*4bdc9457SAndroid Build Coastguard Worker vmax0 = vmaxq_u16(vmax0, vabs); 64*4bdc9457SAndroid Build Coastguard Worker } while (--batch != 0); 65*4bdc9457SAndroid Build Coastguard Worker } 66*4bdc9457SAndroid Build Coastguard Worker 67*4bdc9457SAndroid Build Coastguard Worker #if XNN_ARCH_ARM64 68*4bdc9457SAndroid Build Coastguard Worker *output = vmaxvq_u16(vmax0); 69*4bdc9457SAndroid Build Coastguard Worker #else 70*4bdc9457SAndroid Build Coastguard Worker uint16x4_t vmax_lo = vmax_u16(vget_low_u16(vmax0), vget_high_u16(vmax0)); 71*4bdc9457SAndroid Build Coastguard Worker vmax_lo = vpmax_u16(vmax_lo, vmax_lo); 72*4bdc9457SAndroid Build Coastguard Worker vmax_lo = vpmax_u16(vmax_lo, vmax_lo); 73*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u16(output, vmax_lo, 0); 74*4bdc9457SAndroid Build Coastguard Worker #endif 75*4bdc9457SAndroid Build Coastguard Worker} 76