1*4bdc9457SAndroid Build Coastguard Worker// Copyright 2022 Google LLC 2*4bdc9457SAndroid Build Coastguard Worker// 3*4bdc9457SAndroid Build Coastguard Worker// This source code is licensed under the BSD-style license found in the 4*4bdc9457SAndroid Build Coastguard Worker// LICENSE file in the root directory of this source tree. 5*4bdc9457SAndroid Build Coastguard Worker 6*4bdc9457SAndroid Build Coastguard Worker$assert BATCH_TILE % 8 == 0 7*4bdc9457SAndroid Build Coastguard Worker$assert BATCH_TILE >= 8 8*4bdc9457SAndroid Build Coastguard Worker$SIMD_TILE = BATCH_TILE // 8 9*4bdc9457SAndroid Build Coastguard Worker#include <assert.h> 10*4bdc9457SAndroid Build Coastguard Worker#include <stddef.h> 11*4bdc9457SAndroid Build Coastguard Worker#include <stdint.h> 12*4bdc9457SAndroid Build Coastguard Worker 13*4bdc9457SAndroid Build Coastguard Worker#include <arm_neon.h> 14*4bdc9457SAndroid Build Coastguard Worker 15*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/math.h> 16*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/vlshift.h> 17*4bdc9457SAndroid Build Coastguard Worker 18*4bdc9457SAndroid Build Coastguard Worker 19*4bdc9457SAndroid Build Coastguard Workervoid xnn_s16_vlshift_ukernel__neon_x${BATCH_TILE}( 20*4bdc9457SAndroid Build Coastguard Worker size_t batch, 21*4bdc9457SAndroid Build Coastguard Worker const int16_t* input, 22*4bdc9457SAndroid Build Coastguard Worker int16_t* output, 23*4bdc9457SAndroid Build Coastguard Worker uint32_t shift) 24*4bdc9457SAndroid Build Coastguard Worker{ 25*4bdc9457SAndroid Build Coastguard Worker assert(batch > 0); 26*4bdc9457SAndroid Build Coastguard Worker assert(input != NULL); 27*4bdc9457SAndroid Build Coastguard Worker assert(output != NULL); 28*4bdc9457SAndroid Build Coastguard Worker assert(shift < 16); 29*4bdc9457SAndroid Build Coastguard Worker 30*4bdc9457SAndroid Build Coastguard Worker const int16x8_t vshift = vdupq_n_s16((int16_t) shift); 31*4bdc9457SAndroid Build Coastguard Worker 32*4bdc9457SAndroid Build Coastguard Worker $if BATCH_TILE > 8: 33*4bdc9457SAndroid Build Coastguard Worker for (; batch >= ${BATCH_TILE}; batch -= ${BATCH_TILE}) { 34*4bdc9457SAndroid Build Coastguard Worker $for N in range(SIMD_TILE): 35*4bdc9457SAndroid Build Coastguard Worker const int16x8_t vi${N} = vld1q_s16(input); input += 8; 36*4bdc9457SAndroid Build Coastguard Worker 37*4bdc9457SAndroid Build Coastguard Worker $for N in range(SIMD_TILE): 38*4bdc9457SAndroid Build Coastguard Worker const int16x8_t vout${N} = vshlq_s16(vi${N}, vshift); 39*4bdc9457SAndroid Build Coastguard Worker 40*4bdc9457SAndroid Build Coastguard Worker $for N in range(SIMD_TILE): 41*4bdc9457SAndroid Build Coastguard Worker vst1q_s16(output, vout${N}); output += 8; 42*4bdc9457SAndroid Build Coastguard Worker } 43*4bdc9457SAndroid Build Coastguard Worker 44*4bdc9457SAndroid Build Coastguard Worker // Remainder of full vectors 45*4bdc9457SAndroid Build Coastguard Worker for (; batch >= 8; batch -= 8) { 46*4bdc9457SAndroid Build Coastguard Worker const int16x8_t vi = vld1q_s16(input); input += 8; 47*4bdc9457SAndroid Build Coastguard Worker 48*4bdc9457SAndroid Build Coastguard Worker const int16x8_t vout = vshlq_s16(vi, vshift); 49*4bdc9457SAndroid Build Coastguard Worker 50*4bdc9457SAndroid Build Coastguard Worker vst1q_s16(output, vout); output += 8; 51*4bdc9457SAndroid Build Coastguard Worker } 52*4bdc9457SAndroid Build Coastguard Worker 53*4bdc9457SAndroid Build Coastguard Worker // Remainder of 1 to 7 batch 54*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(batch != 0) { 55*4bdc9457SAndroid Build Coastguard Worker const int16x8_t vi = vld1q_s16(input); 56*4bdc9457SAndroid Build Coastguard Worker 57*4bdc9457SAndroid Build Coastguard Worker const int16x8_t vout = vshlq_s16(vi, vshift); 58*4bdc9457SAndroid Build Coastguard Worker int16x4_t vout_lo = vget_low_s16(vout); 59*4bdc9457SAndroid Build Coastguard Worker 60*4bdc9457SAndroid Build Coastguard Worker if (batch & 4) { 61*4bdc9457SAndroid Build Coastguard Worker vst1_s16(output, vout_lo); output += 4; 62*4bdc9457SAndroid Build Coastguard Worker vout_lo = vget_high_s16(vout); 63*4bdc9457SAndroid Build Coastguard Worker } 64*4bdc9457SAndroid Build Coastguard Worker if (batch & 2) { 65*4bdc9457SAndroid Build Coastguard Worker vst1_lane_u32((void*) output, vreinterpret_u32_s16(vout_lo), 0); output += 2; 66*4bdc9457SAndroid Build Coastguard Worker vout_lo = vext_s16(vout_lo, vout_lo, 2); 67*4bdc9457SAndroid Build Coastguard Worker } 68*4bdc9457SAndroid Build Coastguard Worker if (batch & 1){ 69*4bdc9457SAndroid Build Coastguard Worker vst1_lane_s16(output, vout_lo, 0); 70*4bdc9457SAndroid Build Coastguard Worker } 71*4bdc9457SAndroid Build Coastguard Worker } 72*4bdc9457SAndroid Build Coastguard Worker} 73