// Copyright 2022 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

// Template for the S16 vector left-shift (vlshift) microkernel using NEON.
// BATCH_TILE is the number of int16 elements processed per main-loop
// iteration; it must be a positive multiple of the 8-lane NEON vector width.
$assert BATCH_TILE % 8 == 0
$assert BATCH_TILE >= 8
$SIMD_TILE = BATCH_TILE // 8
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include <arm_neon.h>

#include <xnnpack/math.h>
#include <xnnpack/vlshift.h>


// Shifts each of `batch` int16 elements of `input` left by `shift` bits and
// writes the results to `output`.
//
//   batch  - number of int16 elements to process; must be non-zero.
//   input  - pointer to the source elements.
//   output - pointer to the destination; may alias `input`.
//   shift  - left-shift amount in bits; must be in [0, 15], since shifting an
//            int16 by >= 16 would be undefined behavior.
void xnn_s16_vlshift_ukernel__neon_x${BATCH_TILE}(
    size_t batch,
    const int16_t* input,
    int16_t* output,
    uint32_t shift)
{
  assert(batch > 0);
  assert(input != NULL);
  assert(output != NULL);
  assert(shift < 16);

  // vshlq_s16 shifts left when the per-lane shift count is non-negative, so a
  // broadcast of the (validated, < 16) shift gives the left shift we want.
  const int16x8_t vshift = vdupq_n_s16((int16_t) shift);

  $if BATCH_TILE > 8:
    // Main loop: process BATCH_TILE elements (SIMD_TILE vectors of 8) per
    // iteration, with loads, shifts, and stores grouped to expose
    // instruction-level parallelism.
    for (; batch >= ${BATCH_TILE}; batch -= ${BATCH_TILE}) {
      $for N in range(SIMD_TILE):
        const int16x8_t vi${N} = vld1q_s16(input); input += 8;

      $for N in range(SIMD_TILE):
        const int16x8_t vout${N} = vshlq_s16(vi${N}, vshift);

      $for N in range(SIMD_TILE):
        vst1q_s16(output, vout${N}); output += 8;
    }

  // Remainder of full vectors
  for (; batch >= 8; batch -= 8) {
    const int16x8_t vi = vld1q_s16(input); input += 8;

    const int16x8_t vout = vshlq_s16(vi, vshift);

    vst1q_s16(output, vout); output += 8;
  }

  // Remainder of 1 to 7 batch
  if XNN_UNLIKELY(batch != 0) {
    // Full 8-element load even though only `batch` elements remain.
    // NOTE(review): this reads up to 14 bytes past the last valid element and
    // presumably relies on XNNPACK's usual over-allocation padding of input
    // buffers — confirm callers guarantee that slack.
    const int16x8_t vi = vld1q_s16(input);

    const int16x8_t vout = vshlq_s16(vi, vshift);
    int16x4_t vout_lo = vget_low_s16(vout);

    // Store the tail in chunks of 4, 2, then 1 element, advancing through the
    // result by swapping in the high half / rotating lanes after each store.
    if (batch & 4) {
      vst1_s16(output, vout_lo); output += 4;
      vout_lo = vget_high_s16(vout);
    }
    if (batch & 2) {
      // Reinterpret as u32 so one lane store writes two int16 elements at once.
      vst1_lane_u32((void*) output, vreinterpret_u32_s16(vout_lo), 0); output += 2;
      // Rotate the two just-stored lanes out so the next element is in lane 0.
      vout_lo = vext_s16(vout_lo, vout_lo, 2);
    }
    if (batch & 1){
      vst1_lane_s16(output, vout_lo, 0);
    }
  }
}