1 // Copyright 2021 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
6 #include <assert.h>
7
8 #include <arm_neon.h>
9
10 #include <xnnpack/vunary.h>
11
12
xnn_s8_vclamp_ukernel__neon_x64(size_t n,const int8_t * x,int8_t * y,const union xnn_s8_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])13 void xnn_s8_vclamp_ukernel__neon_x64(
14 size_t n,
15 const int8_t* x,
16 int8_t* y,
17 const union xnn_s8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
18 {
19 assert(n != 0);
20
21 const int8x16_t voutput_max = vld1q_dup_s8(¶ms->neon.max);
22 const int8x16_t voutput_min = vld1q_dup_s8(¶ms->neon.min);
23
24 for (; n >= 64; n -= 64) {
25 int8x16_t vacc0 = vld1q_s8(x); x += 16;
26 int8x16_t vacc1 = vld1q_s8(x); x += 16;
27 int8x16_t vacc2 = vld1q_s8(x); x += 16;
28 int8x16_t vacc3 = vld1q_s8(x); x += 16;
29
30 vacc0 = vmaxq_s8(vacc0, voutput_min);
31 vacc1 = vmaxq_s8(vacc1, voutput_min);
32 vacc2 = vmaxq_s8(vacc2, voutput_min);
33 vacc3 = vmaxq_s8(vacc3, voutput_min);
34
35 vacc0 = vminq_s8(vacc0, voutput_max);
36 vacc1 = vminq_s8(vacc1, voutput_max);
37 vacc2 = vminq_s8(vacc2, voutput_max);
38 vacc3 = vminq_s8(vacc3, voutput_max);
39
40 vst1q_s8(y, vacc0); y += 16;
41 vst1q_s8(y, vacc1); y += 16;
42 vst1q_s8(y, vacc2); y += 16;
43 vst1q_s8(y, vacc3); y += 16;
44 }
45 for (; n >= 8; n -= 8) {
46 int8x8_t vacc = vld1_s8(x); x += 8;
47
48 vacc = vmin_s8(vacc, vget_low_s8(voutput_max));
49 vacc = vmax_s8(vacc, vget_low_s8(voutput_min));
50
51 vst1_s8(y, vacc); y += 8;
52 }
53 if XNN_UNLIKELY(n != 0) {
54 int8x8_t vacc = vld1_s8(x); x += 8;
55
56 vacc = vmin_s8(vacc, vget_low_s8(voutput_max));
57 vacc = vmax_s8(vacc, vget_low_s8(voutput_min));
58
59 if (n & 4) {
60 vst1_lane_u32((void*) y, vreinterpret_u32_s8(vacc), 0); y += 4;
61 vacc = vext_s8(vacc, vacc, 4);
62 }
63 if (n & 2) {
64 vst1_lane_u16((void*) y, vreinterpret_u16_s8(vacc), 0); y += 2;
65 vacc = vext_s8(vacc, vacc, 2);
66 }
67 if (n & 1) {
68 vst1_lane_s8(y, vacc, 0);
69 }
70 }
71 }
72