1 // Auto-generated file. Do not edit!
2 // Template: src/qs8-vcvt/armsimd32.c.in
3 // Generator: tools/xngen
4 //
5 // Copyright 2022 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9
10 #include <assert.h>
11
12 #include <arm_acle.h>
13
14 #include <xnnpack/intrinsics-polyfill.h>
15 #include <xnnpack/math.h>
16 #include <xnnpack/unaligned.h>
17 #include <xnnpack/vcvt.h>
18
19
xnn_qu8_vcvt_ukernel__armsimd32_x8(size_t n,const uint8_t * x,uint8_t * y,const union xnn_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS (1)])20 void xnn_qu8_vcvt_ukernel__armsimd32_x8(
21 size_t n,
22 const uint8_t* x,
23 uint8_t* y,
24 const union xnn_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
25 {
26 const uint16x2_t vminus_input_zero_point = (uint16x2_t) params->armsimd32.minus_input_zero_point;
27 const int32_t vbias = params->armsimd32.bias;
28 const int32_t vmultiplier = params->armsimd32.multiplier;
29 for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
30 const uint8x4_t vx0123 = (uint8x4_t) unaligned_indexed_load_u32(x, 0);
31 const uint8x4_t vx4567 = (uint8x4_t) unaligned_indexed_load_u32(x, 1);
32 x += 8;
33
34 const uint16x2_t vx02 = __uxtab16(vminus_input_zero_point, vx0123);
35 const uint16x2_t vx13 = __uxtab16(vminus_input_zero_point, __ror(vx0123, 8));
36 const uint16x2_t vx46 = __uxtab16(vminus_input_zero_point, vx4567);
37 const uint16x2_t vx57 = __uxtab16(vminus_input_zero_point, __ror(vx4567, 8));
38
39 int32_t vacc0 = __smlawb(vmultiplier, vx02, vbias);
40 int32_t vacc1 = __smlawb(vmultiplier, vx13, vbias);
41 int32_t vacc2 = __smlawt(vmultiplier, vx02, vbias);
42 int32_t vacc3 = __smlawt(vmultiplier, vx13, vbias);
43 int32_t vacc4 = __smlawb(vmultiplier, vx46, vbias);
44 int32_t vacc5 = __smlawb(vmultiplier, vx57, vbias);
45 int32_t vacc6 = __smlawt(vmultiplier, vx46, vbias);
46 int32_t vacc7 = __smlawt(vmultiplier, vx57, vbias);
47
48 vacc0 = __usat(math_asr_s32(vacc0, 1), 8);
49 vacc1 = __usat(math_asr_s32(vacc1, 1), 8);
50 vacc2 = __usat(math_asr_s32(vacc2, 1), 8);
51 vacc3 = __usat(math_asr_s32(vacc3, 1), 8);
52 vacc4 = __usat(math_asr_s32(vacc4, 1), 8);
53 vacc5 = __usat(math_asr_s32(vacc5, 1), 8);
54 vacc6 = __usat(math_asr_s32(vacc6, 1), 8);
55 vacc7 = __usat(math_asr_s32(vacc7, 1), 8);
56
57 y[0] = (uint8_t) vacc0;
58 y[1] = (uint8_t) vacc1;
59 y[2] = (uint8_t) vacc2;
60 y[3] = (uint8_t) vacc3;
61 y[4] = (uint8_t) vacc4;
62 y[5] = (uint8_t) vacc5;
63 y[6] = (uint8_t) vacc6;
64 y[7] = (uint8_t) vacc7;
65 y += 8;
66 }
67 for (; n >= 4 * sizeof(uint8_t); n -= 4 * sizeof(uint8_t)) {
68 const uint8x4_t vx0123 = (uint8x4_t) unaligned_load_u32(x);
69 x += 4;
70
71 const uint16x2_t vx02 = __uxtab16(vminus_input_zero_point, vx0123);
72 const uint16x2_t vx13 = __uxtab16(vminus_input_zero_point, __ror(vx0123, 8));
73
74 int32_t vacc0 = __smlawb(vmultiplier, vx02, vbias);
75 int32_t vacc1 = __smlawb(vmultiplier, vx13, vbias);
76 int32_t vacc2 = __smlawt(vmultiplier, vx02, vbias);
77 int32_t vacc3 = __smlawt(vmultiplier, vx13, vbias);
78
79 vacc0 = __usat(math_asr_s32(vacc0, 1), 8);
80 vacc1 = __usat(math_asr_s32(vacc1, 1), 8);
81 vacc2 = __usat(math_asr_s32(vacc2, 1), 8);
82 vacc3 = __usat(math_asr_s32(vacc3, 1), 8);
83
84 y[0] = (uint8_t) vacc0;
85 y[1] = (uint8_t) vacc1;
86 y[2] = (uint8_t) vacc2;
87 y[3] = (uint8_t) vacc3;
88 y += 4;
89 }
90 if XNN_UNLIKELY(n != 0) {
91 const uint8x4_t vx0123 = (uint8x4_t) unaligned_load_u32(x);
92
93 const uint16x2_t vx02 = __uxtab16(vminus_input_zero_point, vx0123);
94 const uint16x2_t vx13 = __uxtab16(vminus_input_zero_point, __ror(vx0123, 8));
95
96 int32_t vacc0 = __smlawb(vmultiplier, vx02, vbias);
97 int32_t vacc1 = __smlawb(vmultiplier, vx13, vbias);
98 const int32_t vacc2 = __smlawt(vmultiplier, vx02, vbias);
99
100 vacc0 = __usat(math_asr_s32(vacc0, 1), 8);
101 vacc1 = __usat(math_asr_s32(vacc1, 1), 8);
102
103 if (n & (2 * sizeof(uint8_t))) {
104 y[0] = (uint8_t) vacc0;
105 y[1] = (uint8_t) vacc1;
106 vacc0 = __usat(math_asr_s32(vacc2, 1), 8);
107 y += 2;
108 }
109 if (n & (1 * sizeof(uint8_t))) {
110 y[0] = (uint8_t) vacc0;
111 }
112 }
113 }
114