1 // Auto-generated file. Do not edit!
2 // Template: src/qs8-vcvt/armsimd32.c.in
3 // Generator: tools/xngen
4 //
5 // Copyright 2022 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9
10 #include <assert.h>
11
12 #include <arm_acle.h>
13
14 #include <xnnpack/intrinsics-polyfill.h>
15 #include <xnnpack/math.h>
16 #include <xnnpack/unaligned.h>
17 #include <xnnpack/vcvt.h>
18
19
xnn_qu8_vcvt_ukernel__armsimd32_x4(size_t n,const uint8_t * x,uint8_t * y,const union xnn_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS (1)])20 void xnn_qu8_vcvt_ukernel__armsimd32_x4(
21 size_t n,
22 const uint8_t* x,
23 uint8_t* y,
24 const union xnn_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
25 {
26 const uint16x2_t vminus_input_zero_point = (uint16x2_t) params->armsimd32.minus_input_zero_point;
27 const int32_t vbias = params->armsimd32.bias;
28 const int32_t vmultiplier = params->armsimd32.multiplier;
29 for (; n >= 4 * sizeof(uint8_t); n -= 4 * sizeof(uint8_t)) {
30 const uint8x4_t vx0123 = (uint8x4_t) unaligned_load_u32(x);
31 x += 4;
32
33 const uint16x2_t vx02 = __uxtab16(vminus_input_zero_point, vx0123);
34 const uint16x2_t vx13 = __uxtab16(vminus_input_zero_point, __ror(vx0123, 8));
35
36 int32_t vacc0 = __smlawb(vmultiplier, vx02, vbias);
37 int32_t vacc1 = __smlawb(vmultiplier, vx13, vbias);
38 int32_t vacc2 = __smlawt(vmultiplier, vx02, vbias);
39 int32_t vacc3 = __smlawt(vmultiplier, vx13, vbias);
40
41 vacc0 = __usat(math_asr_s32(vacc0, 1), 8);
42 vacc1 = __usat(math_asr_s32(vacc1, 1), 8);
43 vacc2 = __usat(math_asr_s32(vacc2, 1), 8);
44 vacc3 = __usat(math_asr_s32(vacc3, 1), 8);
45
46 y[0] = (uint8_t) vacc0;
47 y[1] = (uint8_t) vacc1;
48 y[2] = (uint8_t) vacc2;
49 y[3] = (uint8_t) vacc3;
50 y += 4;
51 }
52 if XNN_UNLIKELY(n != 0) {
53 const uint8x4_t vx0123 = (uint8x4_t) unaligned_load_u32(x);
54
55 const uint16x2_t vx02 = __uxtab16(vminus_input_zero_point, vx0123);
56 const uint16x2_t vx13 = __uxtab16(vminus_input_zero_point, __ror(vx0123, 8));
57
58 int32_t vacc0 = __smlawb(vmultiplier, vx02, vbias);
59 int32_t vacc1 = __smlawb(vmultiplier, vx13, vbias);
60 const int32_t vacc2 = __smlawt(vmultiplier, vx02, vbias);
61
62 vacc0 = __usat(math_asr_s32(vacc0, 1), 8);
63 vacc1 = __usat(math_asr_s32(vacc1, 1), 8);
64
65 if (n & (2 * sizeof(uint8_t))) {
66 y[0] = (uint8_t) vacc0;
67 y[1] = (uint8_t) vacc1;
68 vacc0 = __usat(math_asr_s32(vacc2, 1), 8);
69 y += 2;
70 }
71 if (n & (1 * sizeof(uint8_t))) {
72 y[0] = (uint8_t) vacc0;
73 }
74 }
75 }
76