// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5*4bdc9457SAndroid Build Coastguard Worker
6*4bdc9457SAndroid Build Coastguard Worker #include <assert.h>
7*4bdc9457SAndroid Build Coastguard Worker
8*4bdc9457SAndroid Build Coastguard Worker #include <immintrin.h>
9*4bdc9457SAndroid Build Coastguard Worker
10*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/common.h>
11*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/dwconv.h>
12*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/math.h>
13*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/unaligned.h>
14*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/vcvt.h>
15*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/vlrelu.h>
16*4bdc9457SAndroid Build Coastguard Worker
17*4bdc9457SAndroid Build Coastguard Worker
xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_2x4_acc2(size_t input_height,size_t input_width,const float * input,const float * weights,const float * zero,float * output,uint32_t padding_top,const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS (1)])18*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_2x4_acc2(
19*4bdc9457SAndroid Build Coastguard Worker size_t input_height,
20*4bdc9457SAndroid Build Coastguard Worker size_t input_width,
21*4bdc9457SAndroid Build Coastguard Worker const float* input,
22*4bdc9457SAndroid Build Coastguard Worker const float* weights,
23*4bdc9457SAndroid Build Coastguard Worker const float* zero,
24*4bdc9457SAndroid Build Coastguard Worker float* output,
25*4bdc9457SAndroid Build Coastguard Worker uint32_t padding_top,
26*4bdc9457SAndroid Build Coastguard Worker const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
27*4bdc9457SAndroid Build Coastguard Worker {
28*4bdc9457SAndroid Build Coastguard Worker assert(input_height != 0);
29*4bdc9457SAndroid Build Coastguard Worker assert(input_width != 0);
30*4bdc9457SAndroid Build Coastguard Worker assert(input_width % sizeof(float) == 0);
31*4bdc9457SAndroid Build Coastguard Worker assert(padding_top == 1);
32*4bdc9457SAndroid Build Coastguard Worker
33*4bdc9457SAndroid Build Coastguard Worker const __m128 vmask = _mm_load_ps((const float*) params->sse.mask);
34*4bdc9457SAndroid Build Coastguard Worker const __m128 vmax = _mm_load_ps(params->sse.max);
35*4bdc9457SAndroid Build Coastguard Worker const __m128 vmin = _mm_load_ps(params->sse.min);
36*4bdc9457SAndroid Build Coastguard Worker
37*4bdc9457SAndroid Build Coastguard Worker const __m128 vbias = _mm_load1_ps(weights);
38*4bdc9457SAndroid Build Coastguard Worker const __m128 vk00 = _mm_load1_ps(weights + 1);
39*4bdc9457SAndroid Build Coastguard Worker const __m128 vk01 = _mm_load1_ps(weights + 2);
40*4bdc9457SAndroid Build Coastguard Worker const __m128 vk02 = _mm_load1_ps(weights + 3);
41*4bdc9457SAndroid Build Coastguard Worker const __m128 vk10 = _mm_load1_ps(weights + 4);
42*4bdc9457SAndroid Build Coastguard Worker const __m128 vk11 = _mm_load1_ps(weights + 5);
43*4bdc9457SAndroid Build Coastguard Worker const __m128 vk12 = _mm_load1_ps(weights + 6);
44*4bdc9457SAndroid Build Coastguard Worker const __m128 vk20 = _mm_load1_ps(weights + 7);
45*4bdc9457SAndroid Build Coastguard Worker const __m128 vk21 = _mm_load1_ps(weights + 8);
46*4bdc9457SAndroid Build Coastguard Worker const __m128 vk22 = _mm_load1_ps(weights + 9);
47*4bdc9457SAndroid Build Coastguard Worker
48*4bdc9457SAndroid Build Coastguard Worker const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(float));
49*4bdc9457SAndroid Build Coastguard Worker
50*4bdc9457SAndroid Build Coastguard Worker const float* i0 = zero;
51*4bdc9457SAndroid Build Coastguard Worker const float* i1 = input;
52*4bdc9457SAndroid Build Coastguard Worker const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
53*4bdc9457SAndroid Build Coastguard Worker const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
54*4bdc9457SAndroid Build Coastguard Worker
55*4bdc9457SAndroid Build Coastguard Worker float* o0 = output;
56*4bdc9457SAndroid Build Coastguard Worker float* o1 = (float*) ((uintptr_t) o0 + input_width);
57*4bdc9457SAndroid Build Coastguard Worker
58*4bdc9457SAndroid Build Coastguard Worker size_t output_height = input_height;
59*4bdc9457SAndroid Build Coastguard Worker do {
60*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(output_height < 2) {
61*4bdc9457SAndroid Build Coastguard Worker i2 = zero;
62*4bdc9457SAndroid Build Coastguard Worker o1 = o0;
63*4bdc9457SAndroid Build Coastguard Worker }
64*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(output_height < 3) {
65*4bdc9457SAndroid Build Coastguard Worker i3 = zero;
66*4bdc9457SAndroid Build Coastguard Worker }
67*4bdc9457SAndroid Build Coastguard Worker
68*4bdc9457SAndroid Build Coastguard Worker __m128 vi0x0123 = _mm_setzero_ps();
69*4bdc9457SAndroid Build Coastguard Worker __m128 vi1x0123 = _mm_setzero_ps();
70*4bdc9457SAndroid Build Coastguard Worker __m128 vi2x0123 = _mm_setzero_ps();
71*4bdc9457SAndroid Build Coastguard Worker __m128 vi3x0123 = _mm_setzero_ps();
72*4bdc9457SAndroid Build Coastguard Worker
73*4bdc9457SAndroid Build Coastguard Worker __m128 vi0x4567 = _mm_loadu_ps(i0);
74*4bdc9457SAndroid Build Coastguard Worker i0 += 4;
75*4bdc9457SAndroid Build Coastguard Worker __m128 vi1x4567 = _mm_loadu_ps(i1);
76*4bdc9457SAndroid Build Coastguard Worker i1 += 4;
77*4bdc9457SAndroid Build Coastguard Worker __m128 vi2x4567 = _mm_loadu_ps(i2);
78*4bdc9457SAndroid Build Coastguard Worker i2 += 4;
79*4bdc9457SAndroid Build Coastguard Worker __m128 vi3x4567 = _mm_loadu_ps(i3);
80*4bdc9457SAndroid Build Coastguard Worker i3 += 4;
81*4bdc9457SAndroid Build Coastguard Worker
82*4bdc9457SAndroid Build Coastguard Worker size_t w = input_width;
83*4bdc9457SAndroid Build Coastguard Worker for (; w > 4 * sizeof(float); w -= 4 * sizeof(float)) {
84*4bdc9457SAndroid Build Coastguard Worker const __m128 vi0x89AB = _mm_loadu_ps(i0);
85*4bdc9457SAndroid Build Coastguard Worker i0 += 4;
86*4bdc9457SAndroid Build Coastguard Worker const __m128 vi1x89AB = _mm_loadu_ps(i1);
87*4bdc9457SAndroid Build Coastguard Worker i1 += 4;
88*4bdc9457SAndroid Build Coastguard Worker const __m128 vi2x89AB = _mm_loadu_ps(i2);
89*4bdc9457SAndroid Build Coastguard Worker i2 += 4;
90*4bdc9457SAndroid Build Coastguard Worker const __m128 vi3x89AB = _mm_loadu_ps(i3);
91*4bdc9457SAndroid Build Coastguard Worker i3 += 4;
92*4bdc9457SAndroid Build Coastguard Worker
93*4bdc9457SAndroid Build Coastguard Worker __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk01));
94*4bdc9457SAndroid Build Coastguard Worker __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk01));
95*4bdc9457SAndroid Build Coastguard Worker __m128 vo0p1 = _mm_mul_ps(vi1x4567, vk11);
96*4bdc9457SAndroid Build Coastguard Worker __m128 vo1p1 = _mm_mul_ps(vi2x4567, vk11);
97*4bdc9457SAndroid Build Coastguard Worker vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk21));
98*4bdc9457SAndroid Build Coastguard Worker vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk21));
99*4bdc9457SAndroid Build Coastguard Worker
100*4bdc9457SAndroid Build Coastguard Worker const __m128 vi0x3456 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi0x4567), _mm_castps_si128(vi0x0123), 12));
101*4bdc9457SAndroid Build Coastguard Worker const __m128 vi1x3456 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi1x4567), _mm_castps_si128(vi1x0123), 12));
102*4bdc9457SAndroid Build Coastguard Worker const __m128 vi2x3456 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi2x4567), _mm_castps_si128(vi2x0123), 12));
103*4bdc9457SAndroid Build Coastguard Worker const __m128 vi3x3456 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi3x4567), _mm_castps_si128(vi3x0123), 12));
104*4bdc9457SAndroid Build Coastguard Worker
105*4bdc9457SAndroid Build Coastguard Worker vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi0x3456, vk00));
106*4bdc9457SAndroid Build Coastguard Worker vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi1x3456, vk00));
107*4bdc9457SAndroid Build Coastguard Worker vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk10));
108*4bdc9457SAndroid Build Coastguard Worker vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk10));
109*4bdc9457SAndroid Build Coastguard Worker vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi2x3456, vk20));
110*4bdc9457SAndroid Build Coastguard Worker vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi3x3456, vk20));
111*4bdc9457SAndroid Build Coastguard Worker
112*4bdc9457SAndroid Build Coastguard Worker vi0x0123 = vi0x4567;
113*4bdc9457SAndroid Build Coastguard Worker vi1x0123 = vi1x4567;
114*4bdc9457SAndroid Build Coastguard Worker vi2x0123 = vi2x4567;
115*4bdc9457SAndroid Build Coastguard Worker vi3x0123 = vi3x4567;
116*4bdc9457SAndroid Build Coastguard Worker
117*4bdc9457SAndroid Build Coastguard Worker const __m128 vi0x5678 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi0x89AB), _mm_castps_si128(vi0x4567), 4));
118*4bdc9457SAndroid Build Coastguard Worker const __m128 vi1x5678 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi1x89AB), _mm_castps_si128(vi1x4567), 4));
119*4bdc9457SAndroid Build Coastguard Worker const __m128 vi2x5678 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi2x89AB), _mm_castps_si128(vi2x4567), 4));
120*4bdc9457SAndroid Build Coastguard Worker const __m128 vi3x5678 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi3x89AB), _mm_castps_si128(vi3x4567), 4));
121*4bdc9457SAndroid Build Coastguard Worker
122*4bdc9457SAndroid Build Coastguard Worker vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk02));
123*4bdc9457SAndroid Build Coastguard Worker vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02));
124*4bdc9457SAndroid Build Coastguard Worker vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x5678, vk12));
125*4bdc9457SAndroid Build Coastguard Worker vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi2x5678, vk12));
126*4bdc9457SAndroid Build Coastguard Worker vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk22));
127*4bdc9457SAndroid Build Coastguard Worker vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk22));
128*4bdc9457SAndroid Build Coastguard Worker
129*4bdc9457SAndroid Build Coastguard Worker vi0x4567 = vi0x89AB;
130*4bdc9457SAndroid Build Coastguard Worker vi1x4567 = vi1x89AB;
131*4bdc9457SAndroid Build Coastguard Worker vi2x4567 = vi2x89AB;
132*4bdc9457SAndroid Build Coastguard Worker vi3x4567 = vi3x89AB;
133*4bdc9457SAndroid Build Coastguard Worker
134*4bdc9457SAndroid Build Coastguard Worker vo0p0 = _mm_add_ps(vo0p0, vo0p1);
135*4bdc9457SAndroid Build Coastguard Worker vo1p0 = _mm_add_ps(vo1p0, vo1p1);
136*4bdc9457SAndroid Build Coastguard Worker
137*4bdc9457SAndroid Build Coastguard Worker __m128 vo0 = _mm_max_ps(vo0p0, vmin);
138*4bdc9457SAndroid Build Coastguard Worker __m128 vo1 = _mm_max_ps(vo1p0, vmin);
139*4bdc9457SAndroid Build Coastguard Worker
140*4bdc9457SAndroid Build Coastguard Worker vo0 = _mm_min_ps(vo0, vmax);
141*4bdc9457SAndroid Build Coastguard Worker vo1 = _mm_min_ps(vo1, vmax);
142*4bdc9457SAndroid Build Coastguard Worker
143*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_ps(o1, vo1);
144*4bdc9457SAndroid Build Coastguard Worker o1 += 4;
145*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_ps(o0, vo0);
146*4bdc9457SAndroid Build Coastguard Worker o0 += 4;
147*4bdc9457SAndroid Build Coastguard Worker }
148*4bdc9457SAndroid Build Coastguard Worker // Always process the last block of 1..4 pixels.
149*4bdc9457SAndroid Build Coastguard Worker assert(w >= 1 * sizeof(float));
150*4bdc9457SAndroid Build Coastguard Worker assert(w <= 4 * sizeof(float));
151*4bdc9457SAndroid Build Coastguard Worker {
152*4bdc9457SAndroid Build Coastguard Worker vi0x4567 = _mm_and_ps(vmask, vi0x4567);
153*4bdc9457SAndroid Build Coastguard Worker vi1x4567 = _mm_and_ps(vmask, vi1x4567);
154*4bdc9457SAndroid Build Coastguard Worker vi2x4567 = _mm_and_ps(vmask, vi2x4567);
155*4bdc9457SAndroid Build Coastguard Worker vi3x4567 = _mm_and_ps(vmask, vi3x4567);
156*4bdc9457SAndroid Build Coastguard Worker
157*4bdc9457SAndroid Build Coastguard Worker __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk01));
158*4bdc9457SAndroid Build Coastguard Worker __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk01));
159*4bdc9457SAndroid Build Coastguard Worker __m128 vo0p1 = _mm_mul_ps(vi1x4567, vk11);
160*4bdc9457SAndroid Build Coastguard Worker __m128 vo1p1 = _mm_mul_ps(vi2x4567, vk11);
161*4bdc9457SAndroid Build Coastguard Worker vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk21));
162*4bdc9457SAndroid Build Coastguard Worker vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk21));
163*4bdc9457SAndroid Build Coastguard Worker
164*4bdc9457SAndroid Build Coastguard Worker const __m128 vi0x3456 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi0x4567), _mm_castps_si128(vi0x0123), 12));
165*4bdc9457SAndroid Build Coastguard Worker const __m128 vi1x3456 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi1x4567), _mm_castps_si128(vi1x0123), 12));
166*4bdc9457SAndroid Build Coastguard Worker const __m128 vi2x3456 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi2x4567), _mm_castps_si128(vi2x0123), 12));
167*4bdc9457SAndroid Build Coastguard Worker const __m128 vi3x3456 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi3x4567), _mm_castps_si128(vi3x0123), 12));
168*4bdc9457SAndroid Build Coastguard Worker
169*4bdc9457SAndroid Build Coastguard Worker vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi0x3456, vk00));
170*4bdc9457SAndroid Build Coastguard Worker vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi1x3456, vk00));
171*4bdc9457SAndroid Build Coastguard Worker vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk10));
172*4bdc9457SAndroid Build Coastguard Worker vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk10));
173*4bdc9457SAndroid Build Coastguard Worker vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi2x3456, vk20));
174*4bdc9457SAndroid Build Coastguard Worker vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi3x3456, vk20));
175*4bdc9457SAndroid Build Coastguard Worker
176*4bdc9457SAndroid Build Coastguard Worker const __m128i vzero = _mm_setzero_si128();
177*4bdc9457SAndroid Build Coastguard Worker const __m128 vi0x5678 = _mm_castsi128_ps(_mm_alignr_epi8(vzero, _mm_castps_si128(vi0x4567), 4));
178*4bdc9457SAndroid Build Coastguard Worker const __m128 vi1x5678 = _mm_castsi128_ps(_mm_alignr_epi8(vzero, _mm_castps_si128(vi1x4567), 4));
179*4bdc9457SAndroid Build Coastguard Worker const __m128 vi2x5678 = _mm_castsi128_ps(_mm_alignr_epi8(vzero, _mm_castps_si128(vi2x4567), 4));
180*4bdc9457SAndroid Build Coastguard Worker const __m128 vi3x5678 = _mm_castsi128_ps(_mm_alignr_epi8(vzero, _mm_castps_si128(vi3x4567), 4));
181*4bdc9457SAndroid Build Coastguard Worker
182*4bdc9457SAndroid Build Coastguard Worker vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk02));
183*4bdc9457SAndroid Build Coastguard Worker vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02));
184*4bdc9457SAndroid Build Coastguard Worker vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x5678, vk12));
185*4bdc9457SAndroid Build Coastguard Worker vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi2x5678, vk12));
186*4bdc9457SAndroid Build Coastguard Worker vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk22));
187*4bdc9457SAndroid Build Coastguard Worker vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk22));
188*4bdc9457SAndroid Build Coastguard Worker
189*4bdc9457SAndroid Build Coastguard Worker vo0p0 = _mm_add_ps(vo0p0, vo0p1);
190*4bdc9457SAndroid Build Coastguard Worker vo1p0 = _mm_add_ps(vo1p0, vo1p1);
191*4bdc9457SAndroid Build Coastguard Worker
192*4bdc9457SAndroid Build Coastguard Worker __m128 vo0 = _mm_max_ps(vo0p0, vmin);
193*4bdc9457SAndroid Build Coastguard Worker __m128 vo1 = _mm_max_ps(vo1p0, vmin);
194*4bdc9457SAndroid Build Coastguard Worker
195*4bdc9457SAndroid Build Coastguard Worker vo0 = _mm_min_ps(vo0, vmax);
196*4bdc9457SAndroid Build Coastguard Worker vo1 = _mm_min_ps(vo1, vmax);
197*4bdc9457SAndroid Build Coastguard Worker
198*4bdc9457SAndroid Build Coastguard Worker if XNN_LIKELY(w == 4 * sizeof(float)) {
199*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_ps(o1, vo1);
200*4bdc9457SAndroid Build Coastguard Worker o1 += 4;
201*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_ps(o0, vo0);
202*4bdc9457SAndroid Build Coastguard Worker o0 += 4;
203*4bdc9457SAndroid Build Coastguard Worker } else {
204*4bdc9457SAndroid Build Coastguard Worker if (w & (2 * sizeof(float))) {
205*4bdc9457SAndroid Build Coastguard Worker _mm_storel_pi((__m64*) o1, vo1);
206*4bdc9457SAndroid Build Coastguard Worker o1 += 2;
207*4bdc9457SAndroid Build Coastguard Worker _mm_storel_pi((__m64*) o0, vo0);
208*4bdc9457SAndroid Build Coastguard Worker o0 += 2;
209*4bdc9457SAndroid Build Coastguard Worker
210*4bdc9457SAndroid Build Coastguard Worker vo0 = _mm_movehl_ps(vo0, vo0);
211*4bdc9457SAndroid Build Coastguard Worker vo1 = _mm_movehl_ps(vo1, vo1);
212*4bdc9457SAndroid Build Coastguard Worker }
213*4bdc9457SAndroid Build Coastguard Worker if (w & (1 * sizeof(float))) {
214*4bdc9457SAndroid Build Coastguard Worker _mm_store_ss(o1, vo1);
215*4bdc9457SAndroid Build Coastguard Worker o1 += 1;
216*4bdc9457SAndroid Build Coastguard Worker _mm_store_ss(o0, vo0);
217*4bdc9457SAndroid Build Coastguard Worker o0 += 1;
218*4bdc9457SAndroid Build Coastguard Worker }
219*4bdc9457SAndroid Build Coastguard Worker }
220*4bdc9457SAndroid Build Coastguard Worker }
221*4bdc9457SAndroid Build Coastguard Worker
222*4bdc9457SAndroid Build Coastguard Worker i0 = (const float*) ((uintptr_t) i2 - input_decrement);
223*4bdc9457SAndroid Build Coastguard Worker i1 = (const float*) ((uintptr_t) i3 - input_decrement);
224*4bdc9457SAndroid Build Coastguard Worker i2 = (const float*) ((uintptr_t) i1 + input_width);
225*4bdc9457SAndroid Build Coastguard Worker i3 = (const float*) ((uintptr_t) i2 + input_width);
226*4bdc9457SAndroid Build Coastguard Worker
227*4bdc9457SAndroid Build Coastguard Worker o0 = o1;
228*4bdc9457SAndroid Build Coastguard Worker o1 = (float*) ((uintptr_t) o0 + input_width);
229*4bdc9457SAndroid Build Coastguard Worker
230*4bdc9457SAndroid Build Coastguard Worker output_height = doz(output_height, 2);
231*4bdc9457SAndroid Build Coastguard Worker } while (output_height != 0);
232*4bdc9457SAndroid Build Coastguard Worker }
233*4bdc9457SAndroid Build Coastguard Worker
xnn_qs8_vcvt_ukernel__ssse3_x32(size_t n,const int8_t * x,int8_t * y,const union xnn_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS (1)])234*4bdc9457SAndroid Build Coastguard Worker void xnn_qs8_vcvt_ukernel__ssse3_x32(
235*4bdc9457SAndroid Build Coastguard Worker size_t n,
236*4bdc9457SAndroid Build Coastguard Worker const int8_t* x,
237*4bdc9457SAndroid Build Coastguard Worker int8_t* y,
238*4bdc9457SAndroid Build Coastguard Worker const union xnn_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
239*4bdc9457SAndroid Build Coastguard Worker {
240*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
241*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(int8_t) == 0);
242*4bdc9457SAndroid Build Coastguard Worker assert(x != NULL);
243*4bdc9457SAndroid Build Coastguard Worker assert(y != NULL);
244*4bdc9457SAndroid Build Coastguard Worker
245*4bdc9457SAndroid Build Coastguard Worker const __m128i vinput_zero_point = _mm_load_si128((const __m128i*) params->ssse3.input_zero_point);
246*4bdc9457SAndroid Build Coastguard Worker const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->ssse3.multiplier);
247*4bdc9457SAndroid Build Coastguard Worker const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->ssse3.output_zero_point);
248*4bdc9457SAndroid Build Coastguard Worker for (; n >= 32 * sizeof(int8_t); n -= 32 * sizeof(int8_t)) {
249*4bdc9457SAndroid Build Coastguard Worker const __m128i vx0 = _mm_loadu_si128((const __m128i*) x);
250*4bdc9457SAndroid Build Coastguard Worker const __m128i vx1 = _mm_loadu_si128((const __m128i*) (x + 16));
251*4bdc9457SAndroid Build Coastguard Worker x += 32;
252*4bdc9457SAndroid Build Coastguard Worker
253*4bdc9457SAndroid Build Coastguard Worker const __m128i vm0 = _mm_cmpgt_epi8(_mm_setzero_si128(), vx0);
254*4bdc9457SAndroid Build Coastguard Worker __m128i vacc0 = _mm_unpacklo_epi8(vx0, vm0);
255*4bdc9457SAndroid Build Coastguard Worker __m128i vacc1 = _mm_unpackhi_epi8(vx0, vm0);
256*4bdc9457SAndroid Build Coastguard Worker const __m128i vm1 = _mm_cmpgt_epi8(_mm_setzero_si128(), vx1);
257*4bdc9457SAndroid Build Coastguard Worker __m128i vacc2 = _mm_unpacklo_epi8(vx1, vm1);
258*4bdc9457SAndroid Build Coastguard Worker __m128i vacc3 = _mm_unpackhi_epi8(vx1, vm1);
259*4bdc9457SAndroid Build Coastguard Worker
260*4bdc9457SAndroid Build Coastguard Worker vacc0 = _mm_sub_epi16(vinput_zero_point, vacc0);
261*4bdc9457SAndroid Build Coastguard Worker vacc1 = _mm_sub_epi16(vinput_zero_point, vacc1);
262*4bdc9457SAndroid Build Coastguard Worker vacc2 = _mm_sub_epi16(vinput_zero_point, vacc2);
263*4bdc9457SAndroid Build Coastguard Worker vacc3 = _mm_sub_epi16(vinput_zero_point, vacc3);
264*4bdc9457SAndroid Build Coastguard Worker
265*4bdc9457SAndroid Build Coastguard Worker vacc0 = _mm_slli_epi16(vacc0, 7);
266*4bdc9457SAndroid Build Coastguard Worker vacc1 = _mm_slli_epi16(vacc1, 7);
267*4bdc9457SAndroid Build Coastguard Worker vacc2 = _mm_slli_epi16(vacc2, 7);
268*4bdc9457SAndroid Build Coastguard Worker vacc3 = _mm_slli_epi16(vacc3, 7);
269*4bdc9457SAndroid Build Coastguard Worker
270*4bdc9457SAndroid Build Coastguard Worker vacc0 = _mm_mulhrs_epi16(vacc0, vmultiplier);
271*4bdc9457SAndroid Build Coastguard Worker vacc1 = _mm_mulhrs_epi16(vacc1, vmultiplier);
272*4bdc9457SAndroid Build Coastguard Worker vacc2 = _mm_mulhrs_epi16(vacc2, vmultiplier);
273*4bdc9457SAndroid Build Coastguard Worker vacc3 = _mm_mulhrs_epi16(vacc3, vmultiplier);
274*4bdc9457SAndroid Build Coastguard Worker
275*4bdc9457SAndroid Build Coastguard Worker vacc0 = _mm_adds_epi16(vacc0, voutput_zero_point);
276*4bdc9457SAndroid Build Coastguard Worker vacc1 = _mm_adds_epi16(vacc1, voutput_zero_point);
277*4bdc9457SAndroid Build Coastguard Worker vacc2 = _mm_adds_epi16(vacc2, voutput_zero_point);
278*4bdc9457SAndroid Build Coastguard Worker vacc3 = _mm_adds_epi16(vacc3, voutput_zero_point);
279*4bdc9457SAndroid Build Coastguard Worker
280*4bdc9457SAndroid Build Coastguard Worker const __m128i vy0 = _mm_packs_epi16(vacc0, vacc1);
281*4bdc9457SAndroid Build Coastguard Worker const __m128i vy1 = _mm_packs_epi16(vacc2, vacc3);
282*4bdc9457SAndroid Build Coastguard Worker
283*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, vy0);
284*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (y + 16), vy1);
285*4bdc9457SAndroid Build Coastguard Worker y += 32;
286*4bdc9457SAndroid Build Coastguard Worker }
287*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
288*4bdc9457SAndroid Build Coastguard Worker const __m128i vx = _mm_loadu_si128((const __m128i*) x);
289*4bdc9457SAndroid Build Coastguard Worker x += 16;
290*4bdc9457SAndroid Build Coastguard Worker
291*4bdc9457SAndroid Build Coastguard Worker const __m128i vm = _mm_cmpgt_epi8(_mm_setzero_si128(), vx);
292*4bdc9457SAndroid Build Coastguard Worker __m128i vacc_lo = _mm_unpacklo_epi8(vx, vm);
293*4bdc9457SAndroid Build Coastguard Worker __m128i vacc_hi = _mm_unpackhi_epi8(vx, vm);
294*4bdc9457SAndroid Build Coastguard Worker vacc_lo = _mm_sub_epi16(vinput_zero_point, vacc_lo);
295*4bdc9457SAndroid Build Coastguard Worker vacc_hi = _mm_sub_epi16(vinput_zero_point, vacc_hi);
296*4bdc9457SAndroid Build Coastguard Worker vacc_lo = _mm_slli_epi16(vacc_lo, 7);
297*4bdc9457SAndroid Build Coastguard Worker vacc_hi = _mm_slli_epi16(vacc_hi, 7);
298*4bdc9457SAndroid Build Coastguard Worker vacc_lo = _mm_mulhrs_epi16(vacc_lo, vmultiplier);
299*4bdc9457SAndroid Build Coastguard Worker vacc_hi = _mm_mulhrs_epi16(vacc_hi, vmultiplier);
300*4bdc9457SAndroid Build Coastguard Worker vacc_lo = _mm_adds_epi16(vacc_lo, voutput_zero_point);
301*4bdc9457SAndroid Build Coastguard Worker vacc_hi = _mm_adds_epi16(vacc_hi, voutput_zero_point);
302*4bdc9457SAndroid Build Coastguard Worker
303*4bdc9457SAndroid Build Coastguard Worker const __m128i vy = _mm_packs_epi16(vacc_lo, vacc_hi);
304*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, vy);
305*4bdc9457SAndroid Build Coastguard Worker y += 16;
306*4bdc9457SAndroid Build Coastguard Worker }
307*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
308*4bdc9457SAndroid Build Coastguard Worker assert(n >= 1 * sizeof(int8_t));
309*4bdc9457SAndroid Build Coastguard Worker assert(n <= 15 * sizeof(int8_t));
310*4bdc9457SAndroid Build Coastguard Worker
311*4bdc9457SAndroid Build Coastguard Worker const __m128i vx = _mm_loadu_si128((const __m128i*) x);
312*4bdc9457SAndroid Build Coastguard Worker
313*4bdc9457SAndroid Build Coastguard Worker const __m128i vm = _mm_cmpgt_epi8(_mm_setzero_si128(), vx);
314*4bdc9457SAndroid Build Coastguard Worker __m128i vacc_lo = _mm_unpacklo_epi8(vx, vm);
315*4bdc9457SAndroid Build Coastguard Worker __m128i vacc_hi = _mm_unpackhi_epi8(vx, vm);
316*4bdc9457SAndroid Build Coastguard Worker vacc_lo = _mm_sub_epi16(vinput_zero_point, vacc_lo);
317*4bdc9457SAndroid Build Coastguard Worker vacc_hi = _mm_sub_epi16(vinput_zero_point, vacc_hi);
318*4bdc9457SAndroid Build Coastguard Worker vacc_lo = _mm_slli_epi16(vacc_lo, 7);
319*4bdc9457SAndroid Build Coastguard Worker vacc_hi = _mm_slli_epi16(vacc_hi, 7);
320*4bdc9457SAndroid Build Coastguard Worker vacc_lo = _mm_mulhrs_epi16(vacc_lo, vmultiplier);
321*4bdc9457SAndroid Build Coastguard Worker vacc_hi = _mm_mulhrs_epi16(vacc_hi, vmultiplier);
322*4bdc9457SAndroid Build Coastguard Worker vacc_lo = _mm_adds_epi16(vacc_lo, voutput_zero_point);
323*4bdc9457SAndroid Build Coastguard Worker vacc_hi = _mm_adds_epi16(vacc_hi, voutput_zero_point);
324*4bdc9457SAndroid Build Coastguard Worker
325*4bdc9457SAndroid Build Coastguard Worker __m128i vy = _mm_packs_epi16(vacc_lo, vacc_hi);
326*4bdc9457SAndroid Build Coastguard Worker if (n & (8 * sizeof(int8_t))) {
327*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) y, vy);
328*4bdc9457SAndroid Build Coastguard Worker vy = _mm_unpackhi_epi64(vy, vy);
329*4bdc9457SAndroid Build Coastguard Worker y += 8;
330*4bdc9457SAndroid Build Coastguard Worker }
331*4bdc9457SAndroid Build Coastguard Worker if (n & (4 * sizeof(int8_t))) {
332*4bdc9457SAndroid Build Coastguard Worker unaligned_store_u32(y, (uint32_t) _mm_cvtsi128_si32(vy));
333*4bdc9457SAndroid Build Coastguard Worker vy = _mm_srli_epi64(vy, 32);
334*4bdc9457SAndroid Build Coastguard Worker y += 4;
335*4bdc9457SAndroid Build Coastguard Worker }
336*4bdc9457SAndroid Build Coastguard Worker uint32_t vy_lo = (uint32_t) _mm_cvtsi128_si32(vy);
337*4bdc9457SAndroid Build Coastguard Worker if (n & (2 * sizeof(int8_t))) {
338*4bdc9457SAndroid Build Coastguard Worker unaligned_store_u16(y, (uint16_t) vy_lo);
339*4bdc9457SAndroid Build Coastguard Worker vy_lo >>= 16;
340*4bdc9457SAndroid Build Coastguard Worker y += 2;
341*4bdc9457SAndroid Build Coastguard Worker }
342*4bdc9457SAndroid Build Coastguard Worker if (n & (1 * sizeof(int8_t))) {
343*4bdc9457SAndroid Build Coastguard Worker *y = (int8_t) vy_lo;
344*4bdc9457SAndroid Build Coastguard Worker }
345*4bdc9457SAndroid Build Coastguard Worker }
346*4bdc9457SAndroid Build Coastguard Worker }
347*4bdc9457SAndroid Build Coastguard Worker
xnn_qs8_vlrelu_ukernel__ssse3_x32(size_t n,const int8_t * x,int8_t * y,const union xnn_qs8_lrelu_params params[restrict XNN_MIN_ELEMENTS (1)])348*4bdc9457SAndroid Build Coastguard Worker void xnn_qs8_vlrelu_ukernel__ssse3_x32(
349*4bdc9457SAndroid Build Coastguard Worker size_t n,
350*4bdc9457SAndroid Build Coastguard Worker const int8_t* x,
351*4bdc9457SAndroid Build Coastguard Worker int8_t* y,
352*4bdc9457SAndroid Build Coastguard Worker const union xnn_qs8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
353*4bdc9457SAndroid Build Coastguard Worker {
354*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
355*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(int8_t) == 0);
356*4bdc9457SAndroid Build Coastguard Worker assert(x != NULL);
357*4bdc9457SAndroid Build Coastguard Worker assert(y != NULL);
358*4bdc9457SAndroid Build Coastguard Worker
359*4bdc9457SAndroid Build Coastguard Worker const __m128i vinput_zero_point = _mm_load_si128((const __m128i*) params->sse2.input_zero_point);
360*4bdc9457SAndroid Build Coastguard Worker const __m128i vmultiplier_diff = _mm_load_si128((const __m128i*) params->sse2.multiplier_diff);
361*4bdc9457SAndroid Build Coastguard Worker const __m128i vmultiplier_base = _mm_load_si128((const __m128i*) params->sse2.multiplier_base);
362*4bdc9457SAndroid Build Coastguard Worker const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
363*4bdc9457SAndroid Build Coastguard Worker for (; n >= 32 * sizeof(int8_t); n -= 32 * sizeof(int8_t)) {
364*4bdc9457SAndroid Build Coastguard Worker const __m128i vx0 = _mm_loadu_si128((const __m128i*) x);
365*4bdc9457SAndroid Build Coastguard Worker const __m128i vx1 = _mm_loadu_si128((const __m128i*) (x + 16));
366*4bdc9457SAndroid Build Coastguard Worker x += 32;
367*4bdc9457SAndroid Build Coastguard Worker
368*4bdc9457SAndroid Build Coastguard Worker const __m128i vm0 = _mm_cmpgt_epi8(_mm_setzero_si128(), vx0);
369*4bdc9457SAndroid Build Coastguard Worker __m128i vacc0 = _mm_unpacklo_epi8(vx0, vm0);
370*4bdc9457SAndroid Build Coastguard Worker __m128i vacc1 = _mm_unpackhi_epi8(vx0, vm0);
371*4bdc9457SAndroid Build Coastguard Worker const __m128i vm1 = _mm_cmpgt_epi8(_mm_setzero_si128(), vx1);
372*4bdc9457SAndroid Build Coastguard Worker __m128i vacc2 = _mm_unpacklo_epi8(vx1, vm1);
373*4bdc9457SAndroid Build Coastguard Worker __m128i vacc3 = _mm_unpackhi_epi8(vx1, vm1);
374*4bdc9457SAndroid Build Coastguard Worker
375*4bdc9457SAndroid Build Coastguard Worker __m128i vmultiplier0 = _mm_cmpgt_epi16(vacc0, vinput_zero_point);
376*4bdc9457SAndroid Build Coastguard Worker vacc0 = _mm_sub_epi16(vinput_zero_point, vacc0);
377*4bdc9457SAndroid Build Coastguard Worker __m128i vmultiplier1 = _mm_cmpgt_epi16(vacc1, vinput_zero_point);
378*4bdc9457SAndroid Build Coastguard Worker vacc1 = _mm_sub_epi16(vinput_zero_point, vacc1);
379*4bdc9457SAndroid Build Coastguard Worker __m128i vmultiplier2 = _mm_cmpgt_epi16(vacc2, vinput_zero_point);
380*4bdc9457SAndroid Build Coastguard Worker vacc2 = _mm_sub_epi16(vinput_zero_point, vacc2);
381*4bdc9457SAndroid Build Coastguard Worker __m128i vmultiplier3 = _mm_cmpgt_epi16(vacc3, vinput_zero_point);
382*4bdc9457SAndroid Build Coastguard Worker vacc3 = _mm_sub_epi16(vinput_zero_point, vacc3);
383*4bdc9457SAndroid Build Coastguard Worker
384*4bdc9457SAndroid Build Coastguard Worker vmultiplier0 = _mm_and_si128(vmultiplier0, vmultiplier_diff);
385*4bdc9457SAndroid Build Coastguard Worker vacc0 = _mm_slli_epi16(vacc0, 7);
386*4bdc9457SAndroid Build Coastguard Worker vmultiplier0 = _mm_xor_si128(vmultiplier0, vmultiplier_base);
387*4bdc9457SAndroid Build Coastguard Worker vmultiplier1 = _mm_and_si128(vmultiplier1, vmultiplier_diff);
388*4bdc9457SAndroid Build Coastguard Worker vacc1 = _mm_slli_epi16(vacc1, 7);
389*4bdc9457SAndroid Build Coastguard Worker vmultiplier1 = _mm_xor_si128(vmultiplier1, vmultiplier_base);
390*4bdc9457SAndroid Build Coastguard Worker vmultiplier2 = _mm_and_si128(vmultiplier2, vmultiplier_diff);
391*4bdc9457SAndroid Build Coastguard Worker vacc2 = _mm_slli_epi16(vacc2, 7);
392*4bdc9457SAndroid Build Coastguard Worker vmultiplier2 = _mm_xor_si128(vmultiplier2, vmultiplier_base);
393*4bdc9457SAndroid Build Coastguard Worker vmultiplier3 = _mm_and_si128(vmultiplier3, vmultiplier_diff);
394*4bdc9457SAndroid Build Coastguard Worker vacc3 = _mm_slli_epi16(vacc3, 7);
395*4bdc9457SAndroid Build Coastguard Worker vmultiplier3 = _mm_xor_si128(vmultiplier3, vmultiplier_base);
396*4bdc9457SAndroid Build Coastguard Worker
397*4bdc9457SAndroid Build Coastguard Worker vacc0 = _mm_mulhrs_epi16(vacc0, vmultiplier0);
398*4bdc9457SAndroid Build Coastguard Worker vacc1 = _mm_mulhrs_epi16(vacc1, vmultiplier1);
399*4bdc9457SAndroid Build Coastguard Worker vacc2 = _mm_mulhrs_epi16(vacc2, vmultiplier2);
400*4bdc9457SAndroid Build Coastguard Worker vacc3 = _mm_mulhrs_epi16(vacc3, vmultiplier3);
401*4bdc9457SAndroid Build Coastguard Worker
402*4bdc9457SAndroid Build Coastguard Worker vacc0 = _mm_adds_epi16(vacc0, voutput_zero_point);
403*4bdc9457SAndroid Build Coastguard Worker vacc1 = _mm_adds_epi16(vacc1, voutput_zero_point);
404*4bdc9457SAndroid Build Coastguard Worker vacc2 = _mm_adds_epi16(vacc2, voutput_zero_point);
405*4bdc9457SAndroid Build Coastguard Worker vacc3 = _mm_adds_epi16(vacc3, voutput_zero_point);
406*4bdc9457SAndroid Build Coastguard Worker
407*4bdc9457SAndroid Build Coastguard Worker const __m128i vy0 = _mm_packs_epi16(vacc0, vacc1);
408*4bdc9457SAndroid Build Coastguard Worker const __m128i vy1 = _mm_packs_epi16(vacc2, vacc3);
409*4bdc9457SAndroid Build Coastguard Worker
410*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, vy0);
411*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (y + 16), vy1);
412*4bdc9457SAndroid Build Coastguard Worker y += 32;
413*4bdc9457SAndroid Build Coastguard Worker }
414*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
415*4bdc9457SAndroid Build Coastguard Worker const __m128i vx = _mm_loadu_si128((const __m128i*) x);
416*4bdc9457SAndroid Build Coastguard Worker x += 16;
417*4bdc9457SAndroid Build Coastguard Worker
418*4bdc9457SAndroid Build Coastguard Worker const __m128i vm = _mm_cmpgt_epi8(_mm_setzero_si128(), vx);
419*4bdc9457SAndroid Build Coastguard Worker __m128i vacc_lo = _mm_unpacklo_epi8(vx, vm);
420*4bdc9457SAndroid Build Coastguard Worker __m128i vacc_hi = _mm_unpackhi_epi8(vx, vm);
421*4bdc9457SAndroid Build Coastguard Worker __m128i vmultiplier_lo = _mm_cmpgt_epi16(vacc_lo, vinput_zero_point);
422*4bdc9457SAndroid Build Coastguard Worker __m128i vmultiplier_hi = _mm_cmpgt_epi16(vacc_hi, vinput_zero_point);
423*4bdc9457SAndroid Build Coastguard Worker vacc_lo = _mm_sub_epi16(vinput_zero_point, vacc_lo);
424*4bdc9457SAndroid Build Coastguard Worker vacc_hi = _mm_sub_epi16(vinput_zero_point, vacc_hi);
425*4bdc9457SAndroid Build Coastguard Worker vmultiplier_lo = _mm_and_si128(vmultiplier_lo, vmultiplier_diff);
426*4bdc9457SAndroid Build Coastguard Worker vmultiplier_hi = _mm_and_si128(vmultiplier_hi, vmultiplier_diff);
427*4bdc9457SAndroid Build Coastguard Worker vacc_lo = _mm_slli_epi16(vacc_lo, 7);
428*4bdc9457SAndroid Build Coastguard Worker vacc_hi = _mm_slli_epi16(vacc_hi, 7);
429*4bdc9457SAndroid Build Coastguard Worker vmultiplier_lo = _mm_xor_si128(vmultiplier_lo, vmultiplier_base);
430*4bdc9457SAndroid Build Coastguard Worker vmultiplier_hi = _mm_xor_si128(vmultiplier_hi, vmultiplier_base);
431*4bdc9457SAndroid Build Coastguard Worker vacc_lo = _mm_mulhrs_epi16(vacc_lo, vmultiplier_lo);
432*4bdc9457SAndroid Build Coastguard Worker vacc_hi = _mm_mulhrs_epi16(vacc_hi, vmultiplier_hi);
433*4bdc9457SAndroid Build Coastguard Worker vacc_lo = _mm_adds_epi16(vacc_lo, voutput_zero_point);
434*4bdc9457SAndroid Build Coastguard Worker vacc_hi = _mm_adds_epi16(vacc_hi, voutput_zero_point);
435*4bdc9457SAndroid Build Coastguard Worker
436*4bdc9457SAndroid Build Coastguard Worker const __m128i vy = _mm_packs_epi16(vacc_lo, vacc_hi);
437*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, vy);
438*4bdc9457SAndroid Build Coastguard Worker y += 16;
439*4bdc9457SAndroid Build Coastguard Worker }
440*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
441*4bdc9457SAndroid Build Coastguard Worker assert(n >= 1 * sizeof(int8_t));
442*4bdc9457SAndroid Build Coastguard Worker assert(n <= 15 * sizeof(int8_t));
443*4bdc9457SAndroid Build Coastguard Worker
444*4bdc9457SAndroid Build Coastguard Worker const __m128i vx = _mm_loadu_si128((const __m128i*) x);
445*4bdc9457SAndroid Build Coastguard Worker
446*4bdc9457SAndroid Build Coastguard Worker const __m128i vm = _mm_cmpgt_epi8(_mm_setzero_si128(), vx);
447*4bdc9457SAndroid Build Coastguard Worker __m128i vacc_lo = _mm_unpacklo_epi8(vx, vm);
448*4bdc9457SAndroid Build Coastguard Worker __m128i vacc_hi = _mm_unpackhi_epi8(vx, vm);
449*4bdc9457SAndroid Build Coastguard Worker __m128i vmultiplier_lo = _mm_cmpgt_epi16(vacc_lo, vinput_zero_point);
450*4bdc9457SAndroid Build Coastguard Worker __m128i vmultiplier_hi = _mm_cmpgt_epi16(vacc_hi, vinput_zero_point);
451*4bdc9457SAndroid Build Coastguard Worker vacc_lo = _mm_sub_epi16(vinput_zero_point, vacc_lo);
452*4bdc9457SAndroid Build Coastguard Worker vacc_hi = _mm_sub_epi16(vinput_zero_point, vacc_hi);
453*4bdc9457SAndroid Build Coastguard Worker vmultiplier_lo = _mm_and_si128(vmultiplier_lo, vmultiplier_diff);
454*4bdc9457SAndroid Build Coastguard Worker vmultiplier_hi = _mm_and_si128(vmultiplier_hi, vmultiplier_diff);
455*4bdc9457SAndroid Build Coastguard Worker vacc_lo = _mm_slli_epi16(vacc_lo, 7);
456*4bdc9457SAndroid Build Coastguard Worker vacc_hi = _mm_slli_epi16(vacc_hi, 7);
457*4bdc9457SAndroid Build Coastguard Worker vmultiplier_lo = _mm_xor_si128(vmultiplier_lo, vmultiplier_base);
458*4bdc9457SAndroid Build Coastguard Worker vmultiplier_hi = _mm_xor_si128(vmultiplier_hi, vmultiplier_base);
459*4bdc9457SAndroid Build Coastguard Worker vacc_lo = _mm_mulhrs_epi16(vacc_lo, vmultiplier_lo);
460*4bdc9457SAndroid Build Coastguard Worker vacc_hi = _mm_mulhrs_epi16(vacc_hi, vmultiplier_hi);
461*4bdc9457SAndroid Build Coastguard Worker vacc_lo = _mm_adds_epi16(vacc_lo, voutput_zero_point);
462*4bdc9457SAndroid Build Coastguard Worker vacc_hi = _mm_adds_epi16(vacc_hi, voutput_zero_point);
463*4bdc9457SAndroid Build Coastguard Worker
464*4bdc9457SAndroid Build Coastguard Worker __m128i vy = _mm_packs_epi16(vacc_lo, vacc_hi);
465*4bdc9457SAndroid Build Coastguard Worker if (n & (8 * sizeof(int8_t))) {
466*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) y, vy);
467*4bdc9457SAndroid Build Coastguard Worker vy = _mm_unpackhi_epi64(vy, vy);
468*4bdc9457SAndroid Build Coastguard Worker y += 8;
469*4bdc9457SAndroid Build Coastguard Worker }
470*4bdc9457SAndroid Build Coastguard Worker if (n & (4 * sizeof(int8_t))) {
471*4bdc9457SAndroid Build Coastguard Worker unaligned_store_u32(y, (uint32_t) _mm_cvtsi128_si32(vy));
472*4bdc9457SAndroid Build Coastguard Worker vy = _mm_srli_epi64(vy, 32);
473*4bdc9457SAndroid Build Coastguard Worker y += 4;
474*4bdc9457SAndroid Build Coastguard Worker }
475*4bdc9457SAndroid Build Coastguard Worker uint32_t vy_lo = (uint32_t) _mm_cvtsi128_si32(vy);
476*4bdc9457SAndroid Build Coastguard Worker if (n & (2 * sizeof(int8_t))) {
477*4bdc9457SAndroid Build Coastguard Worker unaligned_store_u16(y, (uint16_t) vy_lo);
478*4bdc9457SAndroid Build Coastguard Worker vy_lo >>= 16;
479*4bdc9457SAndroid Build Coastguard Worker y += 2;
480*4bdc9457SAndroid Build Coastguard Worker }
481*4bdc9457SAndroid Build Coastguard Worker if (n & (1 * sizeof(int8_t))) {
482*4bdc9457SAndroid Build Coastguard Worker *y = (int8_t) vy_lo;
483*4bdc9457SAndroid Build Coastguard Worker }
484*4bdc9457SAndroid Build Coastguard Worker }
485*4bdc9457SAndroid Build Coastguard Worker }
486*4bdc9457SAndroid Build Coastguard Worker
xnn_qu8_vcvt_ukernel__ssse3_x32(size_t n,const uint8_t * x,uint8_t * y,const union xnn_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS (1)])487*4bdc9457SAndroid Build Coastguard Worker void xnn_qu8_vcvt_ukernel__ssse3_x32(
488*4bdc9457SAndroid Build Coastguard Worker size_t n,
489*4bdc9457SAndroid Build Coastguard Worker const uint8_t* x,
490*4bdc9457SAndroid Build Coastguard Worker uint8_t* y,
491*4bdc9457SAndroid Build Coastguard Worker const union xnn_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
492*4bdc9457SAndroid Build Coastguard Worker {
493*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
494*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(uint8_t) == 0);
495*4bdc9457SAndroid Build Coastguard Worker assert(x != NULL);
496*4bdc9457SAndroid Build Coastguard Worker assert(y != NULL);
497*4bdc9457SAndroid Build Coastguard Worker
498*4bdc9457SAndroid Build Coastguard Worker const __m128i vinput_zero_point = _mm_load_si128((const __m128i*) params->ssse3.input_zero_point);
499*4bdc9457SAndroid Build Coastguard Worker const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->ssse3.multiplier);
500*4bdc9457SAndroid Build Coastguard Worker const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->ssse3.output_zero_point);
501*4bdc9457SAndroid Build Coastguard Worker const __m128i vzero = _mm_setzero_si128();
502*4bdc9457SAndroid Build Coastguard Worker for (; n >= 32 * sizeof(uint8_t); n -= 32 * sizeof(uint8_t)) {
503*4bdc9457SAndroid Build Coastguard Worker const __m128i vx0 = _mm_loadu_si128((const __m128i*) x);
504*4bdc9457SAndroid Build Coastguard Worker const __m128i vx1 = _mm_loadu_si128((const __m128i*) (x + 16));
505*4bdc9457SAndroid Build Coastguard Worker x += 32;
506*4bdc9457SAndroid Build Coastguard Worker
507*4bdc9457SAndroid Build Coastguard Worker __m128i vacc0 = _mm_unpacklo_epi8(vx0, vzero);
508*4bdc9457SAndroid Build Coastguard Worker __m128i vacc1 = _mm_unpackhi_epi8(vx0, vzero);
509*4bdc9457SAndroid Build Coastguard Worker __m128i vacc2 = _mm_unpacklo_epi8(vx1, vzero);
510*4bdc9457SAndroid Build Coastguard Worker __m128i vacc3 = _mm_unpackhi_epi8(vx1, vzero);
511*4bdc9457SAndroid Build Coastguard Worker
512*4bdc9457SAndroid Build Coastguard Worker vacc0 = _mm_sub_epi16(vinput_zero_point, vacc0);
513*4bdc9457SAndroid Build Coastguard Worker vacc1 = _mm_sub_epi16(vinput_zero_point, vacc1);
514*4bdc9457SAndroid Build Coastguard Worker vacc2 = _mm_sub_epi16(vinput_zero_point, vacc2);
515*4bdc9457SAndroid Build Coastguard Worker vacc3 = _mm_sub_epi16(vinput_zero_point, vacc3);
516*4bdc9457SAndroid Build Coastguard Worker
517*4bdc9457SAndroid Build Coastguard Worker vacc0 = _mm_slli_epi16(vacc0, 7);
518*4bdc9457SAndroid Build Coastguard Worker vacc1 = _mm_slli_epi16(vacc1, 7);
519*4bdc9457SAndroid Build Coastguard Worker vacc2 = _mm_slli_epi16(vacc2, 7);
520*4bdc9457SAndroid Build Coastguard Worker vacc3 = _mm_slli_epi16(vacc3, 7);
521*4bdc9457SAndroid Build Coastguard Worker
522*4bdc9457SAndroid Build Coastguard Worker vacc0 = _mm_mulhrs_epi16(vacc0, vmultiplier);
523*4bdc9457SAndroid Build Coastguard Worker vacc1 = _mm_mulhrs_epi16(vacc1, vmultiplier);
524*4bdc9457SAndroid Build Coastguard Worker vacc2 = _mm_mulhrs_epi16(vacc2, vmultiplier);
525*4bdc9457SAndroid Build Coastguard Worker vacc3 = _mm_mulhrs_epi16(vacc3, vmultiplier);
526*4bdc9457SAndroid Build Coastguard Worker
527*4bdc9457SAndroid Build Coastguard Worker vacc0 = _mm_adds_epi16(vacc0, voutput_zero_point);
528*4bdc9457SAndroid Build Coastguard Worker vacc1 = _mm_adds_epi16(vacc1, voutput_zero_point);
529*4bdc9457SAndroid Build Coastguard Worker vacc2 = _mm_adds_epi16(vacc2, voutput_zero_point);
530*4bdc9457SAndroid Build Coastguard Worker vacc3 = _mm_adds_epi16(vacc3, voutput_zero_point);
531*4bdc9457SAndroid Build Coastguard Worker
532*4bdc9457SAndroid Build Coastguard Worker const __m128i vy0 = _mm_packus_epi16(vacc0, vacc1);
533*4bdc9457SAndroid Build Coastguard Worker const __m128i vy1 = _mm_packus_epi16(vacc2, vacc3);
534*4bdc9457SAndroid Build Coastguard Worker
535*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, vy0);
536*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (y + 16), vy1);
537*4bdc9457SAndroid Build Coastguard Worker y += 32;
538*4bdc9457SAndroid Build Coastguard Worker }
539*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
540*4bdc9457SAndroid Build Coastguard Worker const __m128i vx = _mm_loadu_si128((const __m128i*) x);
541*4bdc9457SAndroid Build Coastguard Worker x += 16;
542*4bdc9457SAndroid Build Coastguard Worker
543*4bdc9457SAndroid Build Coastguard Worker __m128i vacc_lo = _mm_unpacklo_epi8(vx, vzero);
544*4bdc9457SAndroid Build Coastguard Worker __m128i vacc_hi = _mm_unpackhi_epi8(vx, vzero);
545*4bdc9457SAndroid Build Coastguard Worker vacc_lo = _mm_sub_epi16(vinput_zero_point, vacc_lo);
546*4bdc9457SAndroid Build Coastguard Worker vacc_hi = _mm_sub_epi16(vinput_zero_point, vacc_hi);
547*4bdc9457SAndroid Build Coastguard Worker vacc_lo = _mm_slli_epi16(vacc_lo, 7);
548*4bdc9457SAndroid Build Coastguard Worker vacc_hi = _mm_slli_epi16(vacc_hi, 7);
549*4bdc9457SAndroid Build Coastguard Worker vacc_lo = _mm_mulhrs_epi16(vacc_lo, vmultiplier);
550*4bdc9457SAndroid Build Coastguard Worker vacc_hi = _mm_mulhrs_epi16(vacc_hi, vmultiplier);
551*4bdc9457SAndroid Build Coastguard Worker vacc_lo = _mm_adds_epi16(vacc_lo, voutput_zero_point);
552*4bdc9457SAndroid Build Coastguard Worker vacc_hi = _mm_adds_epi16(vacc_hi, voutput_zero_point);
553*4bdc9457SAndroid Build Coastguard Worker
554*4bdc9457SAndroid Build Coastguard Worker const __m128i vy = _mm_packus_epi16(vacc_lo, vacc_hi);
555*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, vy);
556*4bdc9457SAndroid Build Coastguard Worker y += 16;
557*4bdc9457SAndroid Build Coastguard Worker }
558*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
559*4bdc9457SAndroid Build Coastguard Worker assert(n >= 1 * sizeof(uint8_t));
560*4bdc9457SAndroid Build Coastguard Worker assert(n <= 15 * sizeof(uint8_t));
561*4bdc9457SAndroid Build Coastguard Worker
562*4bdc9457SAndroid Build Coastguard Worker const __m128i vx = _mm_loadu_si128((const __m128i*) x);
563*4bdc9457SAndroid Build Coastguard Worker
564*4bdc9457SAndroid Build Coastguard Worker __m128i vacc_lo = _mm_unpacklo_epi8(vx, vzero);
565*4bdc9457SAndroid Build Coastguard Worker __m128i vacc_hi = _mm_unpackhi_epi8(vx, vzero);
566*4bdc9457SAndroid Build Coastguard Worker vacc_lo = _mm_sub_epi16(vinput_zero_point, vacc_lo);
567*4bdc9457SAndroid Build Coastguard Worker vacc_hi = _mm_sub_epi16(vinput_zero_point, vacc_hi);
568*4bdc9457SAndroid Build Coastguard Worker vacc_lo = _mm_slli_epi16(vacc_lo, 7);
569*4bdc9457SAndroid Build Coastguard Worker vacc_hi = _mm_slli_epi16(vacc_hi, 7);
570*4bdc9457SAndroid Build Coastguard Worker vacc_lo = _mm_mulhrs_epi16(vacc_lo, vmultiplier);
571*4bdc9457SAndroid Build Coastguard Worker vacc_hi = _mm_mulhrs_epi16(vacc_hi, vmultiplier);
572*4bdc9457SAndroid Build Coastguard Worker vacc_lo = _mm_adds_epi16(vacc_lo, voutput_zero_point);
573*4bdc9457SAndroid Build Coastguard Worker vacc_hi = _mm_adds_epi16(vacc_hi, voutput_zero_point);
574*4bdc9457SAndroid Build Coastguard Worker
575*4bdc9457SAndroid Build Coastguard Worker __m128i vy = _mm_packus_epi16(vacc_lo, vacc_hi);
576*4bdc9457SAndroid Build Coastguard Worker if (n & (8 * sizeof(uint8_t))) {
577*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) y, vy);
578*4bdc9457SAndroid Build Coastguard Worker vy = _mm_unpackhi_epi64(vy, vy);
579*4bdc9457SAndroid Build Coastguard Worker y += 8;
580*4bdc9457SAndroid Build Coastguard Worker }
581*4bdc9457SAndroid Build Coastguard Worker if (n & (4 * sizeof(uint8_t))) {
582*4bdc9457SAndroid Build Coastguard Worker unaligned_store_u32(y, (uint32_t) _mm_cvtsi128_si32(vy));
583*4bdc9457SAndroid Build Coastguard Worker vy = _mm_srli_epi64(vy, 32);
584*4bdc9457SAndroid Build Coastguard Worker y += 4;
585*4bdc9457SAndroid Build Coastguard Worker }
586*4bdc9457SAndroid Build Coastguard Worker uint32_t vy_lo = (uint32_t) _mm_cvtsi128_si32(vy);
587*4bdc9457SAndroid Build Coastguard Worker if (n & (2 * sizeof(uint8_t))) {
588*4bdc9457SAndroid Build Coastguard Worker unaligned_store_u16(y, (uint16_t) vy_lo);
589*4bdc9457SAndroid Build Coastguard Worker vy_lo >>= 16;
590*4bdc9457SAndroid Build Coastguard Worker y += 2;
591*4bdc9457SAndroid Build Coastguard Worker }
592*4bdc9457SAndroid Build Coastguard Worker if (n & (1 * sizeof(uint8_t))) {
593*4bdc9457SAndroid Build Coastguard Worker *y = (uint8_t) vy_lo;
594*4bdc9457SAndroid Build Coastguard Worker }
595*4bdc9457SAndroid Build Coastguard Worker }
596*4bdc9457SAndroid Build Coastguard Worker }
597*4bdc9457SAndroid Build Coastguard Worker
xnn_qu8_vlrelu_ukernel__ssse3_x32(size_t n,const uint8_t * x,uint8_t * y,const union xnn_qu8_lrelu_params params[restrict XNN_MIN_ELEMENTS (1)])598*4bdc9457SAndroid Build Coastguard Worker void xnn_qu8_vlrelu_ukernel__ssse3_x32(
599*4bdc9457SAndroid Build Coastguard Worker size_t n,
600*4bdc9457SAndroid Build Coastguard Worker const uint8_t* x,
601*4bdc9457SAndroid Build Coastguard Worker uint8_t* y,
602*4bdc9457SAndroid Build Coastguard Worker const union xnn_qu8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
603*4bdc9457SAndroid Build Coastguard Worker {
604*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
605*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(uint8_t) == 0);
606*4bdc9457SAndroid Build Coastguard Worker assert(x != NULL);
607*4bdc9457SAndroid Build Coastguard Worker assert(y != NULL);
608*4bdc9457SAndroid Build Coastguard Worker
609*4bdc9457SAndroid Build Coastguard Worker const __m128i vinput_zero_point = _mm_load_si128((const __m128i*) params->sse2.input_zero_point);
610*4bdc9457SAndroid Build Coastguard Worker const __m128i vmultiplier_diff = _mm_load_si128((const __m128i*) params->sse2.multiplier_diff);
611*4bdc9457SAndroid Build Coastguard Worker const __m128i vmultiplier_base = _mm_load_si128((const __m128i*) params->sse2.multiplier_base);
612*4bdc9457SAndroid Build Coastguard Worker const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
613*4bdc9457SAndroid Build Coastguard Worker const __m128i vzero = _mm_setzero_si128();
614*4bdc9457SAndroid Build Coastguard Worker for (; n >= 32 * sizeof(uint8_t); n -= 32 * sizeof(uint8_t)) {
615*4bdc9457SAndroid Build Coastguard Worker const __m128i vx0 = _mm_loadu_si128((const __m128i*) x);
616*4bdc9457SAndroid Build Coastguard Worker const __m128i vx1 = _mm_loadu_si128((const __m128i*) (x + 16));
617*4bdc9457SAndroid Build Coastguard Worker x += 32;
618*4bdc9457SAndroid Build Coastguard Worker
619*4bdc9457SAndroid Build Coastguard Worker __m128i vacc0 = _mm_unpacklo_epi8(vx0, vzero);
620*4bdc9457SAndroid Build Coastguard Worker __m128i vacc1 = _mm_unpackhi_epi8(vx0, vzero);
621*4bdc9457SAndroid Build Coastguard Worker __m128i vacc2 = _mm_unpacklo_epi8(vx1, vzero);
622*4bdc9457SAndroid Build Coastguard Worker __m128i vacc3 = _mm_unpackhi_epi8(vx1, vzero);
623*4bdc9457SAndroid Build Coastguard Worker
624*4bdc9457SAndroid Build Coastguard Worker __m128i vmultiplier0 = _mm_cmpgt_epi16(vacc0, vinput_zero_point);
625*4bdc9457SAndroid Build Coastguard Worker vacc0 = _mm_sub_epi16(vinput_zero_point, vacc0);
626*4bdc9457SAndroid Build Coastguard Worker __m128i vmultiplier1 = _mm_cmpgt_epi16(vacc1, vinput_zero_point);
627*4bdc9457SAndroid Build Coastguard Worker vacc1 = _mm_sub_epi16(vinput_zero_point, vacc1);
628*4bdc9457SAndroid Build Coastguard Worker __m128i vmultiplier2 = _mm_cmpgt_epi16(vacc2, vinput_zero_point);
629*4bdc9457SAndroid Build Coastguard Worker vacc2 = _mm_sub_epi16(vinput_zero_point, vacc2);
630*4bdc9457SAndroid Build Coastguard Worker __m128i vmultiplier3 = _mm_cmpgt_epi16(vacc3, vinput_zero_point);
631*4bdc9457SAndroid Build Coastguard Worker vacc3 = _mm_sub_epi16(vinput_zero_point, vacc3);
632*4bdc9457SAndroid Build Coastguard Worker
633*4bdc9457SAndroid Build Coastguard Worker vmultiplier0 = _mm_and_si128(vmultiplier0, vmultiplier_diff);
634*4bdc9457SAndroid Build Coastguard Worker vacc0 = _mm_slli_epi16(vacc0, 7);
635*4bdc9457SAndroid Build Coastguard Worker vmultiplier0 = _mm_xor_si128(vmultiplier0, vmultiplier_base);
636*4bdc9457SAndroid Build Coastguard Worker vmultiplier1 = _mm_and_si128(vmultiplier1, vmultiplier_diff);
637*4bdc9457SAndroid Build Coastguard Worker vacc1 = _mm_slli_epi16(vacc1, 7);
638*4bdc9457SAndroid Build Coastguard Worker vmultiplier1 = _mm_xor_si128(vmultiplier1, vmultiplier_base);
639*4bdc9457SAndroid Build Coastguard Worker vmultiplier2 = _mm_and_si128(vmultiplier2, vmultiplier_diff);
640*4bdc9457SAndroid Build Coastguard Worker vacc2 = _mm_slli_epi16(vacc2, 7);
641*4bdc9457SAndroid Build Coastguard Worker vmultiplier2 = _mm_xor_si128(vmultiplier2, vmultiplier_base);
642*4bdc9457SAndroid Build Coastguard Worker vmultiplier3 = _mm_and_si128(vmultiplier3, vmultiplier_diff);
643*4bdc9457SAndroid Build Coastguard Worker vacc3 = _mm_slli_epi16(vacc3, 7);
644*4bdc9457SAndroid Build Coastguard Worker vmultiplier3 = _mm_xor_si128(vmultiplier3, vmultiplier_base);
645*4bdc9457SAndroid Build Coastguard Worker
646*4bdc9457SAndroid Build Coastguard Worker vacc0 = _mm_mulhrs_epi16(vacc0, vmultiplier0);
647*4bdc9457SAndroid Build Coastguard Worker vacc1 = _mm_mulhrs_epi16(vacc1, vmultiplier1);
648*4bdc9457SAndroid Build Coastguard Worker vacc2 = _mm_mulhrs_epi16(vacc2, vmultiplier2);
649*4bdc9457SAndroid Build Coastguard Worker vacc3 = _mm_mulhrs_epi16(vacc3, vmultiplier3);
650*4bdc9457SAndroid Build Coastguard Worker
651*4bdc9457SAndroid Build Coastguard Worker vacc0 = _mm_adds_epi16(vacc0, voutput_zero_point);
652*4bdc9457SAndroid Build Coastguard Worker vacc1 = _mm_adds_epi16(vacc1, voutput_zero_point);
653*4bdc9457SAndroid Build Coastguard Worker vacc2 = _mm_adds_epi16(vacc2, voutput_zero_point);
654*4bdc9457SAndroid Build Coastguard Worker vacc3 = _mm_adds_epi16(vacc3, voutput_zero_point);
655*4bdc9457SAndroid Build Coastguard Worker
656*4bdc9457SAndroid Build Coastguard Worker const __m128i vy0 = _mm_packus_epi16(vacc0, vacc1);
657*4bdc9457SAndroid Build Coastguard Worker const __m128i vy1 = _mm_packus_epi16(vacc2, vacc3);
658*4bdc9457SAndroid Build Coastguard Worker
659*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, vy0);
660*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (y + 16), vy1);
661*4bdc9457SAndroid Build Coastguard Worker y += 32;
662*4bdc9457SAndroid Build Coastguard Worker }
663*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
664*4bdc9457SAndroid Build Coastguard Worker const __m128i vx = _mm_loadu_si128((const __m128i*) x);
665*4bdc9457SAndroid Build Coastguard Worker x += 16;
666*4bdc9457SAndroid Build Coastguard Worker
667*4bdc9457SAndroid Build Coastguard Worker __m128i vacc_lo = _mm_unpacklo_epi8(vx, vzero);
668*4bdc9457SAndroid Build Coastguard Worker __m128i vacc_hi = _mm_unpackhi_epi8(vx, vzero);
669*4bdc9457SAndroid Build Coastguard Worker __m128i vmultiplier_lo = _mm_cmpgt_epi16(vacc_lo, vinput_zero_point);
670*4bdc9457SAndroid Build Coastguard Worker __m128i vmultiplier_hi = _mm_cmpgt_epi16(vacc_hi, vinput_zero_point);
671*4bdc9457SAndroid Build Coastguard Worker vacc_lo = _mm_sub_epi16(vinput_zero_point, vacc_lo);
672*4bdc9457SAndroid Build Coastguard Worker vacc_hi = _mm_sub_epi16(vinput_zero_point, vacc_hi);
673*4bdc9457SAndroid Build Coastguard Worker vmultiplier_lo = _mm_and_si128(vmultiplier_lo, vmultiplier_diff);
674*4bdc9457SAndroid Build Coastguard Worker vmultiplier_hi = _mm_and_si128(vmultiplier_hi, vmultiplier_diff);
675*4bdc9457SAndroid Build Coastguard Worker vacc_lo = _mm_slli_epi16(vacc_lo, 7);
676*4bdc9457SAndroid Build Coastguard Worker vacc_hi = _mm_slli_epi16(vacc_hi, 7);
677*4bdc9457SAndroid Build Coastguard Worker vmultiplier_lo = _mm_xor_si128(vmultiplier_lo, vmultiplier_base);
678*4bdc9457SAndroid Build Coastguard Worker vmultiplier_hi = _mm_xor_si128(vmultiplier_hi, vmultiplier_base);
679*4bdc9457SAndroid Build Coastguard Worker vacc_lo = _mm_mulhrs_epi16(vacc_lo, vmultiplier_lo);
680*4bdc9457SAndroid Build Coastguard Worker vacc_hi = _mm_mulhrs_epi16(vacc_hi, vmultiplier_hi);
681*4bdc9457SAndroid Build Coastguard Worker vacc_lo = _mm_adds_epi16(vacc_lo, voutput_zero_point);
682*4bdc9457SAndroid Build Coastguard Worker vacc_hi = _mm_adds_epi16(vacc_hi, voutput_zero_point);
683*4bdc9457SAndroid Build Coastguard Worker
684*4bdc9457SAndroid Build Coastguard Worker const __m128i vy = _mm_packus_epi16(vacc_lo, vacc_hi);
685*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, vy);
686*4bdc9457SAndroid Build Coastguard Worker y += 16;
687*4bdc9457SAndroid Build Coastguard Worker }
688*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
689*4bdc9457SAndroid Build Coastguard Worker assert(n >= 1 * sizeof(uint8_t));
690*4bdc9457SAndroid Build Coastguard Worker assert(n <= 15 * sizeof(uint8_t));
691*4bdc9457SAndroid Build Coastguard Worker
692*4bdc9457SAndroid Build Coastguard Worker const __m128i vx = _mm_loadu_si128((const __m128i*) x);
693*4bdc9457SAndroid Build Coastguard Worker
694*4bdc9457SAndroid Build Coastguard Worker __m128i vacc_lo = _mm_unpacklo_epi8(vx, vzero);
695*4bdc9457SAndroid Build Coastguard Worker __m128i vacc_hi = _mm_unpackhi_epi8(vx, vzero);
696*4bdc9457SAndroid Build Coastguard Worker __m128i vmultiplier_lo = _mm_cmpgt_epi16(vacc_lo, vinput_zero_point);
697*4bdc9457SAndroid Build Coastguard Worker __m128i vmultiplier_hi = _mm_cmpgt_epi16(vacc_hi, vinput_zero_point);
698*4bdc9457SAndroid Build Coastguard Worker vacc_lo = _mm_sub_epi16(vinput_zero_point, vacc_lo);
699*4bdc9457SAndroid Build Coastguard Worker vacc_hi = _mm_sub_epi16(vinput_zero_point, vacc_hi);
700*4bdc9457SAndroid Build Coastguard Worker vmultiplier_lo = _mm_and_si128(vmultiplier_lo, vmultiplier_diff);
701*4bdc9457SAndroid Build Coastguard Worker vmultiplier_hi = _mm_and_si128(vmultiplier_hi, vmultiplier_diff);
702*4bdc9457SAndroid Build Coastguard Worker vacc_lo = _mm_slli_epi16(vacc_lo, 7);
703*4bdc9457SAndroid Build Coastguard Worker vacc_hi = _mm_slli_epi16(vacc_hi, 7);
704*4bdc9457SAndroid Build Coastguard Worker vmultiplier_lo = _mm_xor_si128(vmultiplier_lo, vmultiplier_base);
705*4bdc9457SAndroid Build Coastguard Worker vmultiplier_hi = _mm_xor_si128(vmultiplier_hi, vmultiplier_base);
706*4bdc9457SAndroid Build Coastguard Worker vacc_lo = _mm_mulhrs_epi16(vacc_lo, vmultiplier_lo);
707*4bdc9457SAndroid Build Coastguard Worker vacc_hi = _mm_mulhrs_epi16(vacc_hi, vmultiplier_hi);
708*4bdc9457SAndroid Build Coastguard Worker vacc_lo = _mm_adds_epi16(vacc_lo, voutput_zero_point);
709*4bdc9457SAndroid Build Coastguard Worker vacc_hi = _mm_adds_epi16(vacc_hi, voutput_zero_point);
710*4bdc9457SAndroid Build Coastguard Worker
711*4bdc9457SAndroid Build Coastguard Worker __m128i vy = _mm_packus_epi16(vacc_lo, vacc_hi);
712*4bdc9457SAndroid Build Coastguard Worker if (n & (8 * sizeof(uint8_t))) {
713*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) y, vy);
714*4bdc9457SAndroid Build Coastguard Worker vy = _mm_unpackhi_epi64(vy, vy);
715*4bdc9457SAndroid Build Coastguard Worker y += 8;
716*4bdc9457SAndroid Build Coastguard Worker }
717*4bdc9457SAndroid Build Coastguard Worker if (n & (4 * sizeof(uint8_t))) {
718*4bdc9457SAndroid Build Coastguard Worker unaligned_store_u32(y, (uint32_t) _mm_cvtsi128_si32(vy));
719*4bdc9457SAndroid Build Coastguard Worker vy = _mm_srli_epi64(vy, 32);
720*4bdc9457SAndroid Build Coastguard Worker y += 4;
721*4bdc9457SAndroid Build Coastguard Worker }
722*4bdc9457SAndroid Build Coastguard Worker uint32_t vy_lo = (uint32_t) _mm_cvtsi128_si32(vy);
723*4bdc9457SAndroid Build Coastguard Worker if (n & (2 * sizeof(uint8_t))) {
724*4bdc9457SAndroid Build Coastguard Worker unaligned_store_u16(y, (uint16_t) vy_lo);
725*4bdc9457SAndroid Build Coastguard Worker vy_lo >>= 16;
726*4bdc9457SAndroid Build Coastguard Worker y += 2;
727*4bdc9457SAndroid Build Coastguard Worker }
728*4bdc9457SAndroid Build Coastguard Worker if (n & (1 * sizeof(uint8_t))) {
729*4bdc9457SAndroid Build Coastguard Worker *y = (uint8_t) vy_lo;
730*4bdc9457SAndroid Build Coastguard Worker }
731*4bdc9457SAndroid Build Coastguard Worker }
732*4bdc9457SAndroid Build Coastguard Worker }
733