// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include <xmmintrin.h>

#include <xnnpack/packx.h>


// Packs up to 4 rows of 32-bit elements into a column-interleaved buffer.
//
// For every column index c in [0, k) the kernel writes 4 consecutive elements
//   y[4*c + 0..3] = { row0[c], row1[c], row2[c], row3[c] }
// i.e. it transposes a 4 x k tile into k groups of 4.
//
// Parameters:
//   m        - number of valid input rows; must be in [1, 4].
//   k        - number of elements per row; must be non-zero.
//   x        - pointer to the first input row (read with unaligned loads).
//   x_stride - distance between consecutive input rows, in BYTES.
//   y        - output buffer receiving 4 * k elements. It is written with
//              aligned 128-bit stores, so it must be 16-byte aligned.
//
// When m < 4 the missing rows alias the last valid row, so the corresponding
// output lanes hold duplicates of that row rather than reading out of bounds.
void xnn_x32_packx_ukernel_4x__sse(
    size_t m,
    size_t k,
    const uint32_t* restrict x,
    size_t x_stride,
    uint32_t* restrict y)
{
  assert(m != 0);
  assert(m <= 4);  // 4-row kernel: a larger m would silently drop rows.
  assert(k != 0);

  // The data is only moved, never operated on arithmetically, so the 32-bit
  // payload is handled through float pointers to use the SSE1 intrinsics.
  const float* x0 = (const float*) x;
  // Row pointers are advanced in bytes; rows past m are clamped to the
  // previous row so every load below stays inside valid input.
  const float* x1 = (const float*) ((uintptr_t) x0 + x_stride);
  if (m < 2) {
    x1 = x0;
  }
  const float* x2 = (const float*) ((uintptr_t) x1 + x_stride);
  if (m <= 2) {
    x2 = x1;
  }
  const float* x3 = (const float*) ((uintptr_t) x2 + x_stride);
  if (m != 4) {
    x3 = x2;
  }

  float* restrict y_f32 = (float*) y;

  // Main loop: transpose a 4x4 tile (4 columns from each of the 4 rows).
  for (; k >= 4; k -= 4) {
    const __m128 vx0 = _mm_loadu_ps(x0);
    x0 += 4;
    const __m128 vx1 = _mm_loadu_ps(x1);
    x1 += 4;
    const __m128 vx2 = _mm_loadu_ps(x2);
    x2 += 4;
    const __m128 vx3 = _mm_loadu_ps(x3);
    x3 += 4;

    // Interleave row pairs:
    //   vt0 = { x0[0], x1[0], x0[1], x1[1] }   vt1 = { x0[2], x1[2], x0[3], x1[3] }
    //   vt2 = { x2[0], x3[0], x2[1], x3[1] }   vt3 = { x2[2], x3[2], x2[3], x3[3] }
    const __m128 vt0 = _mm_unpacklo_ps(vx0, vx1);
    const __m128 vt1 = _mm_unpackhi_ps(vx0, vx1);
    const __m128 vt2 = _mm_unpacklo_ps(vx2, vx3);
    const __m128 vt3 = _mm_unpackhi_ps(vx2, vx3);

    // Combine 64-bit halves so that each vyN holds column N of all 4 rows.
    const __m128 vy0 = _mm_movelh_ps(vt0, vt2);
    _mm_store_ps(y_f32, vy0);

    const __m128 vy1 = _mm_movehl_ps(vt2, vt0);
    _mm_store_ps(y_f32 + 4, vy1);

    const __m128 vy2 = _mm_movelh_ps(vt1, vt3);
    _mm_store_ps(y_f32 + 8, vy2);

    const __m128 vy3 = _mm_movehl_ps(vt3, vt1);
    _mm_store_ps(y_f32 + 12, vy3);

    y_f32 += 16;
  }
  // Remainder: 1 to 3 leftover columns, handled one column at a time.
  if XNN_UNLIKELY(k != 0) {
    do {
      // Scalar loads place the element in lane 0 and zero the other lanes.
      const __m128 vx0 = _mm_load_ss(x0);
      x0 += 1;
      const __m128 vx1 = _mm_load_ss(x1);
      x1 += 1;
      const __m128 vx2 = _mm_load_ss(x2);
      x2 += 1;
      const __m128 vx3 = _mm_load_ss(x3);
      x3 += 1;

      // Gather lane 0 of each row into { x0, x1, x2, x3 }.
      const __m128 vx01 = _mm_unpacklo_ps(vx0, vx1);
      const __m128 vx23 = _mm_unpacklo_ps(vx2, vx3);
      const __m128 vy = _mm_movelh_ps(vx01, vx23);

      _mm_store_ps(y_f32, vy);
      y_f32 += 4;
    } while (--k != 0);
  }
}