// Auto-generated file. Do not edit!
//   Template: src/x8-lut/wasmsimd.c.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <wasm_simd128.h>

#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/lut.h>
#include <xnnpack/common.h>


void xnn_x8_lut_ukernel__wasmsimd_x32(
    size_t n,
    const uint8_t* x,
    uint8_t* y,
    const uint8_t t[restrict XNN_MIN_ELEMENTS(256)])
{
  assert(n != 0);
  assert(x != NULL);
  assert(y != NULL);

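  // The 256-entry lookup table is loaded as 16 vectors of 16 bytes each.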
  const v128_t vtable0 = wasm_v128_load(t);
  const v128_t vtable1 = wasm_v128_load(t + 16);
  const v128_t vtable2 = wasm_v128_load(t + 32);
  const v128_t vtable3 = wasm_v128_load(t + 48);
  const v128_t vtable4 = wasm_v128_load(t + 64);
  const v128_t vtable5 = wasm_v128_load(t + 80);
  const v128_t vtable6 = wasm_v128_load(t + 96);
  const v128_t vtable7 = wasm_v128_load(t + 112);
  const v128_t vtable8 = wasm_v128_load(t + 128);
  const v128_t vtable9 = wasm_v128_load(t + 144);
  const v128_t vtable10 = wasm_v128_load(t + 160);
  const v128_t vtable11 = wasm_v128_load(t + 176);
  const v128_t vtable12 = wasm_v128_load(t + 192);
  const v128_t vtable13 = wasm_v128_load(t + 208);
  const v128_t vtable14 = wasm_v128_load(t + 224);
  const v128_t vtable15 = wasm_v128_load(t + 240);
  const v128_t voffset = wasm_i8x16_const_splat(16);
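  // Main loop: translate 32 input bytes per iteration (two 16-byte vectors).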
  for (; n >= 32 * sizeof(uint8_t); n -= 32 * sizeof(uint8_t)) {
    v128_t vx0 = wasm_v128_load(x);
    v128_t vx1 = wasm_v128_load(x + 16);
    x += 32;

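    // wasm_i8x16_swizzle returns 0 for any index byte outside 0..15, so this
    // first swizzle resolves only the inputs that hit table entries 0..15.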
    v128_t vy0 = wasm_i8x16_swizzle(vtable0, vx0);
    v128_t vy1 = wasm_i8x16_swizzle(vtable0, vx1);

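    // For each remaining 16-entry slice: subtract 16 so that indices belonging
    // to that slice fall into 0..15, swizzle, and OR the result in. Out-of-range
    // lanes yield 0, so OR-ing the per-slice results assembles the full lookup.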
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable1, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable1, vx1));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable2, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable2, vx1));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable3, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable3, vx1));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable4, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable4, vx1));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable5, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable5, vx1));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable6, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable6, vx1));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable7, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable7, vx1));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable8, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable8, vx1));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable9, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable9, vx1));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable10, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable10, vx1));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable11, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable11, vx1));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable12, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable12, vx1));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable13, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable13, vx1));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable14, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable14, vx1));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable15, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable15, vx1));

    wasm_v128_store(y, vy0);
    wasm_v128_store(y + 16, vy1);
    y += 32;
  }
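  // Handle one remaining full 16-byte vector, if any, with the same 16-slice lookup.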
  for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
    v128_t vx = wasm_v128_load(x);
    x += 16;

    v128_t vy = wasm_i8x16_swizzle(vtable0, vx);

    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable1, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable2, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable3, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable4, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable5, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable6, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable7, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable8, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable9, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable10, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable11, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable12, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable13, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable14, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable15, vx));

    wasm_v128_store(y, vy);
    y += 16;
  }
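  // Tail: 1 to 15 bytes remain. Translate a full vector of inputs, then store
  // only the valid low bytes.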
  if XNN_UNLIKELY(n != 0) {
    v128_t vx = wasm_v128_load(x);

    v128_t vy = wasm_i8x16_swizzle(vtable0, vx);

    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable1, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable2, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable3, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable4, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable5, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable6, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable7, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable8, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable9, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable10, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable11, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable12, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable13, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable14, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable15, vx));

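    // Store 8, 4, 2, and finally 1 byte(s) according to the bits set in n,
    // shifting the already-stored bytes out of the vector after each partial store.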
    if (n & (8 * sizeof(uint8_t))) {
      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
      vy = wasm_v64x2_shuffle(vy, vy, 1, 1);
      y += 8;
    }
    if (n & (4 * sizeof(uint8_t))) {
      *((float*) y) = wasm_f32x4_extract_lane(vy, 0);
      vy = wasm_u64x2_shr(vy, 32);
      y += 4;
    }
    uint32_t vy_lo = wasm_i32x4_extract_lane(vy, 0);
    if (n & (2 * sizeof(uint8_t))) {
      *((uint16_t*) y) = (uint16_t) vy_lo;
      vy_lo >>= 16;
      y += 2;
    }
    if (n & (1 * sizeof(uint8_t))) {
      *y = (uint8_t) vy_lo;
    }
  }
}