// Auto-generated file. Do not edit!
//   Template: src/x8-lut/wasmsimd.c.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <wasm_simd128.h>

#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/lut.h>
#include <xnnpack/common.h>


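// Applies a 256-entry byte lookup table to n input bytes; the scalar
// equivalent is: for (size_t i = 0; i < n; i++) { y[i] = t[x[i]]; }
// The lookup is vectorized with WasmSIMD swizzles, 64 bytes per main-loop
// iteration.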
void xnn_x8_lut_ukernel__wasmsimd_x64(
    size_t n,
    const uint8_t* x,
    uint8_t* y,
    const uint8_t t[restrict XNN_MIN_ELEMENTS(256)])
{
  assert(n != 0);
  assert(x != NULL);
  assert(y != NULL);

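  // Split the 256-byte table into 16 sub-tables of 16 bytes each:
  // wasm_i8x16_swizzle indexes within a single 16-byte vector, so sub-table k
  // covers input values 16*k..16*k+15.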
  const v128_t vtable0 = wasm_v128_load(t);
  const v128_t vtable1 = wasm_v128_load(t + 16);
  const v128_t vtable2 = wasm_v128_load(t + 32);
  const v128_t vtable3 = wasm_v128_load(t + 48);
  const v128_t vtable4 = wasm_v128_load(t + 64);
  const v128_t vtable5 = wasm_v128_load(t + 80);
  const v128_t vtable6 = wasm_v128_load(t + 96);
  const v128_t vtable7 = wasm_v128_load(t + 112);
  const v128_t vtable8 = wasm_v128_load(t + 128);
  const v128_t vtable9 = wasm_v128_load(t + 144);
  const v128_t vtable10 = wasm_v128_load(t + 160);
  const v128_t vtable11 = wasm_v128_load(t + 176);
  const v128_t vtable12 = wasm_v128_load(t + 192);
  const v128_t vtable13 = wasm_v128_load(t + 208);
  const v128_t vtable14 = wasm_v128_load(t + 224);
  const v128_t vtable15 = wasm_v128_load(t + 240);
  const v128_t voffset = wasm_i8x16_const_splat(16);
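  // Main loop: 64 bytes per iteration in 4 vector registers. For each
  // successive sub-table the indices are shifted down by 16 (voffset);
  // wasm_i8x16_swizzle yields 0 for index bytes outside 0..15, so OR-ing the
  // per-sub-table results assembles the complete lookup.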
  for (; n >= 64 * sizeof(uint8_t); n -= 64 * sizeof(uint8_t)) {
    v128_t vx0 = wasm_v128_load(x);
    v128_t vx1 = wasm_v128_load(x + 16);
    v128_t vx2 = wasm_v128_load(x + 32);
    v128_t vx3 = wasm_v128_load(x + 48);
    x += 64;

    v128_t vy0 = wasm_i8x16_swizzle(vtable0, vx0);
    v128_t vy1 = wasm_i8x16_swizzle(vtable0, vx1);
    v128_t vy2 = wasm_i8x16_swizzle(vtable0, vx2);
    v128_t vy3 = wasm_i8x16_swizzle(vtable0, vx3);

    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable1, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable1, vx1));
    vx2 = wasm_i8x16_sub(vx2, voffset);
    vy2 = wasm_v128_or(vy2, wasm_i8x16_swizzle(vtable1, vx2));
    vx3 = wasm_i8x16_sub(vx3, voffset);
    vy3 = wasm_v128_or(vy3, wasm_i8x16_swizzle(vtable1, vx3));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable2, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable2, vx1));
    vx2 = wasm_i8x16_sub(vx2, voffset);
    vy2 = wasm_v128_or(vy2, wasm_i8x16_swizzle(vtable2, vx2));
    vx3 = wasm_i8x16_sub(vx3, voffset);
    vy3 = wasm_v128_or(vy3, wasm_i8x16_swizzle(vtable2, vx3));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable3, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable3, vx1));
    vx2 = wasm_i8x16_sub(vx2, voffset);
    vy2 = wasm_v128_or(vy2, wasm_i8x16_swizzle(vtable3, vx2));
    vx3 = wasm_i8x16_sub(vx3, voffset);
    vy3 = wasm_v128_or(vy3, wasm_i8x16_swizzle(vtable3, vx3));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable4, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable4, vx1));
    vx2 = wasm_i8x16_sub(vx2, voffset);
    vy2 = wasm_v128_or(vy2, wasm_i8x16_swizzle(vtable4, vx2));
    vx3 = wasm_i8x16_sub(vx3, voffset);
    vy3 = wasm_v128_or(vy3, wasm_i8x16_swizzle(vtable4, vx3));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable5, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable5, vx1));
    vx2 = wasm_i8x16_sub(vx2, voffset);
    vy2 = wasm_v128_or(vy2, wasm_i8x16_swizzle(vtable5, vx2));
    vx3 = wasm_i8x16_sub(vx3, voffset);
    vy3 = wasm_v128_or(vy3, wasm_i8x16_swizzle(vtable5, vx3));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable6, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable6, vx1));
    vx2 = wasm_i8x16_sub(vx2, voffset);
    vy2 = wasm_v128_or(vy2, wasm_i8x16_swizzle(vtable6, vx2));
    vx3 = wasm_i8x16_sub(vx3, voffset);
    vy3 = wasm_v128_or(vy3, wasm_i8x16_swizzle(vtable6, vx3));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable7, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable7, vx1));
    vx2 = wasm_i8x16_sub(vx2, voffset);
    vy2 = wasm_v128_or(vy2, wasm_i8x16_swizzle(vtable7, vx2));
    vx3 = wasm_i8x16_sub(vx3, voffset);
    vy3 = wasm_v128_or(vy3, wasm_i8x16_swizzle(vtable7, vx3));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable8, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable8, vx1));
    vx2 = wasm_i8x16_sub(vx2, voffset);
    vy2 = wasm_v128_or(vy2, wasm_i8x16_swizzle(vtable8, vx2));
    vx3 = wasm_i8x16_sub(vx3, voffset);
    vy3 = wasm_v128_or(vy3, wasm_i8x16_swizzle(vtable8, vx3));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable9, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable9, vx1));
    vx2 = wasm_i8x16_sub(vx2, voffset);
    vy2 = wasm_v128_or(vy2, wasm_i8x16_swizzle(vtable9, vx2));
    vx3 = wasm_i8x16_sub(vx3, voffset);
    vy3 = wasm_v128_or(vy3, wasm_i8x16_swizzle(vtable9, vx3));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable10, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable10, vx1));
    vx2 = wasm_i8x16_sub(vx2, voffset);
    vy2 = wasm_v128_or(vy2, wasm_i8x16_swizzle(vtable10, vx2));
    vx3 = wasm_i8x16_sub(vx3, voffset);
    vy3 = wasm_v128_or(vy3, wasm_i8x16_swizzle(vtable10, vx3));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable11, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable11, vx1));
    vx2 = wasm_i8x16_sub(vx2, voffset);
    vy2 = wasm_v128_or(vy2, wasm_i8x16_swizzle(vtable11, vx2));
    vx3 = wasm_i8x16_sub(vx3, voffset);
    vy3 = wasm_v128_or(vy3, wasm_i8x16_swizzle(vtable11, vx3));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable12, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable12, vx1));
    vx2 = wasm_i8x16_sub(vx2, voffset);
    vy2 = wasm_v128_or(vy2, wasm_i8x16_swizzle(vtable12, vx2));
    vx3 = wasm_i8x16_sub(vx3, voffset);
    vy3 = wasm_v128_or(vy3, wasm_i8x16_swizzle(vtable12, vx3));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable13, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable13, vx1));
    vx2 = wasm_i8x16_sub(vx2, voffset);
    vy2 = wasm_v128_or(vy2, wasm_i8x16_swizzle(vtable13, vx2));
    vx3 = wasm_i8x16_sub(vx3, voffset);
    vy3 = wasm_v128_or(vy3, wasm_i8x16_swizzle(vtable13, vx3));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable14, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable14, vx1));
    vx2 = wasm_i8x16_sub(vx2, voffset);
    vy2 = wasm_v128_or(vy2, wasm_i8x16_swizzle(vtable14, vx2));
    vx3 = wasm_i8x16_sub(vx3, voffset);
    vy3 = wasm_v128_or(vy3, wasm_i8x16_swizzle(vtable14, vx3));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable15, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable15, vx1));
    vx2 = wasm_i8x16_sub(vx2, voffset);
    vy2 = wasm_v128_or(vy2, wasm_i8x16_swizzle(vtable15, vx2));
    vx3 = wasm_i8x16_sub(vx3, voffset);
    vy3 = wasm_v128_or(vy3, wasm_i8x16_swizzle(vtable15, vx3));

    wasm_v128_store(y, vy0);
    wasm_v128_store(y + 16, vy1);
    wasm_v128_store(y + 32, vy2);
    wasm_v128_store(y + 48, vy3);
    y += 64;
  }
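  // Leftover loop: the same sub-table cascade, one 16-byte vector at a time.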
  for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
    v128_t vx = wasm_v128_load(x);
    x += 16;

    v128_t vy = wasm_i8x16_swizzle(vtable0, vx);

    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable1, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable2, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable3, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable4, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable5, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable6, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable7, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable8, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable9, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable10, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable11, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable12, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable13, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable14, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable15, vx));

    wasm_v128_store(y, vy);
    y += 16;
  }
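  // Tail: 1..15 remaining bytes. A full 16-byte vector is loaded and looked
  // up, then only the low n result bytes are stored, in 8/4/2/1-byte pieces
  // selected by the bits of n.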
  if XNN_UNLIKELY(n != 0) {
    v128_t vx = wasm_v128_load(x);

    v128_t vy = wasm_i8x16_swizzle(vtable0, vx);

    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable1, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable2, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable3, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable4, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable5, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable6, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable7, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable8, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable9, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable10, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable11, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable12, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable13, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable14, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable15, vx));

    if (n & (8 * sizeof(uint8_t))) {
      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
      vy = wasm_v64x2_shuffle(vy, vy, 1, 1);
      y += 8;
    }
    if (n & (4 * sizeof(uint8_t))) {
      *((float*) y) = wasm_f32x4_extract_lane(vy, 0);
      vy = wasm_u64x2_shr(vy, 32);
      y += 4;
    }
    uint32_t vy_lo = wasm_i32x4_extract_lane(vy, 0);
    if (n & (2 * sizeof(uint8_t))) {
      *((uint16_t*) y) = (uint16_t) vy_lo;
      vy_lo >>= 16;
      y += 2;
    }
    if (n & (1 * sizeof(uint8_t))) {
      *y = (uint8_t) vy_lo;
    }
  }
}