// Auto-generated file. Do not edit!
//   Template: src/x32-transposec/wasmsimd.c.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <wasm_simd128.h>

#include <assert.h>

#include <xnnpack/common.h>
#include <xnnpack/math.h>
#include <xnnpack/transpose.h>

void xnn_x16_transposec_ukernel__8x8_multi_mov_wasmsimd(
    const uint16_t* input,
    uint16_t* output,
    size_t input_stride,
    size_t output_stride,
    size_t block_width,
    size_t block_height) XNN_OOB_READS
{
  assert(output_stride >= block_height * sizeof(uint16_t));
  assert(input_stride >= block_width * sizeof(uint16_t));

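  // The kernel transposes the block in 8x8 tiles of 16-bit elements. The
  // constants below give the tile size in bytes plus the offsets used to step
  // down the rows of a tile and to rewind the pointers to the next column tile.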
  const size_t tile_height = 8;
  const size_t tile_width = 8;
  const size_t tile_hbytes = tile_height * sizeof(uint16_t);
  const size_t tile_wbytes = tile_width * sizeof(uint16_t);
  const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
  const size_t input_offset = tile_height * input_stride;
  const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint16_t) - tile_hbytes;

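  // Eight input row pointers, one per row of the current 8x8 tile. The output
  // pointer starts tile_hbytes behind its first store (oN_offset adds it back),
  // and minus_output_stride lets the store sequence walk backwards across
  // output rows.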
  const uint16_t* i0 = input;
  const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride);
  const uint16_t* i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride);
  const uint16_t* i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride);
  const uint16_t* i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride);
  const uint16_t* i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride);
  const uint16_t* i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride);
  const uint16_t* i7 = (const uint16_t*) ((uintptr_t) i6 + input_stride);
  uint16_t* o = (uint16_t*) ((uintptr_t) output - tile_hbytes);
  const size_t minus_output_stride = -output_stride;

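  // Outer loop: one iteration per tile of up to 8 input columns. Each input
  // column becomes an output row; rem is the index of the last valid column in
  // this tile, and oN_offset moves o to that last output row so the stores can
  // proceed from v0_7 down to v0_0.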
  do {
    const size_t rem = min(block_width - 1, 7);
    const size_t oN_stride = rem * output_stride;
    const size_t oN_offset = oN_stride + tile_hbytes;
    size_t bh = block_height;
    for (; bh >= 8; bh -= 8) {
      const v128_t v3_0 = wasm_v128_load(i0);
      i0 = (uint16_t*) ((uintptr_t) i0 + input_offset);
      const v128_t v3_1 = wasm_v128_load(i1);
      i1 = (uint16_t*) ((uintptr_t) i1 + input_offset);
      const v128_t v3_2 = wasm_v128_load(i2);
      i2 = (uint16_t*) ((uintptr_t) i2 + input_offset);
      const v128_t v3_3 = wasm_v128_load(i3);
      i3 = (uint16_t*) ((uintptr_t) i3 + input_offset);
      const v128_t v3_4 = wasm_v128_load(i4);
      i4 = (uint16_t*) ((uintptr_t) i4 + input_offset);
      const v128_t v3_5 = wasm_v128_load(i5);
      i5 = (uint16_t*) ((uintptr_t) i5 + input_offset);
      const v128_t v3_6 = wasm_v128_load(i6);
      i6 = (uint16_t*) ((uintptr_t) i6 + input_offset);
      const v128_t v3_7 = wasm_v128_load(i7);
      i7 = (uint16_t*) ((uintptr_t) i7 + input_offset);

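      // Three rounds of 16-bit interleaves transpose the 8x8 tile: the first
      // round pairs rows 4 apart, the second rows 2 apart, the third adjacent
      // rows, so v0_k ends up holding column k of the loaded tile.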
      const v128_t v2_0 = wasm_v16x8_shuffle(v3_0, v3_4, 0, 8, 1, 9, 2, 10, 3, 11);
      const v128_t v2_1 = wasm_v16x8_shuffle(v3_0, v3_4, 4, 12, 5, 13, 6, 14, 7, 15);
      const v128_t v2_2 = wasm_v16x8_shuffle(v3_1, v3_5, 0, 8, 1, 9, 2, 10, 3, 11);
      const v128_t v2_3 = wasm_v16x8_shuffle(v3_1, v3_5, 4, 12, 5, 13, 6, 14, 7, 15);
      const v128_t v2_4 = wasm_v16x8_shuffle(v3_2, v3_6, 0, 8, 1, 9, 2, 10, 3, 11);
      const v128_t v2_5 = wasm_v16x8_shuffle(v3_2, v3_6, 4, 12, 5, 13, 6, 14, 7, 15);
      const v128_t v2_6 = wasm_v16x8_shuffle(v3_3, v3_7, 0, 8, 1, 9, 2, 10, 3, 11);
      const v128_t v2_7 = wasm_v16x8_shuffle(v3_3, v3_7, 4, 12, 5, 13, 6, 14, 7, 15);
      const v128_t v1_0 = wasm_v16x8_shuffle(v2_0, v2_4, 0, 8, 1, 9, 2, 10, 3, 11);
      const v128_t v1_1 = wasm_v16x8_shuffle(v2_0, v2_4, 4, 12, 5, 13, 6, 14, 7, 15);
      const v128_t v1_2 = wasm_v16x8_shuffle(v2_1, v2_5, 0, 8, 1, 9, 2, 10, 3, 11);
      const v128_t v1_3 = wasm_v16x8_shuffle(v2_1, v2_5, 4, 12, 5, 13, 6, 14, 7, 15);
      const v128_t v1_4 = wasm_v16x8_shuffle(v2_2, v2_6, 0, 8, 1, 9, 2, 10, 3, 11);
      const v128_t v1_5 = wasm_v16x8_shuffle(v2_2, v2_6, 4, 12, 5, 13, 6, 14, 7, 15);
      const v128_t v1_6 = wasm_v16x8_shuffle(v2_3, v2_7, 0, 8, 1, 9, 2, 10, 3, 11);
      const v128_t v1_7 = wasm_v16x8_shuffle(v2_3, v2_7, 4, 12, 5, 13, 6, 14, 7, 15);
      const v128_t v0_0 = wasm_v16x8_shuffle(v1_0, v1_4, 0, 8, 1, 9, 2, 10, 3, 11);
      const v128_t v0_1 = wasm_v16x8_shuffle(v1_0, v1_4, 4, 12, 5, 13, 6, 14, 7, 15);
      const v128_t v0_2 = wasm_v16x8_shuffle(v1_1, v1_5, 0, 8, 1, 9, 2, 10, 3, 11);
      const v128_t v0_3 = wasm_v16x8_shuffle(v1_1, v1_5, 4, 12, 5, 13, 6, 14, 7, 15);
      const v128_t v0_4 = wasm_v16x8_shuffle(v1_2, v1_6, 0, 8, 1, 9, 2, 10, 3, 11);
      const v128_t v0_5 = wasm_v16x8_shuffle(v1_2, v1_6, 4, 12, 5, 13, 6, 14, 7, 15);
      const v128_t v0_6 = wasm_v16x8_shuffle(v1_3, v1_7, 0, 8, 1, 9, 2, 10, 3, 11);
      const v128_t v0_7 = wasm_v16x8_shuffle(v1_3, v1_7, 4, 12, 5, 13, 6, 14, 7, 15);

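      // Store output rows from v0_7 down to v0_0, stepping o back by one
      // output stride per row. When block_width < 8 the guarded pointer
      // updates keep o in place, so stores for the missing rows collapse onto
      // the last valid row and are overwritten by the final valid store.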
      o = (uint16_t*) ((uintptr_t) o + oN_offset);
      wasm_v128_store(o, v0_7);
      uint16_t *oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 7) {
        o = oN;
      }
      wasm_v128_store(o, v0_6);
      oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 7) {
        o = oN;
      }
      wasm_v128_store(o, v0_5);
      oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 5) {
        o = oN;
      }
      wasm_v128_store(o, v0_4);
      oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 5) {
        o = oN;
      }
      wasm_v128_store(o, v0_3);
      oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 3) {
        o = oN;
      }
      wasm_v128_store(o, v0_2);
      oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 3) {
        o = oN;
      }
      wasm_v128_store(o, v0_1);
      oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 1) {
        o = oN;
      }
      wasm_v128_store(o, v0_0);
    }
    o = (uint16_t*) ((uintptr_t) o + tile_hbytes);

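    // Handle the remaining 1-7 rows of this column tile. Row pointers beyond
    // the remaining height are aliased to i0 so no rows past the block are
    // read, and the unused eighth row register is zeroed instead of loaded.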
    if (bh != 0) {
      const v128_t v3_0 = wasm_v128_load(i0);
      if XNN_UNPREDICTABLE(bh < 2) {
        i1 = i0;
      }
      const v128_t v3_1 = wasm_v128_load(i1);
      if XNN_UNPREDICTABLE(bh <= 2) {
        i2 = i0;
      }
      const v128_t v3_2 = wasm_v128_load(i2);
      if XNN_UNPREDICTABLE(bh < 4) {
        i3 = i0;
      }
      const v128_t v3_3 = wasm_v128_load(i3);
      if XNN_UNPREDICTABLE(bh <= 4) {
        i4 = i0;
      }
      const v128_t v3_4 = wasm_v128_load(i4);
      if XNN_UNPREDICTABLE(bh < 6) {
        i5 = i0;
      }
      const v128_t v3_5 = wasm_v128_load(i5);
      if XNN_UNPREDICTABLE(bh <= 6) {
        i6 = i0;
      }
      const v128_t v3_6 = wasm_v128_load(i6);
      const v128_t v3_7 = wasm_v128_xor(v3_0, v3_0);

      const v128_t v2_0 = wasm_v16x8_shuffle(v3_0, v3_4, 0, 8, 1, 9, 2, 10, 3, 11);
      const v128_t v2_1 = wasm_v16x8_shuffle(v3_0, v3_4, 4, 12, 5, 13, 6, 14, 7, 15);
      const v128_t v2_2 = wasm_v16x8_shuffle(v3_1, v3_5, 0, 8, 1, 9, 2, 10, 3, 11);
      const v128_t v2_3 = wasm_v16x8_shuffle(v3_1, v3_5, 4, 12, 5, 13, 6, 14, 7, 15);
      const v128_t v2_4 = wasm_v16x8_shuffle(v3_2, v3_6, 0, 8, 1, 9, 2, 10, 3, 11);
      const v128_t v2_5 = wasm_v16x8_shuffle(v3_2, v3_6, 4, 12, 5, 13, 6, 14, 7, 15);
      const v128_t v2_6 = wasm_v16x8_shuffle(v3_3, v3_7, 0, 8, 1, 9, 2, 10, 3, 11);
      const v128_t v2_7 = wasm_v16x8_shuffle(v3_3, v3_7, 4, 12, 5, 13, 6, 14, 7, 15);
      const v128_t v1_0 = wasm_v16x8_shuffle(v2_0, v2_4, 0, 8, 1, 9, 2, 10, 3, 11);
      const v128_t v1_1 = wasm_v16x8_shuffle(v2_0, v2_4, 4, 12, 5, 13, 6, 14, 7, 15);
      const v128_t v1_2 = wasm_v16x8_shuffle(v2_1, v2_5, 0, 8, 1, 9, 2, 10, 3, 11);
      const v128_t v1_3 = wasm_v16x8_shuffle(v2_1, v2_5, 4, 12, 5, 13, 6, 14, 7, 15);
      const v128_t v1_4 = wasm_v16x8_shuffle(v2_2, v2_6, 0, 8, 1, 9, 2, 10, 3, 11);
      const v128_t v1_5 = wasm_v16x8_shuffle(v2_2, v2_6, 4, 12, 5, 13, 6, 14, 7, 15);
      const v128_t v1_6 = wasm_v16x8_shuffle(v2_3, v2_7, 0, 8, 1, 9, 2, 10, 3, 11);
      const v128_t v1_7 = wasm_v16x8_shuffle(v2_3, v2_7, 4, 12, 5, 13, 6, 14, 7, 15);

      v128_t v0_0 = wasm_v16x8_shuffle(v1_0, v1_4, 0, 8, 1, 9, 2, 10, 3, 11);
      v128_t v0_1 = wasm_v16x8_shuffle(v1_0, v1_4, 4, 12, 5, 13, 6, 14, 7, 15);
      v128_t v0_2 = wasm_v16x8_shuffle(v1_1, v1_5, 0, 8, 1, 9, 2, 10, 3, 11);
      v128_t v0_3 = wasm_v16x8_shuffle(v1_1, v1_5, 4, 12, 5, 13, 6, 14, 7, 15);
      v128_t v0_4 = wasm_v16x8_shuffle(v1_2, v1_6, 0, 8, 1, 9, 2, 10, 3, 11);
      v128_t v0_5 = wasm_v16x8_shuffle(v1_2, v1_6, 4, 12, 5, 13, 6, 14, 7, 15);
      v128_t v0_6 = wasm_v16x8_shuffle(v1_3, v1_7, 0, 8, 1, 9, 2, 10, 3, 11);
      v128_t v0_7 = wasm_v16x8_shuffle(v1_3, v1_7, 4, 12, 5, 13, 6, 14, 7, 15);

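      // Write the remaining rows in progressively smaller pieces: 4 elements
      // (low 64 bits) if bh & 4, then 2 elements (low 32 bits) if bh & 2, then
      // a single element if bh & 1, shifting each vector down after every pass
      // so the next pass reads the following elements.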
      if (bh & 4) {
        o = (uint16_t*) ((uintptr_t) o + oN_stride);
        *((double*) o) = wasm_f64x2_extract_lane(v0_7, 0);
        uint16_t *oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 7) {
          o = oN;
        }
        *((double*) o) = wasm_f64x2_extract_lane(v0_6, 0);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 7) {
          o = oN;
        }
        *((double*) o) = wasm_f64x2_extract_lane(v0_5, 0);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 5) {
          o = oN;
        }
        *((double*) o) = wasm_f64x2_extract_lane(v0_4, 0);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 5) {
          o = oN;
        }
        *((double*) o) = wasm_f64x2_extract_lane(v0_3, 0);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 3) {
          o = oN;
        }
        *((double*) o) = wasm_f64x2_extract_lane(v0_2, 0);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 3) {
          o = oN;
        }
        *((double*) o) = wasm_f64x2_extract_lane(v0_1, 0);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) {
          o = oN;
        }
        *((double*) o) = wasm_f64x2_extract_lane(v0_0, 0);
        o += 4;
        v0_0 = wasm_v64x2_shuffle(v0_0, v0_0, 1, 1);
        v0_1 = wasm_v64x2_shuffle(v0_1, v0_1, 1, 1);
        v0_2 = wasm_v64x2_shuffle(v0_2, v0_2, 1, 1);
        v0_3 = wasm_v64x2_shuffle(v0_3, v0_3, 1, 1);
        v0_4 = wasm_v64x2_shuffle(v0_4, v0_4, 1, 1);
        v0_5 = wasm_v64x2_shuffle(v0_5, v0_5, 1, 1);
        v0_6 = wasm_v64x2_shuffle(v0_6, v0_6, 1, 1);
        v0_7 = wasm_v64x2_shuffle(v0_7, v0_7, 1, 1);
      }

      if (bh & 2) {
        o = (uint16_t*) ((uintptr_t) o + oN_stride);
        *((float*) o) = wasm_f32x4_extract_lane(v0_7, 0);
        uint16_t *oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 7) {
          o = oN;
        }
        *((float*) o) = wasm_f32x4_extract_lane(v0_6, 0);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 7) {
          o = oN;
        }
        *((float*) o) = wasm_f32x4_extract_lane(v0_5, 0);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 5) {
          o = oN;
        }
        *((float*) o) = wasm_f32x4_extract_lane(v0_4, 0);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 5) {
          o = oN;
        }
        *((float*) o) = wasm_f32x4_extract_lane(v0_3, 0);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 3) {
          o = oN;
        }
        *((float*) o) = wasm_f32x4_extract_lane(v0_2, 0);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 3) {
          o = oN;
        }
        *((float*) o) = wasm_f32x4_extract_lane(v0_1, 0);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) {
          o = oN;
        }
        *((float*) o) = wasm_f32x4_extract_lane(v0_0, 0);
        o += 2;
        v0_0 = wasm_u64x2_shr(v0_0, 32);
        v0_1 = wasm_u64x2_shr(v0_1, 32);
        v0_2 = wasm_u64x2_shr(v0_2, 32);
        v0_3 = wasm_u64x2_shr(v0_3, 32);
        v0_4 = wasm_u64x2_shr(v0_4, 32);
        v0_5 = wasm_u64x2_shr(v0_5, 32);
        v0_6 = wasm_u64x2_shr(v0_6, 32);
        v0_7 = wasm_u64x2_shr(v0_7, 32);
      }
      if (bh & 1) {
        o = (uint16_t*) ((uintptr_t) o + oN_stride);
        *o = wasm_i16x8_extract_lane(v0_7, 0);
        uint16_t *oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 7) {
          o = oN;
        }
        *o = wasm_i16x8_extract_lane(v0_6, 0);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 7) {
          o = oN;
        }
        *o = wasm_i16x8_extract_lane(v0_5, 0);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 5) {
          o = oN;
        }
        *o = wasm_i16x8_extract_lane(v0_4, 0);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 5) {
          o = oN;
        }
        *o = wasm_i16x8_extract_lane(v0_3, 0);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 3) {
          o = oN;
        }
        *o = wasm_i16x8_extract_lane(v0_2, 0);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 3) {
          o = oN;
        }
        *o = wasm_i16x8_extract_lane(v0_1, 0);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) {
          o = oN;
        }
        *o = wasm_i16x8_extract_lane(v0_0, 0);
      }
    }

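    // Advance the input pointers to the next tile of 8 input columns, move o to
    // the corresponding output region, and reduce block_width by the tile width
    // (doz() is a saturating, difference-or-zero subtraction).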
    i0 = (const uint16_t*) ((uintptr_t) i0 + input_reset);
    i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride);
    i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride);
    i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride);
    i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride);
    i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride);
    i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride);
    i7 = (const uint16_t*) ((uintptr_t) i6 + input_stride);
    o = (uint16_t*) ((uintptr_t) o + output_reset);
    block_width = doz(block_width, tile_width);
  } while (block_width != 0);
}