// Auto-generated file. Do not edit!
//   Template: src/x32-transposec/sse2.c.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <immintrin.h>

#include <assert.h>

#include <xnnpack/common.h>
#include <xnnpack/math.h>
#include <xnnpack/transpose.h>
#include <xnnpack/unaligned.h>


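// Transposes a block_height x block_width matrix of uint16_t elements in 8x8
// tiles: rows are read from `input` (rows `input_stride` bytes apart) and the
// transposed rows are written to `output` (rows `output_stride` bytes apart).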
void xnn_x16_transposec_ukernel__8x8_reuse_mov_sse2(
    const uint16_t* input,
    uint16_t* output,
    size_t input_stride,
    size_t output_stride,
    size_t block_width,
    size_t block_height) XNN_OOB_READS
{
  assert(output_stride >= block_height * sizeof(uint16_t));
  assert(input_stride >= block_width * sizeof(uint16_t));

  const size_t tile_height = 8;
  const size_t tile_width = 8;
  const size_t tile_hbytes = tile_height * sizeof(uint16_t);
  const size_t tile_wbytes = tile_width * sizeof(uint16_t);
  const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
  const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint16_t) - tile_hbytes;

  const uint16_t* i0 = input;
  uint16_t* o = (uint16_t*) ((uintptr_t) output - tile_hbytes);
  const size_t minus_output_stride = -output_stride;

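  // Each outer iteration handles a strip of up to 8 input columns (one
  // tile_width), producing up to 8 transposed output rows.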
  do {
    const size_t rem = min(block_width - 1, 7);
    const size_t oN_stride = rem * output_stride;
    const size_t oN_offset = oN_stride + tile_hbytes;
    size_t bh = block_height;
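    // Walk down the strip 8 input rows at a time while full 8x8 tiles remain.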
    for (; bh >= 8; bh -= 8) {
      const __m128i v3_0 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v3_1 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v3_2 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v3_3 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v3_4 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v3_5 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v3_6 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v3_7 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);

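      // Three unpack stages (16-bit, 32-bit, then 64-bit interleaves)
      // transpose the 8x8 tile: v0_k ends up holding input column k.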
      const __m128i v2_0 = _mm_unpacklo_epi16(v3_0, v3_1);
      const __m128i v2_1 = _mm_unpackhi_epi16(v3_0, v3_1);
      const __m128i v2_2 = _mm_unpacklo_epi16(v3_2, v3_3);
      const __m128i v2_3 = _mm_unpackhi_epi16(v3_2, v3_3);
      const __m128i v2_4 = _mm_unpacklo_epi16(v3_4, v3_5);
      const __m128i v2_5 = _mm_unpackhi_epi16(v3_4, v3_5);
      const __m128i v2_6 = _mm_unpacklo_epi16(v3_6, v3_7);
      const __m128i v2_7 = _mm_unpackhi_epi16(v3_6, v3_7);

      const __m128i v1_0 = _mm_unpacklo_epi32(v2_0, v2_2);
      const __m128i v1_1 = _mm_unpackhi_epi32(v2_0, v2_2);
      const __m128i v1_2 = _mm_unpacklo_epi32(v2_1, v2_3);
      const __m128i v1_3 = _mm_unpackhi_epi32(v2_1, v2_3);
      const __m128i v1_4 = _mm_unpacklo_epi32(v2_4, v2_6);
      const __m128i v1_5 = _mm_unpackhi_epi32(v2_4, v2_6);
      const __m128i v1_6 = _mm_unpacklo_epi32(v2_5, v2_7);
      const __m128i v1_7 = _mm_unpackhi_epi32(v2_5, v2_7);

      const __m128i v0_0 = _mm_unpacklo_epi64(v1_0, v1_4);
      const __m128i v0_1 = _mm_unpackhi_epi64(v1_0, v1_4);
      const __m128i v0_2 = _mm_unpacklo_epi64(v1_1, v1_5);
      const __m128i v0_3 = _mm_unpackhi_epi64(v1_1, v1_5);
      const __m128i v0_4 = _mm_unpacklo_epi64(v1_2, v1_6);
      const __m128i v0_5 = _mm_unpackhi_epi64(v1_2, v1_6);
      const __m128i v0_6 = _mm_unpacklo_epi64(v1_3, v1_7);
      const __m128i v0_7 = _mm_unpackhi_epi64(v1_3, v1_7);


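      // Store the transposed rows from v0_7 down to v0_0, stepping back one
      // output row at a time; when block_width < 8 the pointer does not
      // advance for the missing rows, so those stores land on the last valid
      // row and are overwritten by later stores.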
      o = (uint16_t*) ((uintptr_t) o + oN_offset);
      _mm_storeu_si128((__m128i*) o, v0_7);
      uint16_t *oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 7) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_6);
      oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 7) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_5);
      oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 5) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_4);
      oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 5) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_3);
      oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 3) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_2);
      oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 3) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_1);
      oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 1) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_0);
    }
    o = (uint16_t*) ((uintptr_t) o + tile_hbytes);
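    // Transpose the remaining 1-7 rows of this strip; pointers for rows past
    // the remaining height are clamped to the previous row.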
    if (bh != 0) {
      const __m128i v3_0 = _mm_loadu_si128((const __m128i*) i0);
      const uint16_t *i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride);
      if XNN_UNPREDICTABLE(bh < 2) {
        i1 = i0;
      }
      const __m128i v3_1 = _mm_loadu_si128((const __m128i*) i1);
      const uint16_t *i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 2) {
        i2 = i1;
      }
      const __m128i v3_2 = _mm_loadu_si128((const __m128i*) i2);
      const uint16_t *i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride);
      if XNN_UNPREDICTABLE(bh < 4) {
        i3 = i2;
      }
      const __m128i v3_3 = _mm_loadu_si128((const __m128i*) i3);
      const uint16_t *i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 4) {
        i4 = i3;
      }
      const __m128i v3_4 = _mm_loadu_si128((const __m128i*) i4);
      const uint16_t *i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride);
      if XNN_UNPREDICTABLE(bh < 6) {
        i5 = i4;
      }
      const __m128i v3_5 = _mm_loadu_si128((const __m128i*) i5);
      const uint16_t *i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 6) {
        i6 = i5;
      }
      const __m128i v3_6 = _mm_loadu_si128((const __m128i*) i6);
      const __m128i v3_7 = _mm_undefined_si128();

      const __m128i v2_0 = _mm_unpacklo_epi16(v3_0, v3_1);
      const __m128i v2_1 = _mm_unpackhi_epi16(v3_0, v3_1);
      const __m128i v2_2 = _mm_unpacklo_epi16(v3_2, v3_3);
      const __m128i v2_3 = _mm_unpackhi_epi16(v3_2, v3_3);
      const __m128i v2_4 = _mm_unpacklo_epi16(v3_4, v3_5);
      const __m128i v2_5 = _mm_unpackhi_epi16(v3_4, v3_5);
      const __m128i v2_6 = _mm_unpacklo_epi16(v3_6, v3_7);
      const __m128i v2_7 = _mm_unpackhi_epi16(v3_6, v3_7);

      const __m128i v1_0 = _mm_unpacklo_epi32(v2_0, v2_2);
      const __m128i v1_1 = _mm_unpackhi_epi32(v2_0, v2_2);
      const __m128i v1_2 = _mm_unpacklo_epi32(v2_1, v2_3);
      const __m128i v1_3 = _mm_unpackhi_epi32(v2_1, v2_3);
      const __m128i v1_4 = _mm_unpacklo_epi32(v2_4, v2_6);
      const __m128i v1_5 = _mm_unpackhi_epi32(v2_4, v2_6);
      const __m128i v1_6 = _mm_unpacklo_epi32(v2_5, v2_7);
      const __m128i v1_7 = _mm_unpackhi_epi32(v2_5, v2_7);

      __m128i v0_0 = _mm_unpacklo_epi64(v1_0, v1_4);
      __m128i v0_1 = _mm_unpackhi_epi64(v1_0, v1_4);
      __m128i v0_2 = _mm_unpacklo_epi64(v1_1, v1_5);
      __m128i v0_3 = _mm_unpackhi_epi64(v1_1, v1_5);
      __m128i v0_4 = _mm_unpacklo_epi64(v1_2, v1_6);
      __m128i v0_5 = _mm_unpackhi_epi64(v1_2, v1_6);
      __m128i v0_6 = _mm_unpacklo_epi64(v1_3, v1_7);
      __m128i v0_7 = _mm_unpackhi_epi64(v1_3, v1_7);


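      // Write out the transposed tail: 4 elements per row with 64-bit stores,
      // then 2 with 32-bit stores, then a final element with a 16-bit store,
      // shifting the remaining elements down within each vector in between.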
      if (bh & 4) {
        o = (uint16_t*) ((uintptr_t) o + oN_stride);
        _mm_storel_epi64((__m128i*) o, v0_7);
        uint16_t *oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 7) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_6);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 7) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_5);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 5) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_4);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 5) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_3);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 3) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_2);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 3) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_1);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_0);
        o += 4;
        v0_0 = _mm_unpackhi_epi64(v0_0, v0_0);
        v0_1 = _mm_unpackhi_epi64(v0_1, v0_1);
        v0_2 = _mm_unpackhi_epi64(v0_2, v0_2);
        v0_3 = _mm_unpackhi_epi64(v0_3, v0_3);
        v0_4 = _mm_unpackhi_epi64(v0_4, v0_4);
        v0_5 = _mm_unpackhi_epi64(v0_5, v0_5);
        v0_6 = _mm_unpackhi_epi64(v0_6, v0_6);
        v0_7 = _mm_unpackhi_epi64(v0_7, v0_7);
      }

      if (bh & 2) {
        o = (uint16_t*) ((uintptr_t) o + oN_stride);
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_7));
        uint16_t *oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 7) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_6));
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 7) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_5));
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 5) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_4));
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 5) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_3));
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 3) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_2));
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 3) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_1));
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_0));
        o += 2;
        v0_0 = _mm_srli_epi64(v0_0, 32);
        v0_1 = _mm_srli_epi64(v0_1, 32);
        v0_2 = _mm_srli_epi64(v0_2, 32);
        v0_3 = _mm_srli_epi64(v0_3, 32);
        v0_4 = _mm_srli_epi64(v0_4, 32);
        v0_5 = _mm_srli_epi64(v0_5, 32);
        v0_6 = _mm_srli_epi64(v0_6, 32);
        v0_7 = _mm_srli_epi64(v0_7, 32);
      }
      if (bh & 1) {
        o = (uint16_t*) ((uintptr_t) o + oN_stride);
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_7));
        uint16_t* oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 7) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_6));
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 7) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_5));
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 5) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_4));
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 5) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_3));
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 3) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_2));
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 3) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_1));
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_0));
      }
    }

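    // Step to the next strip: move the input pointer to the next group of 8
    // columns and reposition the output pointer for the next 8 output rows.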
    i0 = (const uint16_t*) ((uintptr_t) i0 + input_reset);
    o = (uint16_t*) ((uintptr_t) o + output_reset);
    block_width = doz(block_width, tile_width);
  } while (block_width != 0);
}