/* xref: /aosp_15_r20/external/executorch/extension/llm/custom_ops/spinquant/third-party/FFHT/fht_sse.c (revision 523fa7a60841cd1ecfb9cc4201f1ca8b03ed023a) */
#include "fht.h"
static inline void helper_float_1(float *buf);
/*
 * In-place unnormalized Hadamard transform of size 2.
 *
 * Replaces (buf[0], buf[1]) with (buf[0] + buf[1], buf[0] - buf[1]).
 * The generated original expressed this as a pair of single-trip loops;
 * both forms read and write exactly buf[0] and buf[1].
 */
static inline void helper_float_1(float *buf) {
  const float a = buf[0];
  const float b = buf[1];
  buf[0] = a + b;
  buf[1] = a - b;
}
static inline void helper_float_2(float *buf);
/*
 * In-place unnormalized Hadamard transform of size 4 over buf[0..3] (SSE).
 *
 * One xmm register holds all 4 floats.  The stride-1 butterflies
 * (lanes (0,1) and (2,3) -> a+b, a-b) use shufps $160 (duplicate even
 * lanes) / shufps $245 (duplicate odd lanes), a negate via xorps+subps,
 * and addsubps.  The stride-2 butterflies (lanes (0,2) and (1,3)) use
 * shufps $68/$14/$224 with addps/subps.  Requires SSE3 (addsubps) and
 * x86-64 (xmm8-xmm15); buf need not be aligned (movups).
 */
static inline void helper_float_2(float *buf) {
  for (int j = 0; j < 4; j += 4) {
    __asm__ volatile (
      /* load 4 unaligned floats */
      "movups (%0), %%xmm0\n"
      /* stride-1 stage: xmm8 = (a0,a0,a2,a2), xmm9 = (-a1,-a1,-a3,-a3),
         addsubps -> (a0+a1, a0-a1, a2+a3, a2-a3) */
      "movaps %%xmm0, %%xmm8\n"
      "shufps $160, %%xmm8, %%xmm8\n"
      "shufps $245, %%xmm0, %%xmm0\n"
      "xorps %%xmm9, %%xmm9\n"
      "subps %%xmm0, %%xmm9\n"
      "addsubps %%xmm9, %%xmm8\n"
      "movaps %%xmm8, %%xmm0\n"
      /* stride-2 stage: (x0,x1,x0,x1) + (x2,x3,0,0) - (0,0,x2,x3)
         -> (x0+x2, x1+x3, x0-x2, x1-x3) */
      "movaps %%xmm0, %%xmm8\n"
      "shufps $68, %%xmm8, %%xmm8\n"
      "xorps %%xmm9, %%xmm9\n"
      "movaps %%xmm0, %%xmm10\n"
      "shufps $14, %%xmm9, %%xmm10\n"
      "movaps %%xmm0, %%xmm11\n"
      "shufps $224, %%xmm11, %%xmm9\n"
      "addps %%xmm8, %%xmm10\n"
      "subps %%xmm9, %%xmm10\n"
      "movaps %%xmm10, %%xmm0\n"
      /* store result back in place */
      "movups %%xmm0, (%0)\n"
      :: "r"(buf + j) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
    );
  }
}
static inline void helper_float_3(float *buf);
/*
 * In-place unnormalized Hadamard transform of size 8 over buf[0..7] (SSE).
 *
 * The 8 floats are loaded into xmm0/xmm1.  Within each register the
 * stride-1 butterflies use shufps $160/$245 + negate + addsubps, the
 * stride-2 butterflies use shufps $68/$14/$224 + addps/subps, and the
 * final stride-4 stage is a plain addps/subps between the two registers.
 * Requires SSE3 (addsubps) and x86-64 (xmm8-xmm15); buf need not be
 * aligned (movups).
 */
static inline void helper_float_3(float *buf) {
  for (int j = 0; j < 8; j += 8) {
    for (int k = 0; k < 4; k += 4) {
      __asm__ volatile (
        /* load both unaligned 4-float halves */
        "movups (%0), %%xmm0\n"
        "movups (%1), %%xmm1\n"
        /* stride-1 butterflies inside xmm0 */
        "movaps %%xmm0, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm0, %%xmm0\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm0, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm0\n"
        /* stride-1 butterflies inside xmm1 */
        "movaps %%xmm1, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm1, %%xmm1\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm1, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm1\n"
        /* stride-2 butterflies inside xmm0 */
        "movaps %%xmm0, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm0, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm0, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm0\n"
        /* stride-2 butterflies inside xmm1 */
        "movaps %%xmm1, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm1, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm1, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm1\n"
        /* stride-4 stage: xmm8 = x0 + x1, xmm9 = x0 - x1 */
        "movaps %%xmm0, %%xmm8\n"
        "movaps %%xmm0, %%xmm9\n"
        "addps %%xmm1, %%xmm8\n"
        "subps %%xmm1, %%xmm9\n"
        /* store both halves back in place */
        "movups %%xmm8, (%0)\n"
        "movups %%xmm9, (%1)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 4) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
      );
    }
  }
}
static inline void helper_float_4(float *buf);
/*
 * In-place unnormalized Hadamard transform of size 16 over buf[0..15] (SSE).
 *
 * The 16 floats are loaded into xmm0..xmm3 (4 lanes each).  Stages:
 *   stride 1 and stride 2 within each register (shufps/addsubps and
 *   shufps/addps/subps sequences, same as the smaller helpers), then
 *   stride 4 (xmm0<->xmm1, xmm2<->xmm3) and stride 8 (pair results)
 *   as plain addps/subps between registers.
 * Requires SSE3 (addsubps) and x86-64 (xmm8-xmm15); buf need not be
 * aligned (movups).
 */
static inline void helper_float_4(float *buf) {
  for (int j = 0; j < 16; j += 16) {
    for (int k = 0; k < 4; k += 4) {
      __asm__ volatile (
        /* load all four unaligned 4-float quarters */
        "movups (%0), %%xmm0\n"
        "movups (%1), %%xmm1\n"
        "movups (%2), %%xmm2\n"
        "movups (%3), %%xmm3\n"
        /* stride-1 butterflies inside xmm0 */
        "movaps %%xmm0, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm0, %%xmm0\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm0, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm0\n"
        /* stride-1 butterflies inside xmm1 */
        "movaps %%xmm1, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm1, %%xmm1\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm1, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm1\n"
        /* stride-1 butterflies inside xmm2 */
        "movaps %%xmm2, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm2, %%xmm2\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm2, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm2\n"
        /* stride-1 butterflies inside xmm3 */
        "movaps %%xmm3, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm3, %%xmm3\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm3, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm3\n"
        /* stride-2 butterflies inside xmm0 */
        "movaps %%xmm0, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm0, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm0, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm0\n"
        /* stride-2 butterflies inside xmm1 */
        "movaps %%xmm1, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm1, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm1, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm1\n"
        /* stride-2 butterflies inside xmm2 */
        "movaps %%xmm2, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm2, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm2, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm2\n"
        /* stride-2 butterflies inside xmm3 */
        "movaps %%xmm3, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm3, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm3, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm3\n"
        /* stride-4 stage: (xmm8,xmm9) = x0 +/- x1, (xmm10,xmm11) = x2 +/- x3 */
        "movaps %%xmm0, %%xmm8\n"
        "movaps %%xmm0, %%xmm9\n"
        "addps %%xmm1, %%xmm8\n"
        "subps %%xmm1, %%xmm9\n"
        "movaps %%xmm2, %%xmm10\n"
        "movaps %%xmm2, %%xmm11\n"
        "addps %%xmm3, %%xmm10\n"
        "subps %%xmm3, %%xmm11\n"
        /* stride-8 stage: recombine the stride-4 results */
        "movaps %%xmm8, %%xmm0\n"
        "movaps %%xmm8, %%xmm2\n"
        "addps %%xmm10, %%xmm0\n"
        "subps %%xmm10, %%xmm2\n"
        "movaps %%xmm9, %%xmm1\n"
        "movaps %%xmm9, %%xmm3\n"
        "addps %%xmm11, %%xmm1\n"
        "subps %%xmm11, %%xmm3\n"
        /* store all four quarters back in place */
        "movups %%xmm0, (%0)\n"
        "movups %%xmm1, (%1)\n"
        "movups %%xmm2, (%2)\n"
        "movups %%xmm3, (%3)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
      );
    }
  }
}
static inline void helper_float_5(float *buf);
/*
 * In-place unnormalized Hadamard transform of size 32 over buf[0..31] (SSE).
 *
 * All 32 floats are loaded into xmm0..xmm7 (4 lanes each).  Stages:
 *   stride 1 and stride 2 within each register (shufps/addsubps and
 *   shufps/addps/subps sequences), then stride 4, stride 8 and
 *   stride 16 as plain addps/subps between registers, using xmm8-xmm15
 *   as temporaries.
 * Requires SSE3 (addsubps) and x86-64 (xmm8-xmm15); buf need not be
 * aligned (movups).
 */
static inline void helper_float_5(float *buf) {
  for (int j = 0; j < 32; j += 32) {
    for (int k = 0; k < 4; k += 4) {
      __asm__ volatile (
        /* load all eight unaligned 4-float groups */
        "movups (%0), %%xmm0\n"
        "movups (%1), %%xmm1\n"
        "movups (%2), %%xmm2\n"
        "movups (%3), %%xmm3\n"
        "movups (%4), %%xmm4\n"
        "movups (%5), %%xmm5\n"
        "movups (%6), %%xmm6\n"
        "movups (%7), %%xmm7\n"
        /* stride-1 butterflies inside each of xmm0..xmm7 */
        "movaps %%xmm0, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm0, %%xmm0\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm0, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm0\n"
        "movaps %%xmm1, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm1, %%xmm1\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm1, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm1\n"
        "movaps %%xmm2, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm2, %%xmm2\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm2, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm2\n"
        "movaps %%xmm3, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm3, %%xmm3\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm3, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm3\n"
        "movaps %%xmm4, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm4, %%xmm4\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm4, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm4\n"
        "movaps %%xmm5, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm5, %%xmm5\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm5, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm5\n"
        "movaps %%xmm6, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm6, %%xmm6\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm6, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm6\n"
        "movaps %%xmm7, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm7, %%xmm7\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm7, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm7\n"
        /* stride-2 butterflies inside each of xmm0..xmm7 */
        "movaps %%xmm0, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm0, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm0, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm0\n"
        "movaps %%xmm1, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm1, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm1, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm1\n"
        "movaps %%xmm2, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm2, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm2, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm2\n"
        "movaps %%xmm3, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm3, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm3, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm3\n"
        "movaps %%xmm4, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm4, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm4, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm4\n"
        "movaps %%xmm5, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm5, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm5, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm5\n"
        "movaps %%xmm6, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm6, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm6, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm6\n"
        "movaps %%xmm7, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm7, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm7, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm7\n"
        /* stride-4 stage: register pairs (0,1) (2,3) (4,5) (6,7) */
        "movaps %%xmm0, %%xmm8\n"
        "movaps %%xmm0, %%xmm9\n"
        "addps %%xmm1, %%xmm8\n"
        "subps %%xmm1, %%xmm9\n"
        "movaps %%xmm2, %%xmm10\n"
        "movaps %%xmm2, %%xmm11\n"
        "addps %%xmm3, %%xmm10\n"
        "subps %%xmm3, %%xmm11\n"
        "movaps %%xmm4, %%xmm12\n"
        "movaps %%xmm4, %%xmm13\n"
        "addps %%xmm5, %%xmm12\n"
        "subps %%xmm5, %%xmm13\n"
        "movaps %%xmm6, %%xmm14\n"
        "movaps %%xmm6, %%xmm15\n"
        "addps %%xmm7, %%xmm14\n"
        "subps %%xmm7, %%xmm15\n"
        /* stride-8 stage: recombine the stride-4 results */
        "movaps %%xmm8, %%xmm0\n"
        "movaps %%xmm8, %%xmm2\n"
        "addps %%xmm10, %%xmm0\n"
        "subps %%xmm10, %%xmm2\n"
        "movaps %%xmm9, %%xmm1\n"
        "movaps %%xmm9, %%xmm3\n"
        "addps %%xmm11, %%xmm1\n"
        "subps %%xmm11, %%xmm3\n"
        "movaps %%xmm12, %%xmm4\n"
        "movaps %%xmm12, %%xmm6\n"
        "addps %%xmm14, %%xmm4\n"
        "subps %%xmm14, %%xmm6\n"
        "movaps %%xmm13, %%xmm5\n"
        "movaps %%xmm13, %%xmm7\n"
        "addps %%xmm15, %%xmm5\n"
        "subps %%xmm15, %%xmm7\n"
        /* stride-16 stage: combine the two 16-float halves */
        "movaps %%xmm0, %%xmm8\n"
        "movaps %%xmm0, %%xmm12\n"
        "addps %%xmm4, %%xmm8\n"
        "subps %%xmm4, %%xmm12\n"
        "movaps %%xmm1, %%xmm9\n"
        "movaps %%xmm1, %%xmm13\n"
        "addps %%xmm5, %%xmm9\n"
        "subps %%xmm5, %%xmm13\n"
        "movaps %%xmm2, %%xmm10\n"
        "movaps %%xmm2, %%xmm14\n"
        "addps %%xmm6, %%xmm10\n"
        "subps %%xmm6, %%xmm14\n"
        "movaps %%xmm3, %%xmm11\n"
        "movaps %%xmm3, %%xmm15\n"
        "addps %%xmm7, %%xmm11\n"
        "subps %%xmm7, %%xmm15\n"
        /* store all eight groups back in place */
        "movups %%xmm8, (%0)\n"
        "movups %%xmm9, (%1)\n"
        "movups %%xmm10, (%2)\n"
        "movups %%xmm11, (%3)\n"
        "movups %%xmm12, (%4)\n"
        "movups %%xmm13, (%5)\n"
        "movups %%xmm14, (%6)\n"
        "movups %%xmm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
      );
    }
  }
}
static inline void helper_float_6(float *buf);
/*
 * In-place unnormalized Hadamard transform of size 64 over buf[0..63] (SSE).
 *
 * First pass: the 32-point kernel (same asm as helper_float_5 — strides
 * 1 and 2 within each xmm register, then 4, 8, 16 across registers) is
 * applied independently to each 32-float half of the buffer.
 * Second pass: stride-32 butterflies combine the two halves four floats
 * at a time with plain addps/subps.
 * Requires SSE3 (addsubps) and x86-64 (xmm8-xmm15); buf need not be
 * aligned (movups).
 */
static inline void helper_float_6(float *buf) {
  /* pass 1: 32-point transform on each half (j = 0 and j = 32) */
  for (int j = 0; j < 64; j += 32) {
    for (int k = 0; k < 4; k += 4) {
      __asm__ volatile (
        /* load all eight unaligned 4-float groups of this half */
        "movups (%0), %%xmm0\n"
        "movups (%1), %%xmm1\n"
        "movups (%2), %%xmm2\n"
        "movups (%3), %%xmm3\n"
        "movups (%4), %%xmm4\n"
        "movups (%5), %%xmm5\n"
        "movups (%6), %%xmm6\n"
        "movups (%7), %%xmm7\n"
        /* stride-1 butterflies inside each of xmm0..xmm7 */
        "movaps %%xmm0, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm0, %%xmm0\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm0, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm0\n"
        "movaps %%xmm1, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm1, %%xmm1\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm1, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm1\n"
        "movaps %%xmm2, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm2, %%xmm2\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm2, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm2\n"
        "movaps %%xmm3, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm3, %%xmm3\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm3, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm3\n"
        "movaps %%xmm4, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm4, %%xmm4\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm4, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm4\n"
        "movaps %%xmm5, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm5, %%xmm5\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm5, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm5\n"
        "movaps %%xmm6, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm6, %%xmm6\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm6, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm6\n"
        "movaps %%xmm7, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm7, %%xmm7\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm7, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm7\n"
        /* stride-2 butterflies inside each of xmm0..xmm7 */
        "movaps %%xmm0, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm0, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm0, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm0\n"
        "movaps %%xmm1, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm1, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm1, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm1\n"
        "movaps %%xmm2, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm2, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm2, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm2\n"
        "movaps %%xmm3, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm3, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm3, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm3\n"
        "movaps %%xmm4, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm4, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm4, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm4\n"
        "movaps %%xmm5, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm5, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm5, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm5\n"
        "movaps %%xmm6, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm6, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm6, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm6\n"
        "movaps %%xmm7, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm7, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm7, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm7\n"
        /* stride-4 stage: register pairs (0,1) (2,3) (4,5) (6,7) */
        "movaps %%xmm0, %%xmm8\n"
        "movaps %%xmm0, %%xmm9\n"
        "addps %%xmm1, %%xmm8\n"
        "subps %%xmm1, %%xmm9\n"
        "movaps %%xmm2, %%xmm10\n"
        "movaps %%xmm2, %%xmm11\n"
        "addps %%xmm3, %%xmm10\n"
        "subps %%xmm3, %%xmm11\n"
        "movaps %%xmm4, %%xmm12\n"
        "movaps %%xmm4, %%xmm13\n"
        "addps %%xmm5, %%xmm12\n"
        "subps %%xmm5, %%xmm13\n"
        "movaps %%xmm6, %%xmm14\n"
        "movaps %%xmm6, %%xmm15\n"
        "addps %%xmm7, %%xmm14\n"
        "subps %%xmm7, %%xmm15\n"
        /* stride-8 stage: recombine the stride-4 results */
        "movaps %%xmm8, %%xmm0\n"
        "movaps %%xmm8, %%xmm2\n"
        "addps %%xmm10, %%xmm0\n"
        "subps %%xmm10, %%xmm2\n"
        "movaps %%xmm9, %%xmm1\n"
        "movaps %%xmm9, %%xmm3\n"
        "addps %%xmm11, %%xmm1\n"
        "subps %%xmm11, %%xmm3\n"
        "movaps %%xmm12, %%xmm4\n"
        "movaps %%xmm12, %%xmm6\n"
        "addps %%xmm14, %%xmm4\n"
        "subps %%xmm14, %%xmm6\n"
        "movaps %%xmm13, %%xmm5\n"
        "movaps %%xmm13, %%xmm7\n"
        "addps %%xmm15, %%xmm5\n"
        "subps %%xmm15, %%xmm7\n"
        /* stride-16 stage: combine the two 16-float quarters */
        "movaps %%xmm0, %%xmm8\n"
        "movaps %%xmm0, %%xmm12\n"
        "addps %%xmm4, %%xmm8\n"
        "subps %%xmm4, %%xmm12\n"
        "movaps %%xmm1, %%xmm9\n"
        "movaps %%xmm1, %%xmm13\n"
        "addps %%xmm5, %%xmm9\n"
        "subps %%xmm5, %%xmm13\n"
        "movaps %%xmm2, %%xmm10\n"
        "movaps %%xmm2, %%xmm14\n"
        "addps %%xmm6, %%xmm10\n"
        "subps %%xmm6, %%xmm14\n"
        "movaps %%xmm3, %%xmm11\n"
        "movaps %%xmm3, %%xmm15\n"
        "addps %%xmm7, %%xmm11\n"
        "subps %%xmm7, %%xmm15\n"
        /* store all eight groups back in place */
        "movups %%xmm8, (%0)\n"
        "movups %%xmm9, (%1)\n"
        "movups %%xmm10, (%2)\n"
        "movups %%xmm11, (%3)\n"
        "movups %%xmm12, (%4)\n"
        "movups %%xmm13, (%5)\n"
        "movups %%xmm14, (%6)\n"
        "movups %%xmm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
      );
    }
  }
  /* pass 2: stride-32 butterflies between buf[k] and buf[k+32] */
  for (int j = 0; j < 64; j += 64) {
    for (int k = 0; k < 32; k += 4) {
      __asm__ volatile (
        "movups (%0), %%xmm0\n"
        "movups (%1), %%xmm1\n"
        /* xmm8 = lo + hi, xmm9 = lo - hi */
        "movaps %%xmm0, %%xmm8\n"
        "movaps %%xmm0, %%xmm9\n"
        "addps %%xmm1, %%xmm8\n"
        "subps %%xmm1, %%xmm9\n"
        "movups %%xmm8, (%0)\n"
        "movups %%xmm9, (%1)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 32) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
      );
    }
  }
}
629 void helper_float_7_recursive(float *buf, int depth);
helper_float_7_recursive(float * buf,int depth)630 void helper_float_7_recursive(float *buf, int depth) {
631   if (depth == 7) {
632     for (int j = 0; j < 128; j += 32) {
633       for (int k = 0; k < 4; k += 4) {
634         __asm__ volatile (
635           "movups (%0), %%xmm0\n"
636           "movups (%1), %%xmm1\n"
637           "movups (%2), %%xmm2\n"
638           "movups (%3), %%xmm3\n"
639           "movups (%4), %%xmm4\n"
640           "movups (%5), %%xmm5\n"
641           "movups (%6), %%xmm6\n"
642           "movups (%7), %%xmm7\n"
643           "movaps %%xmm0, %%xmm8\n"
644           "shufps $160, %%xmm8, %%xmm8\n"
645           "shufps $245, %%xmm0, %%xmm0\n"
646           "xorps %%xmm9, %%xmm9\n"
647           "subps %%xmm0, %%xmm9\n"
648           "addsubps %%xmm9, %%xmm8\n"
649           "movaps %%xmm8, %%xmm0\n"
650           "movaps %%xmm1, %%xmm8\n"
651           "shufps $160, %%xmm8, %%xmm8\n"
652           "shufps $245, %%xmm1, %%xmm1\n"
653           "xorps %%xmm9, %%xmm9\n"
654           "subps %%xmm1, %%xmm9\n"
655           "addsubps %%xmm9, %%xmm8\n"
656           "movaps %%xmm8, %%xmm1\n"
657           "movaps %%xmm2, %%xmm8\n"
658           "shufps $160, %%xmm8, %%xmm8\n"
659           "shufps $245, %%xmm2, %%xmm2\n"
660           "xorps %%xmm9, %%xmm9\n"
661           "subps %%xmm2, %%xmm9\n"
662           "addsubps %%xmm9, %%xmm8\n"
663           "movaps %%xmm8, %%xmm2\n"
664           "movaps %%xmm3, %%xmm8\n"
665           "shufps $160, %%xmm8, %%xmm8\n"
666           "shufps $245, %%xmm3, %%xmm3\n"
667           "xorps %%xmm9, %%xmm9\n"
668           "subps %%xmm3, %%xmm9\n"
669           "addsubps %%xmm9, %%xmm8\n"
670           "movaps %%xmm8, %%xmm3\n"
671           "movaps %%xmm4, %%xmm8\n"
672           "shufps $160, %%xmm8, %%xmm8\n"
673           "shufps $245, %%xmm4, %%xmm4\n"
674           "xorps %%xmm9, %%xmm9\n"
675           "subps %%xmm4, %%xmm9\n"
676           "addsubps %%xmm9, %%xmm8\n"
677           "movaps %%xmm8, %%xmm4\n"
678           "movaps %%xmm5, %%xmm8\n"
679           "shufps $160, %%xmm8, %%xmm8\n"
680           "shufps $245, %%xmm5, %%xmm5\n"
681           "xorps %%xmm9, %%xmm9\n"
682           "subps %%xmm5, %%xmm9\n"
683           "addsubps %%xmm9, %%xmm8\n"
684           "movaps %%xmm8, %%xmm5\n"
685           "movaps %%xmm6, %%xmm8\n"
686           "shufps $160, %%xmm8, %%xmm8\n"
687           "shufps $245, %%xmm6, %%xmm6\n"
688           "xorps %%xmm9, %%xmm9\n"
689           "subps %%xmm6, %%xmm9\n"
690           "addsubps %%xmm9, %%xmm8\n"
691           "movaps %%xmm8, %%xmm6\n"
692           "movaps %%xmm7, %%xmm8\n"
693           "shufps $160, %%xmm8, %%xmm8\n"
694           "shufps $245, %%xmm7, %%xmm7\n"
695           "xorps %%xmm9, %%xmm9\n"
696           "subps %%xmm7, %%xmm9\n"
697           "addsubps %%xmm9, %%xmm8\n"
698           "movaps %%xmm8, %%xmm7\n"
699           "movaps %%xmm0, %%xmm8\n"
700           "shufps $68, %%xmm8, %%xmm8\n"
701           "xorps %%xmm9, %%xmm9\n"
702           "movaps %%xmm0, %%xmm10\n"
703           "shufps $14, %%xmm9, %%xmm10\n"
704           "movaps %%xmm0, %%xmm11\n"
705           "shufps $224, %%xmm11, %%xmm9\n"
706           "addps %%xmm8, %%xmm10\n"
707           "subps %%xmm9, %%xmm10\n"
708           "movaps %%xmm10, %%xmm0\n"
709           "movaps %%xmm1, %%xmm8\n"
710           "shufps $68, %%xmm8, %%xmm8\n"
711           "xorps %%xmm9, %%xmm9\n"
712           "movaps %%xmm1, %%xmm10\n"
713           "shufps $14, %%xmm9, %%xmm10\n"
714           "movaps %%xmm1, %%xmm11\n"
715           "shufps $224, %%xmm11, %%xmm9\n"
716           "addps %%xmm8, %%xmm10\n"
717           "subps %%xmm9, %%xmm10\n"
718           "movaps %%xmm10, %%xmm1\n"
719           "movaps %%xmm2, %%xmm8\n"
720           "shufps $68, %%xmm8, %%xmm8\n"
721           "xorps %%xmm9, %%xmm9\n"
722           "movaps %%xmm2, %%xmm10\n"
723           "shufps $14, %%xmm9, %%xmm10\n"
724           "movaps %%xmm2, %%xmm11\n"
725           "shufps $224, %%xmm11, %%xmm9\n"
726           "addps %%xmm8, %%xmm10\n"
727           "subps %%xmm9, %%xmm10\n"
728           "movaps %%xmm10, %%xmm2\n"
729           "movaps %%xmm3, %%xmm8\n"
730           "shufps $68, %%xmm8, %%xmm8\n"
731           "xorps %%xmm9, %%xmm9\n"
732           "movaps %%xmm3, %%xmm10\n"
733           "shufps $14, %%xmm9, %%xmm10\n"
734           "movaps %%xmm3, %%xmm11\n"
735           "shufps $224, %%xmm11, %%xmm9\n"
736           "addps %%xmm8, %%xmm10\n"
737           "subps %%xmm9, %%xmm10\n"
738           "movaps %%xmm10, %%xmm3\n"
739           "movaps %%xmm4, %%xmm8\n"
740           "shufps $68, %%xmm8, %%xmm8\n"
741           "xorps %%xmm9, %%xmm9\n"
742           "movaps %%xmm4, %%xmm10\n"
743           "shufps $14, %%xmm9, %%xmm10\n"
744           "movaps %%xmm4, %%xmm11\n"
745           "shufps $224, %%xmm11, %%xmm9\n"
746           "addps %%xmm8, %%xmm10\n"
747           "subps %%xmm9, %%xmm10\n"
748           "movaps %%xmm10, %%xmm4\n"
749           "movaps %%xmm5, %%xmm8\n"
750           "shufps $68, %%xmm8, %%xmm8\n"
751           "xorps %%xmm9, %%xmm9\n"
752           "movaps %%xmm5, %%xmm10\n"
753           "shufps $14, %%xmm9, %%xmm10\n"
754           "movaps %%xmm5, %%xmm11\n"
755           "shufps $224, %%xmm11, %%xmm9\n"
756           "addps %%xmm8, %%xmm10\n"
757           "subps %%xmm9, %%xmm10\n"
758           "movaps %%xmm10, %%xmm5\n"
759           "movaps %%xmm6, %%xmm8\n"
760           "shufps $68, %%xmm8, %%xmm8\n"
761           "xorps %%xmm9, %%xmm9\n"
762           "movaps %%xmm6, %%xmm10\n"
763           "shufps $14, %%xmm9, %%xmm10\n"
764           "movaps %%xmm6, %%xmm11\n"
765           "shufps $224, %%xmm11, %%xmm9\n"
766           "addps %%xmm8, %%xmm10\n"
767           "subps %%xmm9, %%xmm10\n"
768           "movaps %%xmm10, %%xmm6\n"
769           "movaps %%xmm7, %%xmm8\n"
770           "shufps $68, %%xmm8, %%xmm8\n"
771           "xorps %%xmm9, %%xmm9\n"
772           "movaps %%xmm7, %%xmm10\n"
773           "shufps $14, %%xmm9, %%xmm10\n"
774           "movaps %%xmm7, %%xmm11\n"
775           "shufps $224, %%xmm11, %%xmm9\n"
776           "addps %%xmm8, %%xmm10\n"
777           "subps %%xmm9, %%xmm10\n"
778           "movaps %%xmm10, %%xmm7\n"
779           "movaps %%xmm0, %%xmm8\n"
780           "movaps %%xmm0, %%xmm9\n"
781           "addps %%xmm1, %%xmm8\n"
782           "subps %%xmm1, %%xmm9\n"
783           "movaps %%xmm2, %%xmm10\n"
784           "movaps %%xmm2, %%xmm11\n"
785           "addps %%xmm3, %%xmm10\n"
786           "subps %%xmm3, %%xmm11\n"
787           "movaps %%xmm4, %%xmm12\n"
788           "movaps %%xmm4, %%xmm13\n"
789           "addps %%xmm5, %%xmm12\n"
790           "subps %%xmm5, %%xmm13\n"
791           "movaps %%xmm6, %%xmm14\n"
792           "movaps %%xmm6, %%xmm15\n"
793           "addps %%xmm7, %%xmm14\n"
794           "subps %%xmm7, %%xmm15\n"
795           "movaps %%xmm8, %%xmm0\n"
796           "movaps %%xmm8, %%xmm2\n"
797           "addps %%xmm10, %%xmm0\n"
798           "subps %%xmm10, %%xmm2\n"
799           "movaps %%xmm9, %%xmm1\n"
800           "movaps %%xmm9, %%xmm3\n"
801           "addps %%xmm11, %%xmm1\n"
802           "subps %%xmm11, %%xmm3\n"
803           "movaps %%xmm12, %%xmm4\n"
804           "movaps %%xmm12, %%xmm6\n"
805           "addps %%xmm14, %%xmm4\n"
806           "subps %%xmm14, %%xmm6\n"
807           "movaps %%xmm13, %%xmm5\n"
808           "movaps %%xmm13, %%xmm7\n"
809           "addps %%xmm15, %%xmm5\n"
810           "subps %%xmm15, %%xmm7\n"
811           "movaps %%xmm0, %%xmm8\n"
812           "movaps %%xmm0, %%xmm12\n"
813           "addps %%xmm4, %%xmm8\n"
814           "subps %%xmm4, %%xmm12\n"
815           "movaps %%xmm1, %%xmm9\n"
816           "movaps %%xmm1, %%xmm13\n"
817           "addps %%xmm5, %%xmm9\n"
818           "subps %%xmm5, %%xmm13\n"
819           "movaps %%xmm2, %%xmm10\n"
820           "movaps %%xmm2, %%xmm14\n"
821           "addps %%xmm6, %%xmm10\n"
822           "subps %%xmm6, %%xmm14\n"
823           "movaps %%xmm3, %%xmm11\n"
824           "movaps %%xmm3, %%xmm15\n"
825           "addps %%xmm7, %%xmm11\n"
826           "subps %%xmm7, %%xmm15\n"
827           "movups %%xmm8, (%0)\n"
828           "movups %%xmm9, (%1)\n"
829           "movups %%xmm10, (%2)\n"
830           "movups %%xmm11, (%3)\n"
831           "movups %%xmm12, (%4)\n"
832           "movups %%xmm13, (%5)\n"
833           "movups %%xmm14, (%6)\n"
834           "movups %%xmm15, (%7)\n"
835           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
836         );
837       }
838     }
839     for (int j = 0; j < 128; j += 128) {
840       for (int k = 0; k < 32; k += 4) {
841         __asm__ volatile (
842           "movups (%0), %%xmm0\n"
843           "movups (%1), %%xmm1\n"
844           "movups (%2), %%xmm2\n"
845           "movups (%3), %%xmm3\n"
846           "movaps %%xmm0, %%xmm8\n"
847           "movaps %%xmm0, %%xmm9\n"
848           "addps %%xmm1, %%xmm8\n"
849           "subps %%xmm1, %%xmm9\n"
850           "movaps %%xmm2, %%xmm10\n"
851           "movaps %%xmm2, %%xmm11\n"
852           "addps %%xmm3, %%xmm10\n"
853           "subps %%xmm3, %%xmm11\n"
854           "movaps %%xmm8, %%xmm0\n"
855           "movaps %%xmm8, %%xmm2\n"
856           "addps %%xmm10, %%xmm0\n"
857           "subps %%xmm10, %%xmm2\n"
858           "movaps %%xmm9, %%xmm1\n"
859           "movaps %%xmm9, %%xmm3\n"
860           "addps %%xmm11, %%xmm1\n"
861           "subps %%xmm11, %%xmm3\n"
862           "movups %%xmm0, (%0)\n"
863           "movups %%xmm1, (%1)\n"
864           "movups %%xmm2, (%2)\n"
865           "movups %%xmm3, (%3)\n"
866           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
867         );
868       }
869     }
870     return;
871   }
872 }
void helper_float_7(float *buf);
/*
 * helper_float_7: in-place fast Hadamard transform of 128 (= 2^7) floats.
 *
 * The generator emits this size through the recursive driver instead of a
 * fully unrolled kernel; forward to it with the log2 of the length.
 */
void helper_float_7(float *buf) {
  const int log_n = 7; /* transform length is 1 << log_n = 128 floats */
  helper_float_7_recursive(buf, log_n);
}
static inline void helper_float_8(float *buf);
/*
 * helper_float_8: in-place fast Hadamard transform of 256 (= 2^8) floats.
 *
 * Machine-generated SSE kernel.  The inline asm relies on an exact register
 * schedule, so the code is left byte-identical and only comments are added.
 * buf is accessed with unaligned 16-byte loads/stores (movups), so only
 * float alignment is required; addsubps needs SSE3 at run time.
 */
static inline void helper_float_8(float *buf) {
  /* Pass 1: for each 32-float chunk, do the butterfly stages of stride 1
   * and 2 inside each 4-lane vector, then strides 4, 8 and 16 across the
   * eight registers that hold the chunk. */
  for (int j = 0; j < 256; j += 32) {
    for (int k = 0; k < 4; k += 4) { /* single iteration; generator artifact */
      __asm__ volatile (
        /* Load the 32-float chunk into xmm0..xmm7. */
        "movups (%0), %%xmm0\n"
        "movups (%1), %%xmm1\n"
        "movups (%2), %%xmm2\n"
        "movups (%3), %%xmm3\n"
        "movups (%4), %%xmm4\n"
        "movups (%5), %%xmm5\n"
        "movups (%6), %%xmm6\n"
        "movups (%7), %%xmm7\n"
        /* Stage 1 (stride 1), per vector: (a0,a1,a2,a3) ->
         * (a0+a1, a0-a1, a2+a3, a2-a3).  shufps $160 duplicates the even
         * lanes, shufps $245 the odd lanes; the odd copy is negated and
         * addsubps then forms the sum/difference pairs. */
        "movaps %%xmm0, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm0, %%xmm0\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm0, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm0\n"
        "movaps %%xmm1, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm1, %%xmm1\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm1, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm1\n"
        "movaps %%xmm2, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm2, %%xmm2\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm2, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm2\n"
        "movaps %%xmm3, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm3, %%xmm3\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm3, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm3\n"
        "movaps %%xmm4, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm4, %%xmm4\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm4, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm4\n"
        "movaps %%xmm5, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm5, %%xmm5\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm5, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm5\n"
        "movaps %%xmm6, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm6, %%xmm6\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm6, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm6\n"
        "movaps %%xmm7, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm7, %%xmm7\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm7, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm7\n"
        /* Stage 2 (stride 2), per vector: (a0,a1,a2,a3) ->
         * (a0+a2, a1+a3, a0-a2, a1-a3).  shufps $68 duplicates the low
         * half; $14 and $224 build (a2,a3,0,0) and (0,0,a2,a3) against the
         * zeroed xmm9, so one addps + one subps realizes the butterfly. */
        "movaps %%xmm0, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm0, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm0, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm0\n"
        "movaps %%xmm1, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm1, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm1, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm1\n"
        "movaps %%xmm2, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm2, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm2, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm2\n"
        "movaps %%xmm3, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm3, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm3, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm3\n"
        "movaps %%xmm4, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm4, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm4, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm4\n"
        "movaps %%xmm5, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm5, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm5, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm5\n"
        "movaps %%xmm6, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm6, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm6, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm6\n"
        "movaps %%xmm7, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm7, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm7, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm7\n"
        /* Stages 3-5 (strides 4, 8, 16): radix-8 add/sub butterfly network
         * across the eight vector registers xmm0..xmm7. */
        "movaps %%xmm0, %%xmm8\n"
        "movaps %%xmm0, %%xmm9\n"
        "addps %%xmm1, %%xmm8\n"
        "subps %%xmm1, %%xmm9\n"
        "movaps %%xmm2, %%xmm10\n"
        "movaps %%xmm2, %%xmm11\n"
        "addps %%xmm3, %%xmm10\n"
        "subps %%xmm3, %%xmm11\n"
        "movaps %%xmm4, %%xmm12\n"
        "movaps %%xmm4, %%xmm13\n"
        "addps %%xmm5, %%xmm12\n"
        "subps %%xmm5, %%xmm13\n"
        "movaps %%xmm6, %%xmm14\n"
        "movaps %%xmm6, %%xmm15\n"
        "addps %%xmm7, %%xmm14\n"
        "subps %%xmm7, %%xmm15\n"
        "movaps %%xmm8, %%xmm0\n"
        "movaps %%xmm8, %%xmm2\n"
        "addps %%xmm10, %%xmm0\n"
        "subps %%xmm10, %%xmm2\n"
        "movaps %%xmm9, %%xmm1\n"
        "movaps %%xmm9, %%xmm3\n"
        "addps %%xmm11, %%xmm1\n"
        "subps %%xmm11, %%xmm3\n"
        "movaps %%xmm12, %%xmm4\n"
        "movaps %%xmm12, %%xmm6\n"
        "addps %%xmm14, %%xmm4\n"
        "subps %%xmm14, %%xmm6\n"
        "movaps %%xmm13, %%xmm5\n"
        "movaps %%xmm13, %%xmm7\n"
        "addps %%xmm15, %%xmm5\n"
        "subps %%xmm15, %%xmm7\n"
        "movaps %%xmm0, %%xmm8\n"
        "movaps %%xmm0, %%xmm12\n"
        "addps %%xmm4, %%xmm8\n"
        "subps %%xmm4, %%xmm12\n"
        "movaps %%xmm1, %%xmm9\n"
        "movaps %%xmm1, %%xmm13\n"
        "addps %%xmm5, %%xmm9\n"
        "subps %%xmm5, %%xmm13\n"
        "movaps %%xmm2, %%xmm10\n"
        "movaps %%xmm2, %%xmm14\n"
        "addps %%xmm6, %%xmm10\n"
        "subps %%xmm6, %%xmm14\n"
        "movaps %%xmm3, %%xmm11\n"
        "movaps %%xmm3, %%xmm15\n"
        "addps %%xmm7, %%xmm11\n"
        "subps %%xmm7, %%xmm15\n"
        /* Write the transformed chunk back in place. */
        "movups %%xmm8, (%0)\n"
        "movups %%xmm9, (%1)\n"
        "movups %%xmm10, (%2)\n"
        "movups %%xmm11, (%3)\n"
        "movups %%xmm12, (%4)\n"
        "movups %%xmm13, (%5)\n"
        "movups %%xmm14, (%6)\n"
        "movups %%xmm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
      );
    }
  }
  /* Pass 2 (stages 6-8): one radix-8 pass combining the eight 32-float
   * chunks; the operand pointers are spaced 32 floats apart. */
  for (int j = 0; j < 256; j += 256) {
    for (int k = 0; k < 32; k += 4) {
      __asm__ volatile (
        /* Load one 4-float vector from each of the eight chunks. */
        "movups (%0), %%xmm0\n"
        "movups (%1), %%xmm1\n"
        "movups (%2), %%xmm2\n"
        "movups (%3), %%xmm3\n"
        "movups (%4), %%xmm4\n"
        "movups (%5), %%xmm5\n"
        "movups (%6), %%xmm6\n"
        "movups (%7), %%xmm7\n"
        /* Radix-8 add/sub butterfly network across the eight vectors. */
        "movaps %%xmm0, %%xmm8\n"
        "movaps %%xmm0, %%xmm9\n"
        "addps %%xmm1, %%xmm8\n"
        "subps %%xmm1, %%xmm9\n"
        "movaps %%xmm2, %%xmm10\n"
        "movaps %%xmm2, %%xmm11\n"
        "addps %%xmm3, %%xmm10\n"
        "subps %%xmm3, %%xmm11\n"
        "movaps %%xmm4, %%xmm12\n"
        "movaps %%xmm4, %%xmm13\n"
        "addps %%xmm5, %%xmm12\n"
        "subps %%xmm5, %%xmm13\n"
        "movaps %%xmm6, %%xmm14\n"
        "movaps %%xmm6, %%xmm15\n"
        "addps %%xmm7, %%xmm14\n"
        "subps %%xmm7, %%xmm15\n"
        "movaps %%xmm8, %%xmm0\n"
        "movaps %%xmm8, %%xmm2\n"
        "addps %%xmm10, %%xmm0\n"
        "subps %%xmm10, %%xmm2\n"
        "movaps %%xmm9, %%xmm1\n"
        "movaps %%xmm9, %%xmm3\n"
        "addps %%xmm11, %%xmm1\n"
        "subps %%xmm11, %%xmm3\n"
        "movaps %%xmm12, %%xmm4\n"
        "movaps %%xmm12, %%xmm6\n"
        "addps %%xmm14, %%xmm4\n"
        "subps %%xmm14, %%xmm6\n"
        "movaps %%xmm13, %%xmm5\n"
        "movaps %%xmm13, %%xmm7\n"
        "addps %%xmm15, %%xmm5\n"
        "subps %%xmm15, %%xmm7\n"
        "movaps %%xmm0, %%xmm8\n"
        "movaps %%xmm0, %%xmm12\n"
        "addps %%xmm4, %%xmm8\n"
        "subps %%xmm4, %%xmm12\n"
        "movaps %%xmm1, %%xmm9\n"
        "movaps %%xmm1, %%xmm13\n"
        "addps %%xmm5, %%xmm9\n"
        "subps %%xmm5, %%xmm13\n"
        "movaps %%xmm2, %%xmm10\n"
        "movaps %%xmm2, %%xmm14\n"
        "addps %%xmm6, %%xmm10\n"
        "subps %%xmm6, %%xmm14\n"
        "movaps %%xmm3, %%xmm11\n"
        "movaps %%xmm3, %%xmm15\n"
        "addps %%xmm7, %%xmm11\n"
        "subps %%xmm7, %%xmm15\n"
        /* Store the results back to the same eight locations. */
        "movups %%xmm8, (%0)\n"
        "movups %%xmm9, (%1)\n"
        "movups %%xmm10, (%2)\n"
        "movups %%xmm11, (%3)\n"
        "movups %%xmm12, (%4)\n"
        "movups %%xmm13, (%5)\n"
        "movups %%xmm14, (%6)\n"
        "movups %%xmm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
      );
    }
  }
}
static inline void helper_float_9(float *buf);
/*
 * helper_float_9: in-place fast Hadamard transform of 512 (= 2^9) floats.
 *
 * Machine-generated SSE kernel.  The inline asm relies on an exact register
 * schedule, so the code is left byte-identical and only comments are added.
 * buf is accessed with unaligned 16-byte loads/stores (movups); addsubps
 * needs SSE3 at run time.
 */
static inline void helper_float_9(float *buf) {
  /* Pass 1: for each 32-float chunk, butterfly stages of stride 1 and 2
   * inside each 4-lane vector, then strides 4, 8, 16 across the eight
   * registers holding the chunk (same kernel as helper_float_8's pass 1). */
  for (int j = 0; j < 512; j += 32) {
    for (int k = 0; k < 4; k += 4) { /* single iteration; generator artifact */
      __asm__ volatile (
        /* Load the 32-float chunk into xmm0..xmm7. */
        "movups (%0), %%xmm0\n"
        "movups (%1), %%xmm1\n"
        "movups (%2), %%xmm2\n"
        "movups (%3), %%xmm3\n"
        "movups (%4), %%xmm4\n"
        "movups (%5), %%xmm5\n"
        "movups (%6), %%xmm6\n"
        "movups (%7), %%xmm7\n"
        /* Stage 1 (stride 1), per vector: (a0,a1,a2,a3) ->
         * (a0+a1, a0-a1, a2+a3, a2-a3) via lane duplication, negation,
         * and addsubps. */
        "movaps %%xmm0, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm0, %%xmm0\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm0, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm0\n"
        "movaps %%xmm1, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm1, %%xmm1\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm1, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm1\n"
        "movaps %%xmm2, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm2, %%xmm2\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm2, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm2\n"
        "movaps %%xmm3, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm3, %%xmm3\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm3, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm3\n"
        "movaps %%xmm4, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm4, %%xmm4\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm4, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm4\n"
        "movaps %%xmm5, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm5, %%xmm5\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm5, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm5\n"
        "movaps %%xmm6, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm6, %%xmm6\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm6, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm6\n"
        "movaps %%xmm7, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm7, %%xmm7\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm7, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm7\n"
        /* Stage 2 (stride 2), per vector: (a0,a1,a2,a3) ->
         * (a0+a2, a1+a3, a0-a2, a1-a3) with half-vector shuffles against
         * the zeroed xmm9. */
        "movaps %%xmm0, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm0, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm0, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm0\n"
        "movaps %%xmm1, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm1, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm1, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm1\n"
        "movaps %%xmm2, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm2, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm2, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm2\n"
        "movaps %%xmm3, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm3, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm3, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm3\n"
        "movaps %%xmm4, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm4, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm4, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm4\n"
        "movaps %%xmm5, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm5, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm5, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm5\n"
        "movaps %%xmm6, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm6, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm6, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm6\n"
        "movaps %%xmm7, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm7, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm7, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm7\n"
        /* Stages 3-5 (strides 4, 8, 16): radix-8 add/sub network across
         * xmm0..xmm7. */
        "movaps %%xmm0, %%xmm8\n"
        "movaps %%xmm0, %%xmm9\n"
        "addps %%xmm1, %%xmm8\n"
        "subps %%xmm1, %%xmm9\n"
        "movaps %%xmm2, %%xmm10\n"
        "movaps %%xmm2, %%xmm11\n"
        "addps %%xmm3, %%xmm10\n"
        "subps %%xmm3, %%xmm11\n"
        "movaps %%xmm4, %%xmm12\n"
        "movaps %%xmm4, %%xmm13\n"
        "addps %%xmm5, %%xmm12\n"
        "subps %%xmm5, %%xmm13\n"
        "movaps %%xmm6, %%xmm14\n"
        "movaps %%xmm6, %%xmm15\n"
        "addps %%xmm7, %%xmm14\n"
        "subps %%xmm7, %%xmm15\n"
        "movaps %%xmm8, %%xmm0\n"
        "movaps %%xmm8, %%xmm2\n"
        "addps %%xmm10, %%xmm0\n"
        "subps %%xmm10, %%xmm2\n"
        "movaps %%xmm9, %%xmm1\n"
        "movaps %%xmm9, %%xmm3\n"
        "addps %%xmm11, %%xmm1\n"
        "subps %%xmm11, %%xmm3\n"
        "movaps %%xmm12, %%xmm4\n"
        "movaps %%xmm12, %%xmm6\n"
        "addps %%xmm14, %%xmm4\n"
        "subps %%xmm14, %%xmm6\n"
        "movaps %%xmm13, %%xmm5\n"
        "movaps %%xmm13, %%xmm7\n"
        "addps %%xmm15, %%xmm5\n"
        "subps %%xmm15, %%xmm7\n"
        "movaps %%xmm0, %%xmm8\n"
        "movaps %%xmm0, %%xmm12\n"
        "addps %%xmm4, %%xmm8\n"
        "subps %%xmm4, %%xmm12\n"
        "movaps %%xmm1, %%xmm9\n"
        "movaps %%xmm1, %%xmm13\n"
        "addps %%xmm5, %%xmm9\n"
        "subps %%xmm5, %%xmm13\n"
        "movaps %%xmm2, %%xmm10\n"
        "movaps %%xmm2, %%xmm14\n"
        "addps %%xmm6, %%xmm10\n"
        "subps %%xmm6, %%xmm14\n"
        "movaps %%xmm3, %%xmm11\n"
        "movaps %%xmm3, %%xmm15\n"
        "addps %%xmm7, %%xmm11\n"
        "subps %%xmm7, %%xmm15\n"
        /* Write the transformed chunk back in place. */
        "movups %%xmm8, (%0)\n"
        "movups %%xmm9, (%1)\n"
        "movups %%xmm10, (%2)\n"
        "movups %%xmm11, (%3)\n"
        "movups %%xmm12, (%4)\n"
        "movups %%xmm13, (%5)\n"
        "movups %%xmm14, (%6)\n"
        "movups %%xmm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
      );
    }
  }
  /* Pass 2 (stages 6-8): radix-8 pass within each 256-float half, combining
   * eight 32-float chunks; operand pointers are spaced 32 floats apart. */
  for (int j = 0; j < 512; j += 256) {
    for (int k = 0; k < 32; k += 4) {
      __asm__ volatile (
        /* Load one 4-float vector from each of the eight chunks. */
        "movups (%0), %%xmm0\n"
        "movups (%1), %%xmm1\n"
        "movups (%2), %%xmm2\n"
        "movups (%3), %%xmm3\n"
        "movups (%4), %%xmm4\n"
        "movups (%5), %%xmm5\n"
        "movups (%6), %%xmm6\n"
        "movups (%7), %%xmm7\n"
        /* Radix-8 add/sub butterfly network across the eight vectors. */
        "movaps %%xmm0, %%xmm8\n"
        "movaps %%xmm0, %%xmm9\n"
        "addps %%xmm1, %%xmm8\n"
        "subps %%xmm1, %%xmm9\n"
        "movaps %%xmm2, %%xmm10\n"
        "movaps %%xmm2, %%xmm11\n"
        "addps %%xmm3, %%xmm10\n"
        "subps %%xmm3, %%xmm11\n"
        "movaps %%xmm4, %%xmm12\n"
        "movaps %%xmm4, %%xmm13\n"
        "addps %%xmm5, %%xmm12\n"
        "subps %%xmm5, %%xmm13\n"
        "movaps %%xmm6, %%xmm14\n"
        "movaps %%xmm6, %%xmm15\n"
        "addps %%xmm7, %%xmm14\n"
        "subps %%xmm7, %%xmm15\n"
        "movaps %%xmm8, %%xmm0\n"
        "movaps %%xmm8, %%xmm2\n"
        "addps %%xmm10, %%xmm0\n"
        "subps %%xmm10, %%xmm2\n"
        "movaps %%xmm9, %%xmm1\n"
        "movaps %%xmm9, %%xmm3\n"
        "addps %%xmm11, %%xmm1\n"
        "subps %%xmm11, %%xmm3\n"
        "movaps %%xmm12, %%xmm4\n"
        "movaps %%xmm12, %%xmm6\n"
        "addps %%xmm14, %%xmm4\n"
        "subps %%xmm14, %%xmm6\n"
        "movaps %%xmm13, %%xmm5\n"
        "movaps %%xmm13, %%xmm7\n"
        "addps %%xmm15, %%xmm5\n"
        "subps %%xmm15, %%xmm7\n"
        "movaps %%xmm0, %%xmm8\n"
        "movaps %%xmm0, %%xmm12\n"
        "addps %%xmm4, %%xmm8\n"
        "subps %%xmm4, %%xmm12\n"
        "movaps %%xmm1, %%xmm9\n"
        "movaps %%xmm1, %%xmm13\n"
        "addps %%xmm5, %%xmm9\n"
        "subps %%xmm5, %%xmm13\n"
        "movaps %%xmm2, %%xmm10\n"
        "movaps %%xmm2, %%xmm14\n"
        "addps %%xmm6, %%xmm10\n"
        "subps %%xmm6, %%xmm14\n"
        "movaps %%xmm3, %%xmm11\n"
        "movaps %%xmm3, %%xmm15\n"
        "addps %%xmm7, %%xmm11\n"
        "subps %%xmm7, %%xmm15\n"
        /* Store the results back to the same eight locations. */
        "movups %%xmm8, (%0)\n"
        "movups %%xmm9, (%1)\n"
        "movups %%xmm10, (%2)\n"
        "movups %%xmm11, (%3)\n"
        "movups %%xmm12, (%4)\n"
        "movups %%xmm13, (%5)\n"
        "movups %%xmm14, (%6)\n"
        "movups %%xmm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
      );
    }
  }
  /* Pass 3 (stage 9): final radix-2 pass pairing elements 256 apart. */
  for (int j = 0; j < 512; j += 512) {
    for (int k = 0; k < 256; k += 4) {
      __asm__ volatile (
        "movups (%0), %%xmm0\n"
        "movups (%1), %%xmm1\n"
        "movaps %%xmm0, %%xmm8\n"
        "movaps %%xmm0, %%xmm9\n"
        "addps %%xmm1, %%xmm8\n"
        "subps %%xmm1, %%xmm9\n"
        "movups %%xmm8, (%0)\n"
        "movups %%xmm9, (%1)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 256) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
      );
    }
  }
}
1454 static inline void helper_float_10(float *buf);
helper_float_10(float * buf)1455 static inline void helper_float_10(float *buf) {
1456   for (int j = 0; j < 1024; j += 32) {
1457     for (int k = 0; k < 4; k += 4) {
1458       __asm__ volatile (
1459         "movups (%0), %%xmm0\n"
1460         "movups (%1), %%xmm1\n"
1461         "movups (%2), %%xmm2\n"
1462         "movups (%3), %%xmm3\n"
1463         "movups (%4), %%xmm4\n"
1464         "movups (%5), %%xmm5\n"
1465         "movups (%6), %%xmm6\n"
1466         "movups (%7), %%xmm7\n"
1467         "movaps %%xmm0, %%xmm8\n"
1468         "shufps $160, %%xmm8, %%xmm8\n"
1469         "shufps $245, %%xmm0, %%xmm0\n"
1470         "xorps %%xmm9, %%xmm9\n"
1471         "subps %%xmm0, %%xmm9\n"
1472         "addsubps %%xmm9, %%xmm8\n"
1473         "movaps %%xmm8, %%xmm0\n"
1474         "movaps %%xmm1, %%xmm8\n"
1475         "shufps $160, %%xmm8, %%xmm8\n"
1476         "shufps $245, %%xmm1, %%xmm1\n"
1477         "xorps %%xmm9, %%xmm9\n"
1478         "subps %%xmm1, %%xmm9\n"
1479         "addsubps %%xmm9, %%xmm8\n"
1480         "movaps %%xmm8, %%xmm1\n"
1481         "movaps %%xmm2, %%xmm8\n"
1482         "shufps $160, %%xmm8, %%xmm8\n"
1483         "shufps $245, %%xmm2, %%xmm2\n"
1484         "xorps %%xmm9, %%xmm9\n"
1485         "subps %%xmm2, %%xmm9\n"
1486         "addsubps %%xmm9, %%xmm8\n"
1487         "movaps %%xmm8, %%xmm2\n"
1488         "movaps %%xmm3, %%xmm8\n"
1489         "shufps $160, %%xmm8, %%xmm8\n"
1490         "shufps $245, %%xmm3, %%xmm3\n"
1491         "xorps %%xmm9, %%xmm9\n"
1492         "subps %%xmm3, %%xmm9\n"
1493         "addsubps %%xmm9, %%xmm8\n"
1494         "movaps %%xmm8, %%xmm3\n"
1495         "movaps %%xmm4, %%xmm8\n"
1496         "shufps $160, %%xmm8, %%xmm8\n"
1497         "shufps $245, %%xmm4, %%xmm4\n"
1498         "xorps %%xmm9, %%xmm9\n"
1499         "subps %%xmm4, %%xmm9\n"
1500         "addsubps %%xmm9, %%xmm8\n"
1501         "movaps %%xmm8, %%xmm4\n"
1502         "movaps %%xmm5, %%xmm8\n"
1503         "shufps $160, %%xmm8, %%xmm8\n"
1504         "shufps $245, %%xmm5, %%xmm5\n"
1505         "xorps %%xmm9, %%xmm9\n"
1506         "subps %%xmm5, %%xmm9\n"
1507         "addsubps %%xmm9, %%xmm8\n"
1508         "movaps %%xmm8, %%xmm5\n"
1509         "movaps %%xmm6, %%xmm8\n"
1510         "shufps $160, %%xmm8, %%xmm8\n"
1511         "shufps $245, %%xmm6, %%xmm6\n"
1512         "xorps %%xmm9, %%xmm9\n"
1513         "subps %%xmm6, %%xmm9\n"
1514         "addsubps %%xmm9, %%xmm8\n"
1515         "movaps %%xmm8, %%xmm6\n"
1516         "movaps %%xmm7, %%xmm8\n"
1517         "shufps $160, %%xmm8, %%xmm8\n"
1518         "shufps $245, %%xmm7, %%xmm7\n"
1519         "xorps %%xmm9, %%xmm9\n"
1520         "subps %%xmm7, %%xmm9\n"
1521         "addsubps %%xmm9, %%xmm8\n"
1522         "movaps %%xmm8, %%xmm7\n"
1523         "movaps %%xmm0, %%xmm8\n"
1524         "shufps $68, %%xmm8, %%xmm8\n"
1525         "xorps %%xmm9, %%xmm9\n"
1526         "movaps %%xmm0, %%xmm10\n"
1527         "shufps $14, %%xmm9, %%xmm10\n"
1528         "movaps %%xmm0, %%xmm11\n"
1529         "shufps $224, %%xmm11, %%xmm9\n"
1530         "addps %%xmm8, %%xmm10\n"
1531         "subps %%xmm9, %%xmm10\n"
1532         "movaps %%xmm10, %%xmm0\n"
1533         "movaps %%xmm1, %%xmm8\n"
1534         "shufps $68, %%xmm8, %%xmm8\n"
1535         "xorps %%xmm9, %%xmm9\n"
1536         "movaps %%xmm1, %%xmm10\n"
1537         "shufps $14, %%xmm9, %%xmm10\n"
1538         "movaps %%xmm1, %%xmm11\n"
1539         "shufps $224, %%xmm11, %%xmm9\n"
1540         "addps %%xmm8, %%xmm10\n"
1541         "subps %%xmm9, %%xmm10\n"
1542         "movaps %%xmm10, %%xmm1\n"
1543         "movaps %%xmm2, %%xmm8\n"
1544         "shufps $68, %%xmm8, %%xmm8\n"
1545         "xorps %%xmm9, %%xmm9\n"
1546         "movaps %%xmm2, %%xmm10\n"
1547         "shufps $14, %%xmm9, %%xmm10\n"
1548         "movaps %%xmm2, %%xmm11\n"
1549         "shufps $224, %%xmm11, %%xmm9\n"
1550         "addps %%xmm8, %%xmm10\n"
1551         "subps %%xmm9, %%xmm10\n"
1552         "movaps %%xmm10, %%xmm2\n"
1553         "movaps %%xmm3, %%xmm8\n"
1554         "shufps $68, %%xmm8, %%xmm8\n"
1555         "xorps %%xmm9, %%xmm9\n"
1556         "movaps %%xmm3, %%xmm10\n"
1557         "shufps $14, %%xmm9, %%xmm10\n"
1558         "movaps %%xmm3, %%xmm11\n"
1559         "shufps $224, %%xmm11, %%xmm9\n"
1560         "addps %%xmm8, %%xmm10\n"
1561         "subps %%xmm9, %%xmm10\n"
1562         "movaps %%xmm10, %%xmm3\n"
1563         "movaps %%xmm4, %%xmm8\n"
1564         "shufps $68, %%xmm8, %%xmm8\n"
1565         "xorps %%xmm9, %%xmm9\n"
1566         "movaps %%xmm4, %%xmm10\n"
1567         "shufps $14, %%xmm9, %%xmm10\n"
1568         "movaps %%xmm4, %%xmm11\n"
1569         "shufps $224, %%xmm11, %%xmm9\n"
1570         "addps %%xmm8, %%xmm10\n"
1571         "subps %%xmm9, %%xmm10\n"
1572         "movaps %%xmm10, %%xmm4\n"
1573         "movaps %%xmm5, %%xmm8\n"
1574         "shufps $68, %%xmm8, %%xmm8\n"
1575         "xorps %%xmm9, %%xmm9\n"
1576         "movaps %%xmm5, %%xmm10\n"
1577         "shufps $14, %%xmm9, %%xmm10\n"
1578         "movaps %%xmm5, %%xmm11\n"
1579         "shufps $224, %%xmm11, %%xmm9\n"
1580         "addps %%xmm8, %%xmm10\n"
1581         "subps %%xmm9, %%xmm10\n"
1582         "movaps %%xmm10, %%xmm5\n"
1583         "movaps %%xmm6, %%xmm8\n"
1584         "shufps $68, %%xmm8, %%xmm8\n"
1585         "xorps %%xmm9, %%xmm9\n"
1586         "movaps %%xmm6, %%xmm10\n"
1587         "shufps $14, %%xmm9, %%xmm10\n"
1588         "movaps %%xmm6, %%xmm11\n"
1589         "shufps $224, %%xmm11, %%xmm9\n"
1590         "addps %%xmm8, %%xmm10\n"
1591         "subps %%xmm9, %%xmm10\n"
1592         "movaps %%xmm10, %%xmm6\n"
1593         "movaps %%xmm7, %%xmm8\n"
1594         "shufps $68, %%xmm8, %%xmm8\n"
1595         "xorps %%xmm9, %%xmm9\n"
1596         "movaps %%xmm7, %%xmm10\n"
1597         "shufps $14, %%xmm9, %%xmm10\n"
1598         "movaps %%xmm7, %%xmm11\n"
1599         "shufps $224, %%xmm11, %%xmm9\n"
1600         "addps %%xmm8, %%xmm10\n"
1601         "subps %%xmm9, %%xmm10\n"
1602         "movaps %%xmm10, %%xmm7\n"
1603         "movaps %%xmm0, %%xmm8\n"
1604         "movaps %%xmm0, %%xmm9\n"
1605         "addps %%xmm1, %%xmm8\n"
1606         "subps %%xmm1, %%xmm9\n"
1607         "movaps %%xmm2, %%xmm10\n"
1608         "movaps %%xmm2, %%xmm11\n"
1609         "addps %%xmm3, %%xmm10\n"
1610         "subps %%xmm3, %%xmm11\n"
1611         "movaps %%xmm4, %%xmm12\n"
1612         "movaps %%xmm4, %%xmm13\n"
1613         "addps %%xmm5, %%xmm12\n"
1614         "subps %%xmm5, %%xmm13\n"
1615         "movaps %%xmm6, %%xmm14\n"
1616         "movaps %%xmm6, %%xmm15\n"
1617         "addps %%xmm7, %%xmm14\n"
1618         "subps %%xmm7, %%xmm15\n"
1619         "movaps %%xmm8, %%xmm0\n"
1620         "movaps %%xmm8, %%xmm2\n"
1621         "addps %%xmm10, %%xmm0\n"
1622         "subps %%xmm10, %%xmm2\n"
1623         "movaps %%xmm9, %%xmm1\n"
1624         "movaps %%xmm9, %%xmm3\n"
1625         "addps %%xmm11, %%xmm1\n"
1626         "subps %%xmm11, %%xmm3\n"
1627         "movaps %%xmm12, %%xmm4\n"
1628         "movaps %%xmm12, %%xmm6\n"
1629         "addps %%xmm14, %%xmm4\n"
1630         "subps %%xmm14, %%xmm6\n"
1631         "movaps %%xmm13, %%xmm5\n"
1632         "movaps %%xmm13, %%xmm7\n"
1633         "addps %%xmm15, %%xmm5\n"
1634         "subps %%xmm15, %%xmm7\n"
1635         "movaps %%xmm0, %%xmm8\n"
1636         "movaps %%xmm0, %%xmm12\n"
1637         "addps %%xmm4, %%xmm8\n"
1638         "subps %%xmm4, %%xmm12\n"
1639         "movaps %%xmm1, %%xmm9\n"
1640         "movaps %%xmm1, %%xmm13\n"
1641         "addps %%xmm5, %%xmm9\n"
1642         "subps %%xmm5, %%xmm13\n"
1643         "movaps %%xmm2, %%xmm10\n"
1644         "movaps %%xmm2, %%xmm14\n"
1645         "addps %%xmm6, %%xmm10\n"
1646         "subps %%xmm6, %%xmm14\n"
1647         "movaps %%xmm3, %%xmm11\n"
1648         "movaps %%xmm3, %%xmm15\n"
1649         "addps %%xmm7, %%xmm11\n"
1650         "subps %%xmm7, %%xmm15\n"
1651         "movups %%xmm8, (%0)\n"
1652         "movups %%xmm9, (%1)\n"
1653         "movups %%xmm10, (%2)\n"
1654         "movups %%xmm11, (%3)\n"
1655         "movups %%xmm12, (%4)\n"
1656         "movups %%xmm13, (%5)\n"
1657         "movups %%xmm14, (%6)\n"
1658         "movups %%xmm15, (%7)\n"
1659         :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
1660       );
1661     }
1662   }
1663   for (int j = 0; j < 1024; j += 256) {
1664     for (int k = 0; k < 32; k += 4) {
1665       __asm__ volatile (
1666         "movups (%0), %%xmm0\n"
1667         "movups (%1), %%xmm1\n"
1668         "movups (%2), %%xmm2\n"
1669         "movups (%3), %%xmm3\n"
1670         "movups (%4), %%xmm4\n"
1671         "movups (%5), %%xmm5\n"
1672         "movups (%6), %%xmm6\n"
1673         "movups (%7), %%xmm7\n"
1674         "movaps %%xmm0, %%xmm8\n"
1675         "movaps %%xmm0, %%xmm9\n"
1676         "addps %%xmm1, %%xmm8\n"
1677         "subps %%xmm1, %%xmm9\n"
1678         "movaps %%xmm2, %%xmm10\n"
1679         "movaps %%xmm2, %%xmm11\n"
1680         "addps %%xmm3, %%xmm10\n"
1681         "subps %%xmm3, %%xmm11\n"
1682         "movaps %%xmm4, %%xmm12\n"
1683         "movaps %%xmm4, %%xmm13\n"
1684         "addps %%xmm5, %%xmm12\n"
1685         "subps %%xmm5, %%xmm13\n"
1686         "movaps %%xmm6, %%xmm14\n"
1687         "movaps %%xmm6, %%xmm15\n"
1688         "addps %%xmm7, %%xmm14\n"
1689         "subps %%xmm7, %%xmm15\n"
1690         "movaps %%xmm8, %%xmm0\n"
1691         "movaps %%xmm8, %%xmm2\n"
1692         "addps %%xmm10, %%xmm0\n"
1693         "subps %%xmm10, %%xmm2\n"
1694         "movaps %%xmm9, %%xmm1\n"
1695         "movaps %%xmm9, %%xmm3\n"
1696         "addps %%xmm11, %%xmm1\n"
1697         "subps %%xmm11, %%xmm3\n"
1698         "movaps %%xmm12, %%xmm4\n"
1699         "movaps %%xmm12, %%xmm6\n"
1700         "addps %%xmm14, %%xmm4\n"
1701         "subps %%xmm14, %%xmm6\n"
1702         "movaps %%xmm13, %%xmm5\n"
1703         "movaps %%xmm13, %%xmm7\n"
1704         "addps %%xmm15, %%xmm5\n"
1705         "subps %%xmm15, %%xmm7\n"
1706         "movaps %%xmm0, %%xmm8\n"
1707         "movaps %%xmm0, %%xmm12\n"
1708         "addps %%xmm4, %%xmm8\n"
1709         "subps %%xmm4, %%xmm12\n"
1710         "movaps %%xmm1, %%xmm9\n"
1711         "movaps %%xmm1, %%xmm13\n"
1712         "addps %%xmm5, %%xmm9\n"
1713         "subps %%xmm5, %%xmm13\n"
1714         "movaps %%xmm2, %%xmm10\n"
1715         "movaps %%xmm2, %%xmm14\n"
1716         "addps %%xmm6, %%xmm10\n"
1717         "subps %%xmm6, %%xmm14\n"
1718         "movaps %%xmm3, %%xmm11\n"
1719         "movaps %%xmm3, %%xmm15\n"
1720         "addps %%xmm7, %%xmm11\n"
1721         "subps %%xmm7, %%xmm15\n"
1722         "movups %%xmm8, (%0)\n"
1723         "movups %%xmm9, (%1)\n"
1724         "movups %%xmm10, (%2)\n"
1725         "movups %%xmm11, (%3)\n"
1726         "movups %%xmm12, (%4)\n"
1727         "movups %%xmm13, (%5)\n"
1728         "movups %%xmm14, (%6)\n"
1729         "movups %%xmm15, (%7)\n"
1730         :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
1731       );
1732     }
1733   }
1734   for (int j = 0; j < 1024; j += 1024) {
1735     for (int k = 0; k < 256; k += 4) {
1736       __asm__ volatile (
1737         "movups (%0), %%xmm0\n"
1738         "movups (%1), %%xmm1\n"
1739         "movups (%2), %%xmm2\n"
1740         "movups (%3), %%xmm3\n"
1741         "movaps %%xmm0, %%xmm8\n"
1742         "movaps %%xmm0, %%xmm9\n"
1743         "addps %%xmm1, %%xmm8\n"
1744         "subps %%xmm1, %%xmm9\n"
1745         "movaps %%xmm2, %%xmm10\n"
1746         "movaps %%xmm2, %%xmm11\n"
1747         "addps %%xmm3, %%xmm10\n"
1748         "subps %%xmm3, %%xmm11\n"
1749         "movaps %%xmm8, %%xmm0\n"
1750         "movaps %%xmm8, %%xmm2\n"
1751         "addps %%xmm10, %%xmm0\n"
1752         "subps %%xmm10, %%xmm2\n"
1753         "movaps %%xmm9, %%xmm1\n"
1754         "movaps %%xmm9, %%xmm3\n"
1755         "addps %%xmm11, %%xmm1\n"
1756         "subps %%xmm11, %%xmm3\n"
1757         "movups %%xmm0, (%0)\n"
1758         "movups %%xmm1, (%1)\n"
1759         "movups %%xmm2, (%2)\n"
1760         "movups %%xmm3, (%3)\n"
1761         :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
1762       );
1763     }
1764   }
1765 }
1766 static inline void helper_float_11(float *buf);
/*
 * helper_float_11: in-place (unnormalized) Fast Hadamard Transform of
 * 2048 contiguous floats, i.e. the 2^11 case of the FHT.
 *
 * Machine-generated SSE code. All loads/stores use movups, so `buf` does
 * not need 16-byte alignment. Requires SSE3 (`addsubps`). Every asm block
 * declares xmm0-xmm15 plus "memory" as clobbers, so it is safe to call
 * from ordinary C code, but it is x86-64 only (xmm8-xmm15).
 *
 * The 11 butterfly stages are performed in three passes:
 *   pass 1: a complete 32-point transform inside each 32-float block
 *           (stages of size 2 and 4 via shufps/addsubps shuffles within a
 *            register, then sizes 8/16/32 via addps/subps across the
 *            eight registers loaded at offsets 0,4,...,28);
 *   pass 2: stages of size 64/128/256, combining eight 32-float blocks
 *           (pointers at stride 32) into each 256-float block;
 *   pass 3: stages of size 512/1024/2048, combining the eight 256-float
 *           blocks (pointers at stride 256) into the full transform.
 */
helper_float_11(float * buf)1767 static inline void helper_float_11(float *buf) {
  /* Pass 1: full 32-point Hadamard transform of each block buf[j..j+31].
     The inner k-loop runs exactly once (k == 0); it exists only because
     this file is machine-generated from a common loop template. */
1768   for (int j = 0; j < 2048; j += 32) {
1769     for (int k = 0; k < 4; k += 4) {
1770       __asm__ volatile (
1771         "movups (%0), %%xmm0\n"
1772         "movups (%1), %%xmm1\n"
1773         "movups (%2), %%xmm2\n"
1774         "movups (%3), %%xmm3\n"
1775         "movups (%4), %%xmm4\n"
1776         "movups (%5), %%xmm5\n"
1777         "movups (%6), %%xmm6\n"
1778         "movups (%7), %%xmm7\n"
1779         "movaps %%xmm0, %%xmm8\n"
1780         "shufps $160, %%xmm8, %%xmm8\n"
1781         "shufps $245, %%xmm0, %%xmm0\n"
1782         "xorps %%xmm9, %%xmm9\n"
1783         "subps %%xmm0, %%xmm9\n"
1784         "addsubps %%xmm9, %%xmm8\n"
1785         "movaps %%xmm8, %%xmm0\n"
1786         "movaps %%xmm1, %%xmm8\n"
1787         "shufps $160, %%xmm8, %%xmm8\n"
1788         "shufps $245, %%xmm1, %%xmm1\n"
1789         "xorps %%xmm9, %%xmm9\n"
1790         "subps %%xmm1, %%xmm9\n"
1791         "addsubps %%xmm9, %%xmm8\n"
1792         "movaps %%xmm8, %%xmm1\n"
1793         "movaps %%xmm2, %%xmm8\n"
1794         "shufps $160, %%xmm8, %%xmm8\n"
1795         "shufps $245, %%xmm2, %%xmm2\n"
1796         "xorps %%xmm9, %%xmm9\n"
1797         "subps %%xmm2, %%xmm9\n"
1798         "addsubps %%xmm9, %%xmm8\n"
1799         "movaps %%xmm8, %%xmm2\n"
1800         "movaps %%xmm3, %%xmm8\n"
1801         "shufps $160, %%xmm8, %%xmm8\n"
1802         "shufps $245, %%xmm3, %%xmm3\n"
1803         "xorps %%xmm9, %%xmm9\n"
1804         "subps %%xmm3, %%xmm9\n"
1805         "addsubps %%xmm9, %%xmm8\n"
1806         "movaps %%xmm8, %%xmm3\n"
1807         "movaps %%xmm4, %%xmm8\n"
1808         "shufps $160, %%xmm8, %%xmm8\n"
1809         "shufps $245, %%xmm4, %%xmm4\n"
1810         "xorps %%xmm9, %%xmm9\n"
1811         "subps %%xmm4, %%xmm9\n"
1812         "addsubps %%xmm9, %%xmm8\n"
1813         "movaps %%xmm8, %%xmm4\n"
1814         "movaps %%xmm5, %%xmm8\n"
1815         "shufps $160, %%xmm8, %%xmm8\n"
1816         "shufps $245, %%xmm5, %%xmm5\n"
1817         "xorps %%xmm9, %%xmm9\n"
1818         "subps %%xmm5, %%xmm9\n"
1819         "addsubps %%xmm9, %%xmm8\n"
1820         "movaps %%xmm8, %%xmm5\n"
1821         "movaps %%xmm6, %%xmm8\n"
1822         "shufps $160, %%xmm8, %%xmm8\n"
1823         "shufps $245, %%xmm6, %%xmm6\n"
1824         "xorps %%xmm9, %%xmm9\n"
1825         "subps %%xmm6, %%xmm9\n"
1826         "addsubps %%xmm9, %%xmm8\n"
1827         "movaps %%xmm8, %%xmm6\n"
1828         "movaps %%xmm7, %%xmm8\n"
1829         "shufps $160, %%xmm8, %%xmm8\n"
1830         "shufps $245, %%xmm7, %%xmm7\n"
1831         "xorps %%xmm9, %%xmm9\n"
1832         "subps %%xmm7, %%xmm9\n"
1833         "addsubps %%xmm9, %%xmm8\n"
1834         "movaps %%xmm8, %%xmm7\n"
1835         "movaps %%xmm0, %%xmm8\n"
1836         "shufps $68, %%xmm8, %%xmm8\n"
1837         "xorps %%xmm9, %%xmm9\n"
1838         "movaps %%xmm0, %%xmm10\n"
1839         "shufps $14, %%xmm9, %%xmm10\n"
1840         "movaps %%xmm0, %%xmm11\n"
1841         "shufps $224, %%xmm11, %%xmm9\n"
1842         "addps %%xmm8, %%xmm10\n"
1843         "subps %%xmm9, %%xmm10\n"
1844         "movaps %%xmm10, %%xmm0\n"
1845         "movaps %%xmm1, %%xmm8\n"
1846         "shufps $68, %%xmm8, %%xmm8\n"
1847         "xorps %%xmm9, %%xmm9\n"
1848         "movaps %%xmm1, %%xmm10\n"
1849         "shufps $14, %%xmm9, %%xmm10\n"
1850         "movaps %%xmm1, %%xmm11\n"
1851         "shufps $224, %%xmm11, %%xmm9\n"
1852         "addps %%xmm8, %%xmm10\n"
1853         "subps %%xmm9, %%xmm10\n"
1854         "movaps %%xmm10, %%xmm1\n"
1855         "movaps %%xmm2, %%xmm8\n"
1856         "shufps $68, %%xmm8, %%xmm8\n"
1857         "xorps %%xmm9, %%xmm9\n"
1858         "movaps %%xmm2, %%xmm10\n"
1859         "shufps $14, %%xmm9, %%xmm10\n"
1860         "movaps %%xmm2, %%xmm11\n"
1861         "shufps $224, %%xmm11, %%xmm9\n"
1862         "addps %%xmm8, %%xmm10\n"
1863         "subps %%xmm9, %%xmm10\n"
1864         "movaps %%xmm10, %%xmm2\n"
1865         "movaps %%xmm3, %%xmm8\n"
1866         "shufps $68, %%xmm8, %%xmm8\n"
1867         "xorps %%xmm9, %%xmm9\n"
1868         "movaps %%xmm3, %%xmm10\n"
1869         "shufps $14, %%xmm9, %%xmm10\n"
1870         "movaps %%xmm3, %%xmm11\n"
1871         "shufps $224, %%xmm11, %%xmm9\n"
1872         "addps %%xmm8, %%xmm10\n"
1873         "subps %%xmm9, %%xmm10\n"
1874         "movaps %%xmm10, %%xmm3\n"
1875         "movaps %%xmm4, %%xmm8\n"
1876         "shufps $68, %%xmm8, %%xmm8\n"
1877         "xorps %%xmm9, %%xmm9\n"
1878         "movaps %%xmm4, %%xmm10\n"
1879         "shufps $14, %%xmm9, %%xmm10\n"
1880         "movaps %%xmm4, %%xmm11\n"
1881         "shufps $224, %%xmm11, %%xmm9\n"
1882         "addps %%xmm8, %%xmm10\n"
1883         "subps %%xmm9, %%xmm10\n"
1884         "movaps %%xmm10, %%xmm4\n"
1885         "movaps %%xmm5, %%xmm8\n"
1886         "shufps $68, %%xmm8, %%xmm8\n"
1887         "xorps %%xmm9, %%xmm9\n"
1888         "movaps %%xmm5, %%xmm10\n"
1889         "shufps $14, %%xmm9, %%xmm10\n"
1890         "movaps %%xmm5, %%xmm11\n"
1891         "shufps $224, %%xmm11, %%xmm9\n"
1892         "addps %%xmm8, %%xmm10\n"
1893         "subps %%xmm9, %%xmm10\n"
1894         "movaps %%xmm10, %%xmm5\n"
1895         "movaps %%xmm6, %%xmm8\n"
1896         "shufps $68, %%xmm8, %%xmm8\n"
1897         "xorps %%xmm9, %%xmm9\n"
1898         "movaps %%xmm6, %%xmm10\n"
1899         "shufps $14, %%xmm9, %%xmm10\n"
1900         "movaps %%xmm6, %%xmm11\n"
1901         "shufps $224, %%xmm11, %%xmm9\n"
1902         "addps %%xmm8, %%xmm10\n"
1903         "subps %%xmm9, %%xmm10\n"
1904         "movaps %%xmm10, %%xmm6\n"
1905         "movaps %%xmm7, %%xmm8\n"
1906         "shufps $68, %%xmm8, %%xmm8\n"
1907         "xorps %%xmm9, %%xmm9\n"
1908         "movaps %%xmm7, %%xmm10\n"
1909         "shufps $14, %%xmm9, %%xmm10\n"
1910         "movaps %%xmm7, %%xmm11\n"
1911         "shufps $224, %%xmm11, %%xmm9\n"
1912         "addps %%xmm8, %%xmm10\n"
1913         "subps %%xmm9, %%xmm10\n"
1914         "movaps %%xmm10, %%xmm7\n"
1915         "movaps %%xmm0, %%xmm8\n"
1916         "movaps %%xmm0, %%xmm9\n"
1917         "addps %%xmm1, %%xmm8\n"
1918         "subps %%xmm1, %%xmm9\n"
1919         "movaps %%xmm2, %%xmm10\n"
1920         "movaps %%xmm2, %%xmm11\n"
1921         "addps %%xmm3, %%xmm10\n"
1922         "subps %%xmm3, %%xmm11\n"
1923         "movaps %%xmm4, %%xmm12\n"
1924         "movaps %%xmm4, %%xmm13\n"
1925         "addps %%xmm5, %%xmm12\n"
1926         "subps %%xmm5, %%xmm13\n"
1927         "movaps %%xmm6, %%xmm14\n"
1928         "movaps %%xmm6, %%xmm15\n"
1929         "addps %%xmm7, %%xmm14\n"
1930         "subps %%xmm7, %%xmm15\n"
1931         "movaps %%xmm8, %%xmm0\n"
1932         "movaps %%xmm8, %%xmm2\n"
1933         "addps %%xmm10, %%xmm0\n"
1934         "subps %%xmm10, %%xmm2\n"
1935         "movaps %%xmm9, %%xmm1\n"
1936         "movaps %%xmm9, %%xmm3\n"
1937         "addps %%xmm11, %%xmm1\n"
1938         "subps %%xmm11, %%xmm3\n"
1939         "movaps %%xmm12, %%xmm4\n"
1940         "movaps %%xmm12, %%xmm6\n"
1941         "addps %%xmm14, %%xmm4\n"
1942         "subps %%xmm14, %%xmm6\n"
1943         "movaps %%xmm13, %%xmm5\n"
1944         "movaps %%xmm13, %%xmm7\n"
1945         "addps %%xmm15, %%xmm5\n"
1946         "subps %%xmm15, %%xmm7\n"
1947         "movaps %%xmm0, %%xmm8\n"
1948         "movaps %%xmm0, %%xmm12\n"
1949         "addps %%xmm4, %%xmm8\n"
1950         "subps %%xmm4, %%xmm12\n"
1951         "movaps %%xmm1, %%xmm9\n"
1952         "movaps %%xmm1, %%xmm13\n"
1953         "addps %%xmm5, %%xmm9\n"
1954         "subps %%xmm5, %%xmm13\n"
1955         "movaps %%xmm2, %%xmm10\n"
1956         "movaps %%xmm2, %%xmm14\n"
1957         "addps %%xmm6, %%xmm10\n"
1958         "subps %%xmm6, %%xmm14\n"
1959         "movaps %%xmm3, %%xmm11\n"
1960         "movaps %%xmm3, %%xmm15\n"
1961         "addps %%xmm7, %%xmm11\n"
1962         "subps %%xmm7, %%xmm15\n"
1963         "movups %%xmm8, (%0)\n"
1964         "movups %%xmm9, (%1)\n"
1965         "movups %%xmm10, (%2)\n"
1966         "movups %%xmm11, (%3)\n"
1967         "movups %%xmm12, (%4)\n"
1968         "movups %%xmm13, (%5)\n"
1969         "movups %%xmm14, (%6)\n"
1970         "movups %%xmm15, (%7)\n"
1971         :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
1972       );
1973     }
1974   }
  /* Pass 2: butterfly stages of size 64/128/256. Eight vectors are taken
     at stride 32 within each 256-float block and combined with the same
     three-level add/sub network used at the end of pass 1. */
1975   for (int j = 0; j < 2048; j += 256) {
1976     for (int k = 0; k < 32; k += 4) {
1977       __asm__ volatile (
1978         "movups (%0), %%xmm0\n"
1979         "movups (%1), %%xmm1\n"
1980         "movups (%2), %%xmm2\n"
1981         "movups (%3), %%xmm3\n"
1982         "movups (%4), %%xmm4\n"
1983         "movups (%5), %%xmm5\n"
1984         "movups (%6), %%xmm6\n"
1985         "movups (%7), %%xmm7\n"
1986         "movaps %%xmm0, %%xmm8\n"
1987         "movaps %%xmm0, %%xmm9\n"
1988         "addps %%xmm1, %%xmm8\n"
1989         "subps %%xmm1, %%xmm9\n"
1990         "movaps %%xmm2, %%xmm10\n"
1991         "movaps %%xmm2, %%xmm11\n"
1992         "addps %%xmm3, %%xmm10\n"
1993         "subps %%xmm3, %%xmm11\n"
1994         "movaps %%xmm4, %%xmm12\n"
1995         "movaps %%xmm4, %%xmm13\n"
1996         "addps %%xmm5, %%xmm12\n"
1997         "subps %%xmm5, %%xmm13\n"
1998         "movaps %%xmm6, %%xmm14\n"
1999         "movaps %%xmm6, %%xmm15\n"
2000         "addps %%xmm7, %%xmm14\n"
2001         "subps %%xmm7, %%xmm15\n"
2002         "movaps %%xmm8, %%xmm0\n"
2003         "movaps %%xmm8, %%xmm2\n"
2004         "addps %%xmm10, %%xmm0\n"
2005         "subps %%xmm10, %%xmm2\n"
2006         "movaps %%xmm9, %%xmm1\n"
2007         "movaps %%xmm9, %%xmm3\n"
2008         "addps %%xmm11, %%xmm1\n"
2009         "subps %%xmm11, %%xmm3\n"
2010         "movaps %%xmm12, %%xmm4\n"
2011         "movaps %%xmm12, %%xmm6\n"
2012         "addps %%xmm14, %%xmm4\n"
2013         "subps %%xmm14, %%xmm6\n"
2014         "movaps %%xmm13, %%xmm5\n"
2015         "movaps %%xmm13, %%xmm7\n"
2016         "addps %%xmm15, %%xmm5\n"
2017         "subps %%xmm15, %%xmm7\n"
2018         "movaps %%xmm0, %%xmm8\n"
2019         "movaps %%xmm0, %%xmm12\n"
2020         "addps %%xmm4, %%xmm8\n"
2021         "subps %%xmm4, %%xmm12\n"
2022         "movaps %%xmm1, %%xmm9\n"
2023         "movaps %%xmm1, %%xmm13\n"
2024         "addps %%xmm5, %%xmm9\n"
2025         "subps %%xmm5, %%xmm13\n"
2026         "movaps %%xmm2, %%xmm10\n"
2027         "movaps %%xmm2, %%xmm14\n"
2028         "addps %%xmm6, %%xmm10\n"
2029         "subps %%xmm6, %%xmm14\n"
2030         "movaps %%xmm3, %%xmm11\n"
2031         "movaps %%xmm3, %%xmm15\n"
2032         "addps %%xmm7, %%xmm11\n"
2033         "subps %%xmm7, %%xmm15\n"
2034         "movups %%xmm8, (%0)\n"
2035         "movups %%xmm9, (%1)\n"
2036         "movups %%xmm10, (%2)\n"
2037         "movups %%xmm11, (%3)\n"
2038         "movups %%xmm12, (%4)\n"
2039         "movups %%xmm13, (%5)\n"
2040         "movups %%xmm14, (%6)\n"
2041         "movups %%xmm15, (%7)\n"
2042         :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
2043       );
2044     }
2045   }
  /* Pass 3: final butterfly stages of size 512/1024/2048. The outer loop
     runs once (j == 0); eight vectors at stride 256 span the whole
     2048-float buffer and are combined by the same add/sub network. */
2046   for (int j = 0; j < 2048; j += 2048) {
2047     for (int k = 0; k < 256; k += 4) {
2048       __asm__ volatile (
2049         "movups (%0), %%xmm0\n"
2050         "movups (%1), %%xmm1\n"
2051         "movups (%2), %%xmm2\n"
2052         "movups (%3), %%xmm3\n"
2053         "movups (%4), %%xmm4\n"
2054         "movups (%5), %%xmm5\n"
2055         "movups (%6), %%xmm6\n"
2056         "movups (%7), %%xmm7\n"
2057         "movaps %%xmm0, %%xmm8\n"
2058         "movaps %%xmm0, %%xmm9\n"
2059         "addps %%xmm1, %%xmm8\n"
2060         "subps %%xmm1, %%xmm9\n"
2061         "movaps %%xmm2, %%xmm10\n"
2062         "movaps %%xmm2, %%xmm11\n"
2063         "addps %%xmm3, %%xmm10\n"
2064         "subps %%xmm3, %%xmm11\n"
2065         "movaps %%xmm4, %%xmm12\n"
2066         "movaps %%xmm4, %%xmm13\n"
2067         "addps %%xmm5, %%xmm12\n"
2068         "subps %%xmm5, %%xmm13\n"
2069         "movaps %%xmm6, %%xmm14\n"
2070         "movaps %%xmm6, %%xmm15\n"
2071         "addps %%xmm7, %%xmm14\n"
2072         "subps %%xmm7, %%xmm15\n"
2073         "movaps %%xmm8, %%xmm0\n"
2074         "movaps %%xmm8, %%xmm2\n"
2075         "addps %%xmm10, %%xmm0\n"
2076         "subps %%xmm10, %%xmm2\n"
2077         "movaps %%xmm9, %%xmm1\n"
2078         "movaps %%xmm9, %%xmm3\n"
2079         "addps %%xmm11, %%xmm1\n"
2080         "subps %%xmm11, %%xmm3\n"
2081         "movaps %%xmm12, %%xmm4\n"
2082         "movaps %%xmm12, %%xmm6\n"
2083         "addps %%xmm14, %%xmm4\n"
2084         "subps %%xmm14, %%xmm6\n"
2085         "movaps %%xmm13, %%xmm5\n"
2086         "movaps %%xmm13, %%xmm7\n"
2087         "addps %%xmm15, %%xmm5\n"
2088         "subps %%xmm15, %%xmm7\n"
2089         "movaps %%xmm0, %%xmm8\n"
2090         "movaps %%xmm0, %%xmm12\n"
2091         "addps %%xmm4, %%xmm8\n"
2092         "subps %%xmm4, %%xmm12\n"
2093         "movaps %%xmm1, %%xmm9\n"
2094         "movaps %%xmm1, %%xmm13\n"
2095         "addps %%xmm5, %%xmm9\n"
2096         "subps %%xmm5, %%xmm13\n"
2097         "movaps %%xmm2, %%xmm10\n"
2098         "movaps %%xmm2, %%xmm14\n"
2099         "addps %%xmm6, %%xmm10\n"
2100         "subps %%xmm6, %%xmm14\n"
2101         "movaps %%xmm3, %%xmm11\n"
2102         "movaps %%xmm3, %%xmm15\n"
2103         "addps %%xmm7, %%xmm11\n"
2104         "subps %%xmm7, %%xmm15\n"
2105         "movups %%xmm8, (%0)\n"
2106         "movups %%xmm9, (%1)\n"
2107         "movups %%xmm10, (%2)\n"
2108         "movups %%xmm11, (%3)\n"
2109         "movups %%xmm12, (%4)\n"
2110         "movups %%xmm13, (%5)\n"
2111         "movups %%xmm14, (%6)\n"
2112         "movups %%xmm15, (%7)\n"
2113         :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
2114       );
2115     }
2116   }
2117 }
2118 void helper_float_12_recursive(float *buf, int depth);
helper_float_12_recursive(float * buf,int depth)2119 void helper_float_12_recursive(float *buf, int depth) {
2120   if (depth == 7) {
2121     for (int j = 0; j < 128; j += 32) {
2122       for (int k = 0; k < 4; k += 4) {
2123         __asm__ volatile (
2124           "movups (%0), %%xmm0\n"
2125           "movups (%1), %%xmm1\n"
2126           "movups (%2), %%xmm2\n"
2127           "movups (%3), %%xmm3\n"
2128           "movups (%4), %%xmm4\n"
2129           "movups (%5), %%xmm5\n"
2130           "movups (%6), %%xmm6\n"
2131           "movups (%7), %%xmm7\n"
2132           "movaps %%xmm0, %%xmm8\n"
2133           "shufps $160, %%xmm8, %%xmm8\n"
2134           "shufps $245, %%xmm0, %%xmm0\n"
2135           "xorps %%xmm9, %%xmm9\n"
2136           "subps %%xmm0, %%xmm9\n"
2137           "addsubps %%xmm9, %%xmm8\n"
2138           "movaps %%xmm8, %%xmm0\n"
2139           "movaps %%xmm1, %%xmm8\n"
2140           "shufps $160, %%xmm8, %%xmm8\n"
2141           "shufps $245, %%xmm1, %%xmm1\n"
2142           "xorps %%xmm9, %%xmm9\n"
2143           "subps %%xmm1, %%xmm9\n"
2144           "addsubps %%xmm9, %%xmm8\n"
2145           "movaps %%xmm8, %%xmm1\n"
2146           "movaps %%xmm2, %%xmm8\n"
2147           "shufps $160, %%xmm8, %%xmm8\n"
2148           "shufps $245, %%xmm2, %%xmm2\n"
2149           "xorps %%xmm9, %%xmm9\n"
2150           "subps %%xmm2, %%xmm9\n"
2151           "addsubps %%xmm9, %%xmm8\n"
2152           "movaps %%xmm8, %%xmm2\n"
2153           "movaps %%xmm3, %%xmm8\n"
2154           "shufps $160, %%xmm8, %%xmm8\n"
2155           "shufps $245, %%xmm3, %%xmm3\n"
2156           "xorps %%xmm9, %%xmm9\n"
2157           "subps %%xmm3, %%xmm9\n"
2158           "addsubps %%xmm9, %%xmm8\n"
2159           "movaps %%xmm8, %%xmm3\n"
2160           "movaps %%xmm4, %%xmm8\n"
2161           "shufps $160, %%xmm8, %%xmm8\n"
2162           "shufps $245, %%xmm4, %%xmm4\n"
2163           "xorps %%xmm9, %%xmm9\n"
2164           "subps %%xmm4, %%xmm9\n"
2165           "addsubps %%xmm9, %%xmm8\n"
2166           "movaps %%xmm8, %%xmm4\n"
2167           "movaps %%xmm5, %%xmm8\n"
2168           "shufps $160, %%xmm8, %%xmm8\n"
2169           "shufps $245, %%xmm5, %%xmm5\n"
2170           "xorps %%xmm9, %%xmm9\n"
2171           "subps %%xmm5, %%xmm9\n"
2172           "addsubps %%xmm9, %%xmm8\n"
2173           "movaps %%xmm8, %%xmm5\n"
2174           "movaps %%xmm6, %%xmm8\n"
2175           "shufps $160, %%xmm8, %%xmm8\n"
2176           "shufps $245, %%xmm6, %%xmm6\n"
2177           "xorps %%xmm9, %%xmm9\n"
2178           "subps %%xmm6, %%xmm9\n"
2179           "addsubps %%xmm9, %%xmm8\n"
2180           "movaps %%xmm8, %%xmm6\n"
2181           "movaps %%xmm7, %%xmm8\n"
2182           "shufps $160, %%xmm8, %%xmm8\n"
2183           "shufps $245, %%xmm7, %%xmm7\n"
2184           "xorps %%xmm9, %%xmm9\n"
2185           "subps %%xmm7, %%xmm9\n"
2186           "addsubps %%xmm9, %%xmm8\n"
2187           "movaps %%xmm8, %%xmm7\n"
2188           "movaps %%xmm0, %%xmm8\n"
2189           "shufps $68, %%xmm8, %%xmm8\n"
2190           "xorps %%xmm9, %%xmm9\n"
2191           "movaps %%xmm0, %%xmm10\n"
2192           "shufps $14, %%xmm9, %%xmm10\n"
2193           "movaps %%xmm0, %%xmm11\n"
2194           "shufps $224, %%xmm11, %%xmm9\n"
2195           "addps %%xmm8, %%xmm10\n"
2196           "subps %%xmm9, %%xmm10\n"
2197           "movaps %%xmm10, %%xmm0\n"
2198           "movaps %%xmm1, %%xmm8\n"
2199           "shufps $68, %%xmm8, %%xmm8\n"
2200           "xorps %%xmm9, %%xmm9\n"
2201           "movaps %%xmm1, %%xmm10\n"
2202           "shufps $14, %%xmm9, %%xmm10\n"
2203           "movaps %%xmm1, %%xmm11\n"
2204           "shufps $224, %%xmm11, %%xmm9\n"
2205           "addps %%xmm8, %%xmm10\n"
2206           "subps %%xmm9, %%xmm10\n"
2207           "movaps %%xmm10, %%xmm1\n"
2208           "movaps %%xmm2, %%xmm8\n"
2209           "shufps $68, %%xmm8, %%xmm8\n"
2210           "xorps %%xmm9, %%xmm9\n"
2211           "movaps %%xmm2, %%xmm10\n"
2212           "shufps $14, %%xmm9, %%xmm10\n"
2213           "movaps %%xmm2, %%xmm11\n"
2214           "shufps $224, %%xmm11, %%xmm9\n"
2215           "addps %%xmm8, %%xmm10\n"
2216           "subps %%xmm9, %%xmm10\n"
2217           "movaps %%xmm10, %%xmm2\n"
2218           "movaps %%xmm3, %%xmm8\n"
2219           "shufps $68, %%xmm8, %%xmm8\n"
2220           "xorps %%xmm9, %%xmm9\n"
2221           "movaps %%xmm3, %%xmm10\n"
2222           "shufps $14, %%xmm9, %%xmm10\n"
2223           "movaps %%xmm3, %%xmm11\n"
2224           "shufps $224, %%xmm11, %%xmm9\n"
2225           "addps %%xmm8, %%xmm10\n"
2226           "subps %%xmm9, %%xmm10\n"
2227           "movaps %%xmm10, %%xmm3\n"
2228           "movaps %%xmm4, %%xmm8\n"
2229           "shufps $68, %%xmm8, %%xmm8\n"
2230           "xorps %%xmm9, %%xmm9\n"
2231           "movaps %%xmm4, %%xmm10\n"
2232           "shufps $14, %%xmm9, %%xmm10\n"
2233           "movaps %%xmm4, %%xmm11\n"
2234           "shufps $224, %%xmm11, %%xmm9\n"
2235           "addps %%xmm8, %%xmm10\n"
2236           "subps %%xmm9, %%xmm10\n"
2237           "movaps %%xmm10, %%xmm4\n"
2238           "movaps %%xmm5, %%xmm8\n"
2239           "shufps $68, %%xmm8, %%xmm8\n"
2240           "xorps %%xmm9, %%xmm9\n"
2241           "movaps %%xmm5, %%xmm10\n"
2242           "shufps $14, %%xmm9, %%xmm10\n"
2243           "movaps %%xmm5, %%xmm11\n"
2244           "shufps $224, %%xmm11, %%xmm9\n"
2245           "addps %%xmm8, %%xmm10\n"
2246           "subps %%xmm9, %%xmm10\n"
2247           "movaps %%xmm10, %%xmm5\n"
2248           "movaps %%xmm6, %%xmm8\n"
2249           "shufps $68, %%xmm8, %%xmm8\n"
2250           "xorps %%xmm9, %%xmm9\n"
2251           "movaps %%xmm6, %%xmm10\n"
2252           "shufps $14, %%xmm9, %%xmm10\n"
2253           "movaps %%xmm6, %%xmm11\n"
2254           "shufps $224, %%xmm11, %%xmm9\n"
2255           "addps %%xmm8, %%xmm10\n"
2256           "subps %%xmm9, %%xmm10\n"
2257           "movaps %%xmm10, %%xmm6\n"
2258           "movaps %%xmm7, %%xmm8\n"
2259           "shufps $68, %%xmm8, %%xmm8\n"
2260           "xorps %%xmm9, %%xmm9\n"
2261           "movaps %%xmm7, %%xmm10\n"
2262           "shufps $14, %%xmm9, %%xmm10\n"
2263           "movaps %%xmm7, %%xmm11\n"
2264           "shufps $224, %%xmm11, %%xmm9\n"
2265           "addps %%xmm8, %%xmm10\n"
2266           "subps %%xmm9, %%xmm10\n"
2267           "movaps %%xmm10, %%xmm7\n"
2268           "movaps %%xmm0, %%xmm8\n"
2269           "movaps %%xmm0, %%xmm9\n"
2270           "addps %%xmm1, %%xmm8\n"
2271           "subps %%xmm1, %%xmm9\n"
2272           "movaps %%xmm2, %%xmm10\n"
2273           "movaps %%xmm2, %%xmm11\n"
2274           "addps %%xmm3, %%xmm10\n"
2275           "subps %%xmm3, %%xmm11\n"
2276           "movaps %%xmm4, %%xmm12\n"
2277           "movaps %%xmm4, %%xmm13\n"
2278           "addps %%xmm5, %%xmm12\n"
2279           "subps %%xmm5, %%xmm13\n"
2280           "movaps %%xmm6, %%xmm14\n"
2281           "movaps %%xmm6, %%xmm15\n"
2282           "addps %%xmm7, %%xmm14\n"
2283           "subps %%xmm7, %%xmm15\n"
2284           "movaps %%xmm8, %%xmm0\n"
2285           "movaps %%xmm8, %%xmm2\n"
2286           "addps %%xmm10, %%xmm0\n"
2287           "subps %%xmm10, %%xmm2\n"
2288           "movaps %%xmm9, %%xmm1\n"
2289           "movaps %%xmm9, %%xmm3\n"
2290           "addps %%xmm11, %%xmm1\n"
2291           "subps %%xmm11, %%xmm3\n"
2292           "movaps %%xmm12, %%xmm4\n"
2293           "movaps %%xmm12, %%xmm6\n"
2294           "addps %%xmm14, %%xmm4\n"
2295           "subps %%xmm14, %%xmm6\n"
2296           "movaps %%xmm13, %%xmm5\n"
2297           "movaps %%xmm13, %%xmm7\n"
2298           "addps %%xmm15, %%xmm5\n"
2299           "subps %%xmm15, %%xmm7\n"
2300           "movaps %%xmm0, %%xmm8\n"
2301           "movaps %%xmm0, %%xmm12\n"
2302           "addps %%xmm4, %%xmm8\n"
2303           "subps %%xmm4, %%xmm12\n"
2304           "movaps %%xmm1, %%xmm9\n"
2305           "movaps %%xmm1, %%xmm13\n"
2306           "addps %%xmm5, %%xmm9\n"
2307           "subps %%xmm5, %%xmm13\n"
2308           "movaps %%xmm2, %%xmm10\n"
2309           "movaps %%xmm2, %%xmm14\n"
2310           "addps %%xmm6, %%xmm10\n"
2311           "subps %%xmm6, %%xmm14\n"
2312           "movaps %%xmm3, %%xmm11\n"
2313           "movaps %%xmm3, %%xmm15\n"
2314           "addps %%xmm7, %%xmm11\n"
2315           "subps %%xmm7, %%xmm15\n"
2316           "movups %%xmm8, (%0)\n"
2317           "movups %%xmm9, (%1)\n"
2318           "movups %%xmm10, (%2)\n"
2319           "movups %%xmm11, (%3)\n"
2320           "movups %%xmm12, (%4)\n"
2321           "movups %%xmm13, (%5)\n"
2322           "movups %%xmm14, (%6)\n"
2323           "movups %%xmm15, (%7)\n"
2324           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
2325         );
2326       }
2327     }
2328     for (int j = 0; j < 128; j += 128) {
2329       for (int k = 0; k < 32; k += 4) {
2330         __asm__ volatile (
2331           "movups (%0), %%xmm0\n"
2332           "movups (%1), %%xmm1\n"
2333           "movups (%2), %%xmm2\n"
2334           "movups (%3), %%xmm3\n"
2335           "movaps %%xmm0, %%xmm8\n"
2336           "movaps %%xmm0, %%xmm9\n"
2337           "addps %%xmm1, %%xmm8\n"
2338           "subps %%xmm1, %%xmm9\n"
2339           "movaps %%xmm2, %%xmm10\n"
2340           "movaps %%xmm2, %%xmm11\n"
2341           "addps %%xmm3, %%xmm10\n"
2342           "subps %%xmm3, %%xmm11\n"
2343           "movaps %%xmm8, %%xmm0\n"
2344           "movaps %%xmm8, %%xmm2\n"
2345           "addps %%xmm10, %%xmm0\n"
2346           "subps %%xmm10, %%xmm2\n"
2347           "movaps %%xmm9, %%xmm1\n"
2348           "movaps %%xmm9, %%xmm3\n"
2349           "addps %%xmm11, %%xmm1\n"
2350           "subps %%xmm11, %%xmm3\n"
2351           "movups %%xmm0, (%0)\n"
2352           "movups %%xmm1, (%1)\n"
2353           "movups %%xmm2, (%2)\n"
2354           "movups %%xmm3, (%3)\n"
2355           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
2356         );
2357       }
2358     }
2359     return;
2360   }
2361   if (depth == 10) {
2362     helper_float_12_recursive(buf + 0, 7);
2363     helper_float_12_recursive(buf + 128, 7);
2364     helper_float_12_recursive(buf + 256, 7);
2365     helper_float_12_recursive(buf + 384, 7);
2366     helper_float_12_recursive(buf + 512, 7);
2367     helper_float_12_recursive(buf + 640, 7);
2368     helper_float_12_recursive(buf + 768, 7);
2369     helper_float_12_recursive(buf + 896, 7);
2370     for (int j = 0; j < 1024; j += 1024) {
2371       for (int k = 0; k < 128; k += 4) {
2372         __asm__ volatile (
2373           "movups (%0), %%xmm0\n"
2374           "movups (%1), %%xmm1\n"
2375           "movups (%2), %%xmm2\n"
2376           "movups (%3), %%xmm3\n"
2377           "movups (%4), %%xmm4\n"
2378           "movups (%5), %%xmm5\n"
2379           "movups (%6), %%xmm6\n"
2380           "movups (%7), %%xmm7\n"
2381           "movaps %%xmm0, %%xmm8\n"
2382           "movaps %%xmm0, %%xmm9\n"
2383           "addps %%xmm1, %%xmm8\n"
2384           "subps %%xmm1, %%xmm9\n"
2385           "movaps %%xmm2, %%xmm10\n"
2386           "movaps %%xmm2, %%xmm11\n"
2387           "addps %%xmm3, %%xmm10\n"
2388           "subps %%xmm3, %%xmm11\n"
2389           "movaps %%xmm4, %%xmm12\n"
2390           "movaps %%xmm4, %%xmm13\n"
2391           "addps %%xmm5, %%xmm12\n"
2392           "subps %%xmm5, %%xmm13\n"
2393           "movaps %%xmm6, %%xmm14\n"
2394           "movaps %%xmm6, %%xmm15\n"
2395           "addps %%xmm7, %%xmm14\n"
2396           "subps %%xmm7, %%xmm15\n"
2397           "movaps %%xmm8, %%xmm0\n"
2398           "movaps %%xmm8, %%xmm2\n"
2399           "addps %%xmm10, %%xmm0\n"
2400           "subps %%xmm10, %%xmm2\n"
2401           "movaps %%xmm9, %%xmm1\n"
2402           "movaps %%xmm9, %%xmm3\n"
2403           "addps %%xmm11, %%xmm1\n"
2404           "subps %%xmm11, %%xmm3\n"
2405           "movaps %%xmm12, %%xmm4\n"
2406           "movaps %%xmm12, %%xmm6\n"
2407           "addps %%xmm14, %%xmm4\n"
2408           "subps %%xmm14, %%xmm6\n"
2409           "movaps %%xmm13, %%xmm5\n"
2410           "movaps %%xmm13, %%xmm7\n"
2411           "addps %%xmm15, %%xmm5\n"
2412           "subps %%xmm15, %%xmm7\n"
2413           "movaps %%xmm0, %%xmm8\n"
2414           "movaps %%xmm0, %%xmm12\n"
2415           "addps %%xmm4, %%xmm8\n"
2416           "subps %%xmm4, %%xmm12\n"
2417           "movaps %%xmm1, %%xmm9\n"
2418           "movaps %%xmm1, %%xmm13\n"
2419           "addps %%xmm5, %%xmm9\n"
2420           "subps %%xmm5, %%xmm13\n"
2421           "movaps %%xmm2, %%xmm10\n"
2422           "movaps %%xmm2, %%xmm14\n"
2423           "addps %%xmm6, %%xmm10\n"
2424           "subps %%xmm6, %%xmm14\n"
2425           "movaps %%xmm3, %%xmm11\n"
2426           "movaps %%xmm3, %%xmm15\n"
2427           "addps %%xmm7, %%xmm11\n"
2428           "subps %%xmm7, %%xmm15\n"
2429           "movups %%xmm8, (%0)\n"
2430           "movups %%xmm9, (%1)\n"
2431           "movups %%xmm10, (%2)\n"
2432           "movups %%xmm11, (%3)\n"
2433           "movups %%xmm12, (%4)\n"
2434           "movups %%xmm13, (%5)\n"
2435           "movups %%xmm14, (%6)\n"
2436           "movups %%xmm15, (%7)\n"
2437           :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
2438         );
2439       }
2440     }
2441     return;
2442   }
2443   if (depth == 12) {
2444     helper_float_12_recursive(buf + 0, 10);
2445     helper_float_12_recursive(buf + 1024, 10);
2446     helper_float_12_recursive(buf + 2048, 10);
2447     helper_float_12_recursive(buf + 3072, 10);
2448     for (int j = 0; j < 4096; j += 4096) {
2449       for (int k = 0; k < 1024; k += 4) {
2450         __asm__ volatile (
2451           "movups (%0), %%xmm0\n"
2452           "movups (%1), %%xmm1\n"
2453           "movups (%2), %%xmm2\n"
2454           "movups (%3), %%xmm3\n"
2455           "movaps %%xmm0, %%xmm8\n"
2456           "movaps %%xmm0, %%xmm9\n"
2457           "addps %%xmm1, %%xmm8\n"
2458           "subps %%xmm1, %%xmm9\n"
2459           "movaps %%xmm2, %%xmm10\n"
2460           "movaps %%xmm2, %%xmm11\n"
2461           "addps %%xmm3, %%xmm10\n"
2462           "subps %%xmm3, %%xmm11\n"
2463           "movaps %%xmm8, %%xmm0\n"
2464           "movaps %%xmm8, %%xmm2\n"
2465           "addps %%xmm10, %%xmm0\n"
2466           "subps %%xmm10, %%xmm2\n"
2467           "movaps %%xmm9, %%xmm1\n"
2468           "movaps %%xmm9, %%xmm3\n"
2469           "addps %%xmm11, %%xmm1\n"
2470           "subps %%xmm11, %%xmm3\n"
2471           "movups %%xmm0, (%0)\n"
2472           "movups %%xmm1, (%1)\n"
2473           "movups %%xmm2, (%2)\n"
2474           "movups %%xmm3, (%3)\n"
2475           :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
2476         );
2477       }
2478     }
2479     return;
2480   }
2481 }
void helper_float_12(float *buf);
/* Public entry point: in-place, unnormalized fast Hadamard transform of
 * 4096 (= 2^12) floats.  Thin wrapper that dispatches to the recursive
 * worker with the full depth. */
void helper_float_12(float *buf) {
  helper_float_12_recursive(buf, 12);
}
void helper_float_13_recursive(float *buf, int depth);
/*
 * In-place, unnormalized fast (Walsh-)Hadamard transform worker for the
 * 8192-point transform, implemented with SSE/SSE3 inline assembly.
 *
 * buf   - pointer to 2^depth contiguous floats, overwritten with the result.
 *         All loads/stores use movups, so 16-byte alignment is not required
 *         (though aligned buffers are presumably faster -- confirm at the
 *         allocation site).
 * depth - log2 of the transform length.  Only two values are handled:
 *         13 (the public entry, see helper_float_13) and 11 (the base case
 *         used by the recursion below).  Any other depth is silently a
 *         no-op -- an artifact of the code generator, not a checked error.
 */
void helper_float_13_recursive(float *buf, int depth) {
  if (depth == 11) {
    /* Stages 1-5 (butterfly strides 1, 2, 4, 8, 16): each iteration
     * transforms one 32-float chunk entirely in registers.  The inner
     * k-loop runs exactly once (k < 4, k += 4); generator artifact. */
    for (int j = 0; j < 2048; j += 32) {
      for (int k = 0; k < 4; k += 4) {
        __asm__ volatile (
          /* Load the 32-float chunk into xmm0..xmm7 (4 floats each). */
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          /* Stage 1 (stride 1), per vector: [a0,a1,a2,a3] ->
           * [a0+a1, a0-a1, a2+a3, a2-a3] via shufps + negate + addsubps. */
          "movaps %%xmm0, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm0, %%xmm0\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm0, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm1, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm1, %%xmm1\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm1, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm1\n"
          "movaps %%xmm2, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm2, %%xmm2\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm2, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm2\n"
          "movaps %%xmm3, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm3, %%xmm3\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm3, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm3\n"
          "movaps %%xmm4, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm4, %%xmm4\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm4, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm4\n"
          "movaps %%xmm5, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm5, %%xmm5\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm5, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm5\n"
          "movaps %%xmm6, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm6, %%xmm6\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm6, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm6\n"
          "movaps %%xmm7, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm7, %%xmm7\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm7, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm7\n"
          /* Stage 2 (stride 2), per vector: [b0,b1,b2,b3] ->
           * [b0+b2, b1+b3, b0-b2, b1-b3]. */
          "movaps %%xmm0, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm0, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm0, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm0\n"
          "movaps %%xmm1, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm1, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm1, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm1\n"
          "movaps %%xmm2, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm2\n"
          "movaps %%xmm3, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm3, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm3, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm3\n"
          "movaps %%xmm4, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm4, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm4, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm4\n"
          "movaps %%xmm5, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm5, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm5, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm5\n"
          "movaps %%xmm6, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm6, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm6, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm6\n"
          "movaps %%xmm7, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm7, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm7, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm7\n"
          /* Stages 3-5 (strides 4, 8, 16): add/sub butterfly tree across
           * the eight registers (register distance 1, then 2, then 4). */
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          /* Write the transformed chunk back. */
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Stages 6-8 (strides 32, 64, 128): same 8-way add/sub tree, now
     * combining the eight 32-float blocks of each 256-float group, one
     * 4-float column (k) at a time. */
    for (int j = 0; j < 2048; j += 256) {
      for (int k = 0; k < 32; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Stages 9-11 (strides 256, 512, 1024): combine the eight 256-float
     * blocks of the whole 2048-float buffer. */
    for (int j = 0; j < 2048; j += 2048) {
      for (int k = 0; k < 256; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  /* depth == 13: transform the four 2048-float quarters independently,
   * then apply the final two stages (strides 2048 and 4096) as a 4-way
   * add/sub butterfly, one 4-float column at a time. */
  if (depth == 13) {
    helper_float_13_recursive(buf + 0, 11);
    helper_float_13_recursive(buf + 2048, 11);
    helper_float_13_recursive(buf + 4096, 11);
    helper_float_13_recursive(buf + 6144, 11);
    for (int j = 0; j < 8192; j += 8192) {
      for (int k = 0; k < 2048; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movups %%xmm0, (%0)\n"
          "movups %%xmm1, (%1)\n"
          "movups %%xmm2, (%2)\n"
          "movups %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_float_13(float *buf);
/* Public entry point: in-place, unnormalized fast Hadamard transform of
 * 8192 (= 2^13) floats.  Thin wrapper that dispatches to the recursive
 * worker with the full depth. */
void helper_float_13(float *buf) {
  helper_float_13_recursive(buf, 13);
}
2883 void helper_float_14_recursive(float *buf, int depth);
helper_float_14_recursive(float * buf,int depth)2884 void helper_float_14_recursive(float *buf, int depth) {
2885   if (depth == 11) {
2886     for (int j = 0; j < 2048; j += 32) {
2887       for (int k = 0; k < 4; k += 4) {
2888         __asm__ volatile (
2889           "movups (%0), %%xmm0\n"
2890           "movups (%1), %%xmm1\n"
2891           "movups (%2), %%xmm2\n"
2892           "movups (%3), %%xmm3\n"
2893           "movups (%4), %%xmm4\n"
2894           "movups (%5), %%xmm5\n"
2895           "movups (%6), %%xmm6\n"
2896           "movups (%7), %%xmm7\n"
2897           "movaps %%xmm0, %%xmm8\n"
2898           "shufps $160, %%xmm8, %%xmm8\n"
2899           "shufps $245, %%xmm0, %%xmm0\n"
2900           "xorps %%xmm9, %%xmm9\n"
2901           "subps %%xmm0, %%xmm9\n"
2902           "addsubps %%xmm9, %%xmm8\n"
2903           "movaps %%xmm8, %%xmm0\n"
2904           "movaps %%xmm1, %%xmm8\n"
2905           "shufps $160, %%xmm8, %%xmm8\n"
2906           "shufps $245, %%xmm1, %%xmm1\n"
2907           "xorps %%xmm9, %%xmm9\n"
2908           "subps %%xmm1, %%xmm9\n"
2909           "addsubps %%xmm9, %%xmm8\n"
2910           "movaps %%xmm8, %%xmm1\n"
2911           "movaps %%xmm2, %%xmm8\n"
2912           "shufps $160, %%xmm8, %%xmm8\n"
2913           "shufps $245, %%xmm2, %%xmm2\n"
2914           "xorps %%xmm9, %%xmm9\n"
2915           "subps %%xmm2, %%xmm9\n"
2916           "addsubps %%xmm9, %%xmm8\n"
2917           "movaps %%xmm8, %%xmm2\n"
2918           "movaps %%xmm3, %%xmm8\n"
2919           "shufps $160, %%xmm8, %%xmm8\n"
2920           "shufps $245, %%xmm3, %%xmm3\n"
2921           "xorps %%xmm9, %%xmm9\n"
2922           "subps %%xmm3, %%xmm9\n"
2923           "addsubps %%xmm9, %%xmm8\n"
2924           "movaps %%xmm8, %%xmm3\n"
2925           "movaps %%xmm4, %%xmm8\n"
2926           "shufps $160, %%xmm8, %%xmm8\n"
2927           "shufps $245, %%xmm4, %%xmm4\n"
2928           "xorps %%xmm9, %%xmm9\n"
2929           "subps %%xmm4, %%xmm9\n"
2930           "addsubps %%xmm9, %%xmm8\n"
2931           "movaps %%xmm8, %%xmm4\n"
2932           "movaps %%xmm5, %%xmm8\n"
2933           "shufps $160, %%xmm8, %%xmm8\n"
2934           "shufps $245, %%xmm5, %%xmm5\n"
2935           "xorps %%xmm9, %%xmm9\n"
2936           "subps %%xmm5, %%xmm9\n"
2937           "addsubps %%xmm9, %%xmm8\n"
2938           "movaps %%xmm8, %%xmm5\n"
2939           "movaps %%xmm6, %%xmm8\n"
2940           "shufps $160, %%xmm8, %%xmm8\n"
2941           "shufps $245, %%xmm6, %%xmm6\n"
2942           "xorps %%xmm9, %%xmm9\n"
2943           "subps %%xmm6, %%xmm9\n"
2944           "addsubps %%xmm9, %%xmm8\n"
2945           "movaps %%xmm8, %%xmm6\n"
2946           "movaps %%xmm7, %%xmm8\n"
2947           "shufps $160, %%xmm8, %%xmm8\n"
2948           "shufps $245, %%xmm7, %%xmm7\n"
2949           "xorps %%xmm9, %%xmm9\n"
2950           "subps %%xmm7, %%xmm9\n"
2951           "addsubps %%xmm9, %%xmm8\n"
2952           "movaps %%xmm8, %%xmm7\n"
2953           "movaps %%xmm0, %%xmm8\n"
2954           "shufps $68, %%xmm8, %%xmm8\n"
2955           "xorps %%xmm9, %%xmm9\n"
2956           "movaps %%xmm0, %%xmm10\n"
2957           "shufps $14, %%xmm9, %%xmm10\n"
2958           "movaps %%xmm0, %%xmm11\n"
2959           "shufps $224, %%xmm11, %%xmm9\n"
2960           "addps %%xmm8, %%xmm10\n"
2961           "subps %%xmm9, %%xmm10\n"
2962           "movaps %%xmm10, %%xmm0\n"
2963           "movaps %%xmm1, %%xmm8\n"
2964           "shufps $68, %%xmm8, %%xmm8\n"
2965           "xorps %%xmm9, %%xmm9\n"
2966           "movaps %%xmm1, %%xmm10\n"
2967           "shufps $14, %%xmm9, %%xmm10\n"
2968           "movaps %%xmm1, %%xmm11\n"
2969           "shufps $224, %%xmm11, %%xmm9\n"
2970           "addps %%xmm8, %%xmm10\n"
2971           "subps %%xmm9, %%xmm10\n"
2972           "movaps %%xmm10, %%xmm1\n"
2973           "movaps %%xmm2, %%xmm8\n"
2974           "shufps $68, %%xmm8, %%xmm8\n"
2975           "xorps %%xmm9, %%xmm9\n"
2976           "movaps %%xmm2, %%xmm10\n"
2977           "shufps $14, %%xmm9, %%xmm10\n"
2978           "movaps %%xmm2, %%xmm11\n"
2979           "shufps $224, %%xmm11, %%xmm9\n"
2980           "addps %%xmm8, %%xmm10\n"
2981           "subps %%xmm9, %%xmm10\n"
2982           "movaps %%xmm10, %%xmm2\n"
2983           "movaps %%xmm3, %%xmm8\n"
2984           "shufps $68, %%xmm8, %%xmm8\n"
2985           "xorps %%xmm9, %%xmm9\n"
2986           "movaps %%xmm3, %%xmm10\n"
2987           "shufps $14, %%xmm9, %%xmm10\n"
2988           "movaps %%xmm3, %%xmm11\n"
2989           "shufps $224, %%xmm11, %%xmm9\n"
2990           "addps %%xmm8, %%xmm10\n"
2991           "subps %%xmm9, %%xmm10\n"
2992           "movaps %%xmm10, %%xmm3\n"
2993           "movaps %%xmm4, %%xmm8\n"
2994           "shufps $68, %%xmm8, %%xmm8\n"
2995           "xorps %%xmm9, %%xmm9\n"
2996           "movaps %%xmm4, %%xmm10\n"
2997           "shufps $14, %%xmm9, %%xmm10\n"
2998           "movaps %%xmm4, %%xmm11\n"
2999           "shufps $224, %%xmm11, %%xmm9\n"
3000           "addps %%xmm8, %%xmm10\n"
3001           "subps %%xmm9, %%xmm10\n"
3002           "movaps %%xmm10, %%xmm4\n"
3003           "movaps %%xmm5, %%xmm8\n"
3004           "shufps $68, %%xmm8, %%xmm8\n"
3005           "xorps %%xmm9, %%xmm9\n"
3006           "movaps %%xmm5, %%xmm10\n"
3007           "shufps $14, %%xmm9, %%xmm10\n"
3008           "movaps %%xmm5, %%xmm11\n"
3009           "shufps $224, %%xmm11, %%xmm9\n"
3010           "addps %%xmm8, %%xmm10\n"
3011           "subps %%xmm9, %%xmm10\n"
3012           "movaps %%xmm10, %%xmm5\n"
3013           "movaps %%xmm6, %%xmm8\n"
3014           "shufps $68, %%xmm8, %%xmm8\n"
3015           "xorps %%xmm9, %%xmm9\n"
3016           "movaps %%xmm6, %%xmm10\n"
3017           "shufps $14, %%xmm9, %%xmm10\n"
3018           "movaps %%xmm6, %%xmm11\n"
3019           "shufps $224, %%xmm11, %%xmm9\n"
3020           "addps %%xmm8, %%xmm10\n"
3021           "subps %%xmm9, %%xmm10\n"
3022           "movaps %%xmm10, %%xmm6\n"
3023           "movaps %%xmm7, %%xmm8\n"
3024           "shufps $68, %%xmm8, %%xmm8\n"
3025           "xorps %%xmm9, %%xmm9\n"
3026           "movaps %%xmm7, %%xmm10\n"
3027           "shufps $14, %%xmm9, %%xmm10\n"
3028           "movaps %%xmm7, %%xmm11\n"
3029           "shufps $224, %%xmm11, %%xmm9\n"
3030           "addps %%xmm8, %%xmm10\n"
3031           "subps %%xmm9, %%xmm10\n"
3032           "movaps %%xmm10, %%xmm7\n"
3033           "movaps %%xmm0, %%xmm8\n"
3034           "movaps %%xmm0, %%xmm9\n"
3035           "addps %%xmm1, %%xmm8\n"
3036           "subps %%xmm1, %%xmm9\n"
3037           "movaps %%xmm2, %%xmm10\n"
3038           "movaps %%xmm2, %%xmm11\n"
3039           "addps %%xmm3, %%xmm10\n"
3040           "subps %%xmm3, %%xmm11\n"
3041           "movaps %%xmm4, %%xmm12\n"
3042           "movaps %%xmm4, %%xmm13\n"
3043           "addps %%xmm5, %%xmm12\n"
3044           "subps %%xmm5, %%xmm13\n"
3045           "movaps %%xmm6, %%xmm14\n"
3046           "movaps %%xmm6, %%xmm15\n"
3047           "addps %%xmm7, %%xmm14\n"
3048           "subps %%xmm7, %%xmm15\n"
3049           "movaps %%xmm8, %%xmm0\n"
3050           "movaps %%xmm8, %%xmm2\n"
3051           "addps %%xmm10, %%xmm0\n"
3052           "subps %%xmm10, %%xmm2\n"
3053           "movaps %%xmm9, %%xmm1\n"
3054           "movaps %%xmm9, %%xmm3\n"
3055           "addps %%xmm11, %%xmm1\n"
3056           "subps %%xmm11, %%xmm3\n"
3057           "movaps %%xmm12, %%xmm4\n"
3058           "movaps %%xmm12, %%xmm6\n"
3059           "addps %%xmm14, %%xmm4\n"
3060           "subps %%xmm14, %%xmm6\n"
3061           "movaps %%xmm13, %%xmm5\n"
3062           "movaps %%xmm13, %%xmm7\n"
3063           "addps %%xmm15, %%xmm5\n"
3064           "subps %%xmm15, %%xmm7\n"
3065           "movaps %%xmm0, %%xmm8\n"
3066           "movaps %%xmm0, %%xmm12\n"
3067           "addps %%xmm4, %%xmm8\n"
3068           "subps %%xmm4, %%xmm12\n"
3069           "movaps %%xmm1, %%xmm9\n"
3070           "movaps %%xmm1, %%xmm13\n"
3071           "addps %%xmm5, %%xmm9\n"
3072           "subps %%xmm5, %%xmm13\n"
3073           "movaps %%xmm2, %%xmm10\n"
3074           "movaps %%xmm2, %%xmm14\n"
3075           "addps %%xmm6, %%xmm10\n"
3076           "subps %%xmm6, %%xmm14\n"
3077           "movaps %%xmm3, %%xmm11\n"
3078           "movaps %%xmm3, %%xmm15\n"
3079           "addps %%xmm7, %%xmm11\n"
3080           "subps %%xmm7, %%xmm15\n"
3081           "movups %%xmm8, (%0)\n"
3082           "movups %%xmm9, (%1)\n"
3083           "movups %%xmm10, (%2)\n"
3084           "movups %%xmm11, (%3)\n"
3085           "movups %%xmm12, (%4)\n"
3086           "movups %%xmm13, (%5)\n"
3087           "movups %%xmm14, (%6)\n"
3088           "movups %%xmm15, (%7)\n"
3089           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
3090         );
3091       }
3092     }
3093     for (int j = 0; j < 2048; j += 256) {
3094       for (int k = 0; k < 32; k += 4) {
3095         __asm__ volatile (
3096           "movups (%0), %%xmm0\n"
3097           "movups (%1), %%xmm1\n"
3098           "movups (%2), %%xmm2\n"
3099           "movups (%3), %%xmm3\n"
3100           "movups (%4), %%xmm4\n"
3101           "movups (%5), %%xmm5\n"
3102           "movups (%6), %%xmm6\n"
3103           "movups (%7), %%xmm7\n"
3104           "movaps %%xmm0, %%xmm8\n"
3105           "movaps %%xmm0, %%xmm9\n"
3106           "addps %%xmm1, %%xmm8\n"
3107           "subps %%xmm1, %%xmm9\n"
3108           "movaps %%xmm2, %%xmm10\n"
3109           "movaps %%xmm2, %%xmm11\n"
3110           "addps %%xmm3, %%xmm10\n"
3111           "subps %%xmm3, %%xmm11\n"
3112           "movaps %%xmm4, %%xmm12\n"
3113           "movaps %%xmm4, %%xmm13\n"
3114           "addps %%xmm5, %%xmm12\n"
3115           "subps %%xmm5, %%xmm13\n"
3116           "movaps %%xmm6, %%xmm14\n"
3117           "movaps %%xmm6, %%xmm15\n"
3118           "addps %%xmm7, %%xmm14\n"
3119           "subps %%xmm7, %%xmm15\n"
3120           "movaps %%xmm8, %%xmm0\n"
3121           "movaps %%xmm8, %%xmm2\n"
3122           "addps %%xmm10, %%xmm0\n"
3123           "subps %%xmm10, %%xmm2\n"
3124           "movaps %%xmm9, %%xmm1\n"
3125           "movaps %%xmm9, %%xmm3\n"
3126           "addps %%xmm11, %%xmm1\n"
3127           "subps %%xmm11, %%xmm3\n"
3128           "movaps %%xmm12, %%xmm4\n"
3129           "movaps %%xmm12, %%xmm6\n"
3130           "addps %%xmm14, %%xmm4\n"
3131           "subps %%xmm14, %%xmm6\n"
3132           "movaps %%xmm13, %%xmm5\n"
3133           "movaps %%xmm13, %%xmm7\n"
3134           "addps %%xmm15, %%xmm5\n"
3135           "subps %%xmm15, %%xmm7\n"
3136           "movaps %%xmm0, %%xmm8\n"
3137           "movaps %%xmm0, %%xmm12\n"
3138           "addps %%xmm4, %%xmm8\n"
3139           "subps %%xmm4, %%xmm12\n"
3140           "movaps %%xmm1, %%xmm9\n"
3141           "movaps %%xmm1, %%xmm13\n"
3142           "addps %%xmm5, %%xmm9\n"
3143           "subps %%xmm5, %%xmm13\n"
3144           "movaps %%xmm2, %%xmm10\n"
3145           "movaps %%xmm2, %%xmm14\n"
3146           "addps %%xmm6, %%xmm10\n"
3147           "subps %%xmm6, %%xmm14\n"
3148           "movaps %%xmm3, %%xmm11\n"
3149           "movaps %%xmm3, %%xmm15\n"
3150           "addps %%xmm7, %%xmm11\n"
3151           "subps %%xmm7, %%xmm15\n"
3152           "movups %%xmm8, (%0)\n"
3153           "movups %%xmm9, (%1)\n"
3154           "movups %%xmm10, (%2)\n"
3155           "movups %%xmm11, (%3)\n"
3156           "movups %%xmm12, (%4)\n"
3157           "movups %%xmm13, (%5)\n"
3158           "movups %%xmm14, (%6)\n"
3159           "movups %%xmm15, (%7)\n"
3160           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
3161         );
3162       }
3163     }
3164     for (int j = 0; j < 2048; j += 2048) {
3165       for (int k = 0; k < 256; k += 4) {
3166         __asm__ volatile (
3167           "movups (%0), %%xmm0\n"
3168           "movups (%1), %%xmm1\n"
3169           "movups (%2), %%xmm2\n"
3170           "movups (%3), %%xmm3\n"
3171           "movups (%4), %%xmm4\n"
3172           "movups (%5), %%xmm5\n"
3173           "movups (%6), %%xmm6\n"
3174           "movups (%7), %%xmm7\n"
3175           "movaps %%xmm0, %%xmm8\n"
3176           "movaps %%xmm0, %%xmm9\n"
3177           "addps %%xmm1, %%xmm8\n"
3178           "subps %%xmm1, %%xmm9\n"
3179           "movaps %%xmm2, %%xmm10\n"
3180           "movaps %%xmm2, %%xmm11\n"
3181           "addps %%xmm3, %%xmm10\n"
3182           "subps %%xmm3, %%xmm11\n"
3183           "movaps %%xmm4, %%xmm12\n"
3184           "movaps %%xmm4, %%xmm13\n"
3185           "addps %%xmm5, %%xmm12\n"
3186           "subps %%xmm5, %%xmm13\n"
3187           "movaps %%xmm6, %%xmm14\n"
3188           "movaps %%xmm6, %%xmm15\n"
3189           "addps %%xmm7, %%xmm14\n"
3190           "subps %%xmm7, %%xmm15\n"
3191           "movaps %%xmm8, %%xmm0\n"
3192           "movaps %%xmm8, %%xmm2\n"
3193           "addps %%xmm10, %%xmm0\n"
3194           "subps %%xmm10, %%xmm2\n"
3195           "movaps %%xmm9, %%xmm1\n"
3196           "movaps %%xmm9, %%xmm3\n"
3197           "addps %%xmm11, %%xmm1\n"
3198           "subps %%xmm11, %%xmm3\n"
3199           "movaps %%xmm12, %%xmm4\n"
3200           "movaps %%xmm12, %%xmm6\n"
3201           "addps %%xmm14, %%xmm4\n"
3202           "subps %%xmm14, %%xmm6\n"
3203           "movaps %%xmm13, %%xmm5\n"
3204           "movaps %%xmm13, %%xmm7\n"
3205           "addps %%xmm15, %%xmm5\n"
3206           "subps %%xmm15, %%xmm7\n"
3207           "movaps %%xmm0, %%xmm8\n"
3208           "movaps %%xmm0, %%xmm12\n"
3209           "addps %%xmm4, %%xmm8\n"
3210           "subps %%xmm4, %%xmm12\n"
3211           "movaps %%xmm1, %%xmm9\n"
3212           "movaps %%xmm1, %%xmm13\n"
3213           "addps %%xmm5, %%xmm9\n"
3214           "subps %%xmm5, %%xmm13\n"
3215           "movaps %%xmm2, %%xmm10\n"
3216           "movaps %%xmm2, %%xmm14\n"
3217           "addps %%xmm6, %%xmm10\n"
3218           "subps %%xmm6, %%xmm14\n"
3219           "movaps %%xmm3, %%xmm11\n"
3220           "movaps %%xmm3, %%xmm15\n"
3221           "addps %%xmm7, %%xmm11\n"
3222           "subps %%xmm7, %%xmm15\n"
3223           "movups %%xmm8, (%0)\n"
3224           "movups %%xmm9, (%1)\n"
3225           "movups %%xmm10, (%2)\n"
3226           "movups %%xmm11, (%3)\n"
3227           "movups %%xmm12, (%4)\n"
3228           "movups %%xmm13, (%5)\n"
3229           "movups %%xmm14, (%6)\n"
3230           "movups %%xmm15, (%7)\n"
3231           :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
3232         );
3233       }
3234     }
3235     return;
3236   }
3237   if (depth == 14) {
3238     helper_float_14_recursive(buf + 0, 11);
3239     helper_float_14_recursive(buf + 2048, 11);
3240     helper_float_14_recursive(buf + 4096, 11);
3241     helper_float_14_recursive(buf + 6144, 11);
3242     helper_float_14_recursive(buf + 8192, 11);
3243     helper_float_14_recursive(buf + 10240, 11);
3244     helper_float_14_recursive(buf + 12288, 11);
3245     helper_float_14_recursive(buf + 14336, 11);
3246     for (int j = 0; j < 16384; j += 16384) {
3247       for (int k = 0; k < 2048; k += 4) {
3248         __asm__ volatile (
3249           "movups (%0), %%xmm0\n"
3250           "movups (%1), %%xmm1\n"
3251           "movups (%2), %%xmm2\n"
3252           "movups (%3), %%xmm3\n"
3253           "movups (%4), %%xmm4\n"
3254           "movups (%5), %%xmm5\n"
3255           "movups (%6), %%xmm6\n"
3256           "movups (%7), %%xmm7\n"
3257           "movaps %%xmm0, %%xmm8\n"
3258           "movaps %%xmm0, %%xmm9\n"
3259           "addps %%xmm1, %%xmm8\n"
3260           "subps %%xmm1, %%xmm9\n"
3261           "movaps %%xmm2, %%xmm10\n"
3262           "movaps %%xmm2, %%xmm11\n"
3263           "addps %%xmm3, %%xmm10\n"
3264           "subps %%xmm3, %%xmm11\n"
3265           "movaps %%xmm4, %%xmm12\n"
3266           "movaps %%xmm4, %%xmm13\n"
3267           "addps %%xmm5, %%xmm12\n"
3268           "subps %%xmm5, %%xmm13\n"
3269           "movaps %%xmm6, %%xmm14\n"
3270           "movaps %%xmm6, %%xmm15\n"
3271           "addps %%xmm7, %%xmm14\n"
3272           "subps %%xmm7, %%xmm15\n"
3273           "movaps %%xmm8, %%xmm0\n"
3274           "movaps %%xmm8, %%xmm2\n"
3275           "addps %%xmm10, %%xmm0\n"
3276           "subps %%xmm10, %%xmm2\n"
3277           "movaps %%xmm9, %%xmm1\n"
3278           "movaps %%xmm9, %%xmm3\n"
3279           "addps %%xmm11, %%xmm1\n"
3280           "subps %%xmm11, %%xmm3\n"
3281           "movaps %%xmm12, %%xmm4\n"
3282           "movaps %%xmm12, %%xmm6\n"
3283           "addps %%xmm14, %%xmm4\n"
3284           "subps %%xmm14, %%xmm6\n"
3285           "movaps %%xmm13, %%xmm5\n"
3286           "movaps %%xmm13, %%xmm7\n"
3287           "addps %%xmm15, %%xmm5\n"
3288           "subps %%xmm15, %%xmm7\n"
3289           "movaps %%xmm0, %%xmm8\n"
3290           "movaps %%xmm0, %%xmm12\n"
3291           "addps %%xmm4, %%xmm8\n"
3292           "subps %%xmm4, %%xmm12\n"
3293           "movaps %%xmm1, %%xmm9\n"
3294           "movaps %%xmm1, %%xmm13\n"
3295           "addps %%xmm5, %%xmm9\n"
3296           "subps %%xmm5, %%xmm13\n"
3297           "movaps %%xmm2, %%xmm10\n"
3298           "movaps %%xmm2, %%xmm14\n"
3299           "addps %%xmm6, %%xmm10\n"
3300           "subps %%xmm6, %%xmm14\n"
3301           "movaps %%xmm3, %%xmm11\n"
3302           "movaps %%xmm3, %%xmm15\n"
3303           "addps %%xmm7, %%xmm11\n"
3304           "subps %%xmm7, %%xmm15\n"
3305           "movups %%xmm8, (%0)\n"
3306           "movups %%xmm9, (%1)\n"
3307           "movups %%xmm10, (%2)\n"
3308           "movups %%xmm11, (%3)\n"
3309           "movups %%xmm12, (%4)\n"
3310           "movups %%xmm13, (%5)\n"
3311           "movups %%xmm14, (%6)\n"
3312           "movups %%xmm15, (%7)\n"
3313           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
3314         );
3315       }
3316     }
3317     return;
3318   }
3319 }
3320 void helper_float_14(float *buf);
/* Public entry point: run the full-depth (2^14-element) Hadamard
 * transform over `buf` by delegating to the recursive helper. */
void helper_float_14(float *buf) {
  enum { LOG2_N = 14 }; /* buf holds 1 << LOG2_N floats */
  helper_float_14_recursive(buf, LOG2_N);
}
3324 void helper_float_15_recursive(float *buf, int depth);
/*
 * Recursive SSE helper for the 2^15-point fast Hadamard transform.
 *
 * `buf` must hold the sub-block being transformed; `depth` selects which
 * stage runs (this generated function only handles depth == 13 and
 * depth == 15; other values fall through and do nothing).
 *
 * depth == 13: fully transforms each 8192-float region in four passes,
 *   each pass an in-register add/sub butterfly network over 8 (or 4)
 *   SSE vectors at increasing strides.
 * depth == 15: recurses on four 8192-float quarters, then merges them
 *   with a final stride-8192 butterfly pass.
 *
 * NOTE(review): the shufps/addsubps prologue of the first pass appears to
 * implement the stride-1 and stride-2 butterflies within each 4-lane
 * vector — inferred from the immediates ($160/$245 pair-duplication,
 * $68/$14/$224 half-swaps); confirm against the FFHT generator if exact
 * lane semantics matter.
 */
void helper_float_15_recursive(float *buf, int depth) {
  if (depth == 13) {
    /* Pass 1: butterflies at strides 1, 2 (intra-vector, via shuffles)
     * and 4, 8, 16 (across the eight vectors loaded per iteration).
     * The inner k-loop is a generator artifact and runs exactly once. */
    for (int j = 0; j < 8192; j += 32) {
      for (int k = 0; k < 4; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          /* Stride-1 butterfly inside each vector: duplicate even/odd
           * lanes, negate, addsubps -> (a+b, a-b) per adjacent pair. */
          "movaps %%xmm0, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm0, %%xmm0\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm0, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm1, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm1, %%xmm1\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm1, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm1\n"
          "movaps %%xmm2, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm2, %%xmm2\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm2, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm2\n"
          "movaps %%xmm3, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm3, %%xmm3\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm3, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm3\n"
          "movaps %%xmm4, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm4, %%xmm4\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm4, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm4\n"
          "movaps %%xmm5, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm5, %%xmm5\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm5, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm5\n"
          "movaps %%xmm6, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm6, %%xmm6\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm6, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm6\n"
          "movaps %%xmm7, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm7, %%xmm7\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm7, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm7\n"
          /* Stride-2 butterfly inside each vector (low/high half mix). */
          "movaps %%xmm0, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm0, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm0, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm0\n"
          "movaps %%xmm1, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm1, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm1, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm1\n"
          "movaps %%xmm2, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm2\n"
          "movaps %%xmm3, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm3, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm3, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm3\n"
          "movaps %%xmm4, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm4, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm4, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm4\n"
          "movaps %%xmm5, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm5, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm5, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm5\n"
          "movaps %%xmm6, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm6, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm6, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm6\n"
          "movaps %%xmm7, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm7, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm7, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm7\n"
          /* 8-way add/sub network: three butterfly levels across the
           * eight vectors (strides 4, 8, 16 in float units). */
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 2: butterflies at strides 32, 64, 128 (eight pointers
     * 32 floats apart), completing 256-element sub-transforms. */
    for (int j = 0; j < 8192; j += 256) {
      for (int k = 0; k < 32; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 3: same 8-way network at strides 256, 512, 1024,
     * completing 2048-element sub-transforms. */
    for (int j = 0; j < 8192; j += 2048) {
      for (int k = 0; k < 256; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 4: final 4-way network at strides 2048 and 4096, finishing
     * the 8192-element transform (only two butterfly levels needed). */
    for (int j = 0; j < 8192; j += 8192) {
      for (int k = 0; k < 2048; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movups %%xmm0, (%0)\n"
          "movups %%xmm1, (%1)\n"
          "movups %%xmm2, (%2)\n"
          "movups %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 15) {
    /* Transform each 8192-float quarter independently... */
    helper_float_15_recursive(buf + 0, 13);
    helper_float_15_recursive(buf + 8192, 13);
    helper_float_15_recursive(buf + 16384, 13);
    helper_float_15_recursive(buf + 24576, 13);
    /* ...then merge with a 4-way butterfly at strides 8192 and 16384. */
    for (int j = 0; j < 32768; j += 32768) {
      for (int k = 0; k < 8192; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movups %%xmm0, (%0)\n"
          "movups %%xmm1, (%1)\n"
          "movups %%xmm2, (%2)\n"
          "movups %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
3748 void helper_float_15(float *buf);
/* Public entry point: run the full-depth (2^15-element) Hadamard
 * transform over `buf` by delegating to the recursive helper. */
void helper_float_15(float *buf) {
  enum { LOG2_N = 15 }; /* buf holds 1 << LOG2_N floats */
  helper_float_15_recursive(buf, LOG2_N);
}
void helper_float_16_recursive(float *buf, int depth);
/*
 * One level of the recursive, unnormalized Walsh-Hadamard-style transform
 * over a buffer of 2^16 floats (generated code; every butterfly is the
 * plain (u + v, u - v) pair, matching helper_float_1 at the top of the
 * file -- no scaling is applied).
 *
 * The recursion is unrolled into exactly three handled depths:
 *   depth == 11 : base case, transforms each aligned 2048-float span
 *                 in-place (11 butterfly stages total).
 *   depth == 14 : 8 recursive depth-11 calls on consecutive 2048-float
 *                 sub-buffers, then 3 cross-combining stages.
 *   depth == 16 : 4 recursive depth-14 calls on consecutive 16384-float
 *                 sub-buffers, then 2 cross-combining stages.
 * Any other depth value falls through all three "if"s and leaves buf
 * untouched -- callers (helper_float_16) only ever pass 16 here.
 *
 * NOTE(review): despite the file name (fht_sse.c), the asm uses addsubps
 * (an SSE3 instruction) and registers %xmm8-%xmm15 (x86-64 only) -- this
 * translation unit presumably targets x86-64 with SSE3; confirm against
 * the build flags. Loads/stores use movups, so buf need not be 16-byte
 * aligned. Each asm block declares all 16 xmm registers plus "memory"
 * as clobbered, which is what makes the in-place updates through the
 * pointer operands safe without output constraints.
 */
void helper_float_16_recursive(float *buf, int depth) {
  if (depth == 11) {
    /* Pass 1: stages 1-5. Each iteration handles one aligned block of 32
     * floats as eight 4-float vectors. The first two stages are done
     * inside each vector with shuffles; the remaining three are plain
     * vector add/sub butterflies across the eight registers. */
    for (int j = 0; j < 2048; j += 32) {
      for (int k = 0; k < 4; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          /* Stage 1 (stride-1 butterflies within each vector): for each
           * register, build (a0,a0,a2,a2) and -(a1,a1,a3,a3) and combine
           * with addsubps to get (a0+a1, a0-a1, a2+a3, a2-a3). */
          "movaps %%xmm0, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm0, %%xmm0\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm0, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm1, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm1, %%xmm1\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm1, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm1\n"
          "movaps %%xmm2, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm2, %%xmm2\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm2, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm2\n"
          "movaps %%xmm3, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm3, %%xmm3\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm3, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm3\n"
          "movaps %%xmm4, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm4, %%xmm4\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm4, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm4\n"
          "movaps %%xmm5, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm5, %%xmm5\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm5, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm5\n"
          "movaps %%xmm6, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm6, %%xmm6\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm6, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm6\n"
          "movaps %%xmm7, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm7, %%xmm7\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm7, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm7\n"
          /* Stage 2 (stride-2 butterflies within each vector): shuffle
           * the low/high halves against zero and add/sub to combine
           * lanes (0,1) with lanes (2,3). */
          "movaps %%xmm0, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm0, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm0, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm0\n"
          "movaps %%xmm1, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm1, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm1, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm1\n"
          "movaps %%xmm2, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm2\n"
          "movaps %%xmm3, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm3, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm3, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm3\n"
          "movaps %%xmm4, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm4, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm4, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm4\n"
          "movaps %%xmm5, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm5, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm5, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm5\n"
          "movaps %%xmm6, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm6, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm6, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm6\n"
          "movaps %%xmm7, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm7, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm7, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm7\n"
          /* Stages 3-5: radix-8 combine across the eight vectors
           * (xmm0..xmm7) with whole-vector add/sub butterflies. */
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 2: stages 6-8. Radix-8 combine of eight vectors spaced 32
     * floats apart within each 256-float block. */
    for (int j = 0; j < 2048; j += 256) {
      for (int k = 0; k < 32; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 3: stages 9-11. Radix-8 combine of eight vectors spaced 256
     * floats apart, covering the whole 2048-float span. */
    for (int j = 0; j < 2048; j += 2048) {
      for (int k = 0; k < 256; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 14) {
    /* Transform each 2048-float eighth independently, then run 3 more
     * radix-8 combining stages across them (11 + 3 = 14 stages). */
    helper_float_16_recursive(buf + 0, 11);
    helper_float_16_recursive(buf + 2048, 11);
    helper_float_16_recursive(buf + 4096, 11);
    helper_float_16_recursive(buf + 6144, 11);
    helper_float_16_recursive(buf + 8192, 11);
    helper_float_16_recursive(buf + 10240, 11);
    helper_float_16_recursive(buf + 12288, 11);
    helper_float_16_recursive(buf + 14336, 11);
    for (int j = 0; j < 16384; j += 16384) {
      for (int k = 0; k < 2048; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 16) {
    /* Transform each 16384-float quarter independently, then run 2 more
     * radix-4 combining stages across them (14 + 2 = 16 stages). */
    helper_float_16_recursive(buf + 0, 14);
    helper_float_16_recursive(buf + 16384, 14);
    helper_float_16_recursive(buf + 32768, 14);
    helper_float_16_recursive(buf + 49152, 14);
    for (int j = 0; j < 65536; j += 65536) {
      for (int k = 0; k < 16384; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movups %%xmm0, (%0)\n"
          "movups %%xmm1, (%1)\n"
          "movups %%xmm2, (%2)\n"
          "movups %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_float_16(float *buf);
/* Full transform over a buffer of 2^16 floats: hand the work to the
 * recursive kernel, starting at the top-level depth of 16. */
void helper_float_16(float *buf) {
  const int top_depth = 16;
  helper_float_16_recursive(buf, top_depth);
}
4231 void helper_float_17_recursive(float *buf, int depth);
helper_float_17_recursive(float * buf,int depth)4232 void helper_float_17_recursive(float *buf, int depth) {
4233   if (depth == 11) {
4234     for (int j = 0; j < 2048; j += 32) {
4235       for (int k = 0; k < 4; k += 4) {
4236         __asm__ volatile (
4237           "movups (%0), %%xmm0\n"
4238           "movups (%1), %%xmm1\n"
4239           "movups (%2), %%xmm2\n"
4240           "movups (%3), %%xmm3\n"
4241           "movups (%4), %%xmm4\n"
4242           "movups (%5), %%xmm5\n"
4243           "movups (%6), %%xmm6\n"
4244           "movups (%7), %%xmm7\n"
4245           "movaps %%xmm0, %%xmm8\n"
4246           "shufps $160, %%xmm8, %%xmm8\n"
4247           "shufps $245, %%xmm0, %%xmm0\n"
4248           "xorps %%xmm9, %%xmm9\n"
4249           "subps %%xmm0, %%xmm9\n"
4250           "addsubps %%xmm9, %%xmm8\n"
4251           "movaps %%xmm8, %%xmm0\n"
4252           "movaps %%xmm1, %%xmm8\n"
4253           "shufps $160, %%xmm8, %%xmm8\n"
4254           "shufps $245, %%xmm1, %%xmm1\n"
4255           "xorps %%xmm9, %%xmm9\n"
4256           "subps %%xmm1, %%xmm9\n"
4257           "addsubps %%xmm9, %%xmm8\n"
4258           "movaps %%xmm8, %%xmm1\n"
4259           "movaps %%xmm2, %%xmm8\n"
4260           "shufps $160, %%xmm8, %%xmm8\n"
4261           "shufps $245, %%xmm2, %%xmm2\n"
4262           "xorps %%xmm9, %%xmm9\n"
4263           "subps %%xmm2, %%xmm9\n"
4264           "addsubps %%xmm9, %%xmm8\n"
4265           "movaps %%xmm8, %%xmm2\n"
4266           "movaps %%xmm3, %%xmm8\n"
4267           "shufps $160, %%xmm8, %%xmm8\n"
4268           "shufps $245, %%xmm3, %%xmm3\n"
4269           "xorps %%xmm9, %%xmm9\n"
4270           "subps %%xmm3, %%xmm9\n"
4271           "addsubps %%xmm9, %%xmm8\n"
4272           "movaps %%xmm8, %%xmm3\n"
4273           "movaps %%xmm4, %%xmm8\n"
4274           "shufps $160, %%xmm8, %%xmm8\n"
4275           "shufps $245, %%xmm4, %%xmm4\n"
4276           "xorps %%xmm9, %%xmm9\n"
4277           "subps %%xmm4, %%xmm9\n"
4278           "addsubps %%xmm9, %%xmm8\n"
4279           "movaps %%xmm8, %%xmm4\n"
4280           "movaps %%xmm5, %%xmm8\n"
4281           "shufps $160, %%xmm8, %%xmm8\n"
4282           "shufps $245, %%xmm5, %%xmm5\n"
4283           "xorps %%xmm9, %%xmm9\n"
4284           "subps %%xmm5, %%xmm9\n"
4285           "addsubps %%xmm9, %%xmm8\n"
4286           "movaps %%xmm8, %%xmm5\n"
4287           "movaps %%xmm6, %%xmm8\n"
4288           "shufps $160, %%xmm8, %%xmm8\n"
4289           "shufps $245, %%xmm6, %%xmm6\n"
4290           "xorps %%xmm9, %%xmm9\n"
4291           "subps %%xmm6, %%xmm9\n"
4292           "addsubps %%xmm9, %%xmm8\n"
4293           "movaps %%xmm8, %%xmm6\n"
4294           "movaps %%xmm7, %%xmm8\n"
4295           "shufps $160, %%xmm8, %%xmm8\n"
4296           "shufps $245, %%xmm7, %%xmm7\n"
4297           "xorps %%xmm9, %%xmm9\n"
4298           "subps %%xmm7, %%xmm9\n"
4299           "addsubps %%xmm9, %%xmm8\n"
4300           "movaps %%xmm8, %%xmm7\n"
4301           "movaps %%xmm0, %%xmm8\n"
4302           "shufps $68, %%xmm8, %%xmm8\n"
4303           "xorps %%xmm9, %%xmm9\n"
4304           "movaps %%xmm0, %%xmm10\n"
4305           "shufps $14, %%xmm9, %%xmm10\n"
4306           "movaps %%xmm0, %%xmm11\n"
4307           "shufps $224, %%xmm11, %%xmm9\n"
4308           "addps %%xmm8, %%xmm10\n"
4309           "subps %%xmm9, %%xmm10\n"
4310           "movaps %%xmm10, %%xmm0\n"
4311           "movaps %%xmm1, %%xmm8\n"
4312           "shufps $68, %%xmm8, %%xmm8\n"
4313           "xorps %%xmm9, %%xmm9\n"
4314           "movaps %%xmm1, %%xmm10\n"
4315           "shufps $14, %%xmm9, %%xmm10\n"
4316           "movaps %%xmm1, %%xmm11\n"
4317           "shufps $224, %%xmm11, %%xmm9\n"
4318           "addps %%xmm8, %%xmm10\n"
4319           "subps %%xmm9, %%xmm10\n"
4320           "movaps %%xmm10, %%xmm1\n"
4321           "movaps %%xmm2, %%xmm8\n"
4322           "shufps $68, %%xmm8, %%xmm8\n"
4323           "xorps %%xmm9, %%xmm9\n"
4324           "movaps %%xmm2, %%xmm10\n"
4325           "shufps $14, %%xmm9, %%xmm10\n"
4326           "movaps %%xmm2, %%xmm11\n"
4327           "shufps $224, %%xmm11, %%xmm9\n"
4328           "addps %%xmm8, %%xmm10\n"
4329           "subps %%xmm9, %%xmm10\n"
4330           "movaps %%xmm10, %%xmm2\n"
4331           "movaps %%xmm3, %%xmm8\n"
4332           "shufps $68, %%xmm8, %%xmm8\n"
4333           "xorps %%xmm9, %%xmm9\n"
4334           "movaps %%xmm3, %%xmm10\n"
4335           "shufps $14, %%xmm9, %%xmm10\n"
4336           "movaps %%xmm3, %%xmm11\n"
4337           "shufps $224, %%xmm11, %%xmm9\n"
4338           "addps %%xmm8, %%xmm10\n"
4339           "subps %%xmm9, %%xmm10\n"
4340           "movaps %%xmm10, %%xmm3\n"
4341           "movaps %%xmm4, %%xmm8\n"
4342           "shufps $68, %%xmm8, %%xmm8\n"
4343           "xorps %%xmm9, %%xmm9\n"
4344           "movaps %%xmm4, %%xmm10\n"
4345           "shufps $14, %%xmm9, %%xmm10\n"
4346           "movaps %%xmm4, %%xmm11\n"
4347           "shufps $224, %%xmm11, %%xmm9\n"
4348           "addps %%xmm8, %%xmm10\n"
4349           "subps %%xmm9, %%xmm10\n"
4350           "movaps %%xmm10, %%xmm4\n"
4351           "movaps %%xmm5, %%xmm8\n"
4352           "shufps $68, %%xmm8, %%xmm8\n"
4353           "xorps %%xmm9, %%xmm9\n"
4354           "movaps %%xmm5, %%xmm10\n"
4355           "shufps $14, %%xmm9, %%xmm10\n"
4356           "movaps %%xmm5, %%xmm11\n"
4357           "shufps $224, %%xmm11, %%xmm9\n"
4358           "addps %%xmm8, %%xmm10\n"
4359           "subps %%xmm9, %%xmm10\n"
4360           "movaps %%xmm10, %%xmm5\n"
4361           "movaps %%xmm6, %%xmm8\n"
4362           "shufps $68, %%xmm8, %%xmm8\n"
4363           "xorps %%xmm9, %%xmm9\n"
4364           "movaps %%xmm6, %%xmm10\n"
4365           "shufps $14, %%xmm9, %%xmm10\n"
4366           "movaps %%xmm6, %%xmm11\n"
4367           "shufps $224, %%xmm11, %%xmm9\n"
4368           "addps %%xmm8, %%xmm10\n"
4369           "subps %%xmm9, %%xmm10\n"
4370           "movaps %%xmm10, %%xmm6\n"
4371           "movaps %%xmm7, %%xmm8\n"
4372           "shufps $68, %%xmm8, %%xmm8\n"
4373           "xorps %%xmm9, %%xmm9\n"
4374           "movaps %%xmm7, %%xmm10\n"
4375           "shufps $14, %%xmm9, %%xmm10\n"
4376           "movaps %%xmm7, %%xmm11\n"
4377           "shufps $224, %%xmm11, %%xmm9\n"
4378           "addps %%xmm8, %%xmm10\n"
4379           "subps %%xmm9, %%xmm10\n"
4380           "movaps %%xmm10, %%xmm7\n"
4381           "movaps %%xmm0, %%xmm8\n"
4382           "movaps %%xmm0, %%xmm9\n"
4383           "addps %%xmm1, %%xmm8\n"
4384           "subps %%xmm1, %%xmm9\n"
4385           "movaps %%xmm2, %%xmm10\n"
4386           "movaps %%xmm2, %%xmm11\n"
4387           "addps %%xmm3, %%xmm10\n"
4388           "subps %%xmm3, %%xmm11\n"
4389           "movaps %%xmm4, %%xmm12\n"
4390           "movaps %%xmm4, %%xmm13\n"
4391           "addps %%xmm5, %%xmm12\n"
4392           "subps %%xmm5, %%xmm13\n"
4393           "movaps %%xmm6, %%xmm14\n"
4394           "movaps %%xmm6, %%xmm15\n"
4395           "addps %%xmm7, %%xmm14\n"
4396           "subps %%xmm7, %%xmm15\n"
4397           "movaps %%xmm8, %%xmm0\n"
4398           "movaps %%xmm8, %%xmm2\n"
4399           "addps %%xmm10, %%xmm0\n"
4400           "subps %%xmm10, %%xmm2\n"
4401           "movaps %%xmm9, %%xmm1\n"
4402           "movaps %%xmm9, %%xmm3\n"
4403           "addps %%xmm11, %%xmm1\n"
4404           "subps %%xmm11, %%xmm3\n"
4405           "movaps %%xmm12, %%xmm4\n"
4406           "movaps %%xmm12, %%xmm6\n"
4407           "addps %%xmm14, %%xmm4\n"
4408           "subps %%xmm14, %%xmm6\n"
4409           "movaps %%xmm13, %%xmm5\n"
4410           "movaps %%xmm13, %%xmm7\n"
4411           "addps %%xmm15, %%xmm5\n"
4412           "subps %%xmm15, %%xmm7\n"
4413           "movaps %%xmm0, %%xmm8\n"
4414           "movaps %%xmm0, %%xmm12\n"
4415           "addps %%xmm4, %%xmm8\n"
4416           "subps %%xmm4, %%xmm12\n"
4417           "movaps %%xmm1, %%xmm9\n"
4418           "movaps %%xmm1, %%xmm13\n"
4419           "addps %%xmm5, %%xmm9\n"
4420           "subps %%xmm5, %%xmm13\n"
4421           "movaps %%xmm2, %%xmm10\n"
4422           "movaps %%xmm2, %%xmm14\n"
4423           "addps %%xmm6, %%xmm10\n"
4424           "subps %%xmm6, %%xmm14\n"
4425           "movaps %%xmm3, %%xmm11\n"
4426           "movaps %%xmm3, %%xmm15\n"
4427           "addps %%xmm7, %%xmm11\n"
4428           "subps %%xmm7, %%xmm15\n"
4429           "movups %%xmm8, (%0)\n"
4430           "movups %%xmm9, (%1)\n"
4431           "movups %%xmm10, (%2)\n"
4432           "movups %%xmm11, (%3)\n"
4433           "movups %%xmm12, (%4)\n"
4434           "movups %%xmm13, (%5)\n"
4435           "movups %%xmm14, (%6)\n"
4436           "movups %%xmm15, (%7)\n"
4437           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
4438         );
4439       }
4440     }
4441     for (int j = 0; j < 2048; j += 256) {
4442       for (int k = 0; k < 32; k += 4) {
4443         __asm__ volatile (
4444           "movups (%0), %%xmm0\n"
4445           "movups (%1), %%xmm1\n"
4446           "movups (%2), %%xmm2\n"
4447           "movups (%3), %%xmm3\n"
4448           "movups (%4), %%xmm4\n"
4449           "movups (%5), %%xmm5\n"
4450           "movups (%6), %%xmm6\n"
4451           "movups (%7), %%xmm7\n"
4452           "movaps %%xmm0, %%xmm8\n"
4453           "movaps %%xmm0, %%xmm9\n"
4454           "addps %%xmm1, %%xmm8\n"
4455           "subps %%xmm1, %%xmm9\n"
4456           "movaps %%xmm2, %%xmm10\n"
4457           "movaps %%xmm2, %%xmm11\n"
4458           "addps %%xmm3, %%xmm10\n"
4459           "subps %%xmm3, %%xmm11\n"
4460           "movaps %%xmm4, %%xmm12\n"
4461           "movaps %%xmm4, %%xmm13\n"
4462           "addps %%xmm5, %%xmm12\n"
4463           "subps %%xmm5, %%xmm13\n"
4464           "movaps %%xmm6, %%xmm14\n"
4465           "movaps %%xmm6, %%xmm15\n"
4466           "addps %%xmm7, %%xmm14\n"
4467           "subps %%xmm7, %%xmm15\n"
4468           "movaps %%xmm8, %%xmm0\n"
4469           "movaps %%xmm8, %%xmm2\n"
4470           "addps %%xmm10, %%xmm0\n"
4471           "subps %%xmm10, %%xmm2\n"
4472           "movaps %%xmm9, %%xmm1\n"
4473           "movaps %%xmm9, %%xmm3\n"
4474           "addps %%xmm11, %%xmm1\n"
4475           "subps %%xmm11, %%xmm3\n"
4476           "movaps %%xmm12, %%xmm4\n"
4477           "movaps %%xmm12, %%xmm6\n"
4478           "addps %%xmm14, %%xmm4\n"
4479           "subps %%xmm14, %%xmm6\n"
4480           "movaps %%xmm13, %%xmm5\n"
4481           "movaps %%xmm13, %%xmm7\n"
4482           "addps %%xmm15, %%xmm5\n"
4483           "subps %%xmm15, %%xmm7\n"
4484           "movaps %%xmm0, %%xmm8\n"
4485           "movaps %%xmm0, %%xmm12\n"
4486           "addps %%xmm4, %%xmm8\n"
4487           "subps %%xmm4, %%xmm12\n"
4488           "movaps %%xmm1, %%xmm9\n"
4489           "movaps %%xmm1, %%xmm13\n"
4490           "addps %%xmm5, %%xmm9\n"
4491           "subps %%xmm5, %%xmm13\n"
4492           "movaps %%xmm2, %%xmm10\n"
4493           "movaps %%xmm2, %%xmm14\n"
4494           "addps %%xmm6, %%xmm10\n"
4495           "subps %%xmm6, %%xmm14\n"
4496           "movaps %%xmm3, %%xmm11\n"
4497           "movaps %%xmm3, %%xmm15\n"
4498           "addps %%xmm7, %%xmm11\n"
4499           "subps %%xmm7, %%xmm15\n"
4500           "movups %%xmm8, (%0)\n"
4501           "movups %%xmm9, (%1)\n"
4502           "movups %%xmm10, (%2)\n"
4503           "movups %%xmm11, (%3)\n"
4504           "movups %%xmm12, (%4)\n"
4505           "movups %%xmm13, (%5)\n"
4506           "movups %%xmm14, (%6)\n"
4507           "movups %%xmm15, (%7)\n"
4508           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
4509         );
4510       }
4511     }
4512     for (int j = 0; j < 2048; j += 2048) {
4513       for (int k = 0; k < 256; k += 4) {
4514         __asm__ volatile (
4515           "movups (%0), %%xmm0\n"
4516           "movups (%1), %%xmm1\n"
4517           "movups (%2), %%xmm2\n"
4518           "movups (%3), %%xmm3\n"
4519           "movups (%4), %%xmm4\n"
4520           "movups (%5), %%xmm5\n"
4521           "movups (%6), %%xmm6\n"
4522           "movups (%7), %%xmm7\n"
4523           "movaps %%xmm0, %%xmm8\n"
4524           "movaps %%xmm0, %%xmm9\n"
4525           "addps %%xmm1, %%xmm8\n"
4526           "subps %%xmm1, %%xmm9\n"
4527           "movaps %%xmm2, %%xmm10\n"
4528           "movaps %%xmm2, %%xmm11\n"
4529           "addps %%xmm3, %%xmm10\n"
4530           "subps %%xmm3, %%xmm11\n"
4531           "movaps %%xmm4, %%xmm12\n"
4532           "movaps %%xmm4, %%xmm13\n"
4533           "addps %%xmm5, %%xmm12\n"
4534           "subps %%xmm5, %%xmm13\n"
4535           "movaps %%xmm6, %%xmm14\n"
4536           "movaps %%xmm6, %%xmm15\n"
4537           "addps %%xmm7, %%xmm14\n"
4538           "subps %%xmm7, %%xmm15\n"
4539           "movaps %%xmm8, %%xmm0\n"
4540           "movaps %%xmm8, %%xmm2\n"
4541           "addps %%xmm10, %%xmm0\n"
4542           "subps %%xmm10, %%xmm2\n"
4543           "movaps %%xmm9, %%xmm1\n"
4544           "movaps %%xmm9, %%xmm3\n"
4545           "addps %%xmm11, %%xmm1\n"
4546           "subps %%xmm11, %%xmm3\n"
4547           "movaps %%xmm12, %%xmm4\n"
4548           "movaps %%xmm12, %%xmm6\n"
4549           "addps %%xmm14, %%xmm4\n"
4550           "subps %%xmm14, %%xmm6\n"
4551           "movaps %%xmm13, %%xmm5\n"
4552           "movaps %%xmm13, %%xmm7\n"
4553           "addps %%xmm15, %%xmm5\n"
4554           "subps %%xmm15, %%xmm7\n"
4555           "movaps %%xmm0, %%xmm8\n"
4556           "movaps %%xmm0, %%xmm12\n"
4557           "addps %%xmm4, %%xmm8\n"
4558           "subps %%xmm4, %%xmm12\n"
4559           "movaps %%xmm1, %%xmm9\n"
4560           "movaps %%xmm1, %%xmm13\n"
4561           "addps %%xmm5, %%xmm9\n"
4562           "subps %%xmm5, %%xmm13\n"
4563           "movaps %%xmm2, %%xmm10\n"
4564           "movaps %%xmm2, %%xmm14\n"
4565           "addps %%xmm6, %%xmm10\n"
4566           "subps %%xmm6, %%xmm14\n"
4567           "movaps %%xmm3, %%xmm11\n"
4568           "movaps %%xmm3, %%xmm15\n"
4569           "addps %%xmm7, %%xmm11\n"
4570           "subps %%xmm7, %%xmm15\n"
4571           "movups %%xmm8, (%0)\n"
4572           "movups %%xmm9, (%1)\n"
4573           "movups %%xmm10, (%2)\n"
4574           "movups %%xmm11, (%3)\n"
4575           "movups %%xmm12, (%4)\n"
4576           "movups %%xmm13, (%5)\n"
4577           "movups %%xmm14, (%6)\n"
4578           "movups %%xmm15, (%7)\n"
4579           :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
4580         );
4581       }
4582     }
4583     return;
4584   }
4585   if (depth == 14) {
4586     helper_float_17_recursive(buf + 0, 11);
4587     helper_float_17_recursive(buf + 2048, 11);
4588     helper_float_17_recursive(buf + 4096, 11);
4589     helper_float_17_recursive(buf + 6144, 11);
4590     helper_float_17_recursive(buf + 8192, 11);
4591     helper_float_17_recursive(buf + 10240, 11);
4592     helper_float_17_recursive(buf + 12288, 11);
4593     helper_float_17_recursive(buf + 14336, 11);
4594     for (int j = 0; j < 16384; j += 16384) {
4595       for (int k = 0; k < 2048; k += 4) {
4596         __asm__ volatile (
4597           "movups (%0), %%xmm0\n"
4598           "movups (%1), %%xmm1\n"
4599           "movups (%2), %%xmm2\n"
4600           "movups (%3), %%xmm3\n"
4601           "movups (%4), %%xmm4\n"
4602           "movups (%5), %%xmm5\n"
4603           "movups (%6), %%xmm6\n"
4604           "movups (%7), %%xmm7\n"
4605           "movaps %%xmm0, %%xmm8\n"
4606           "movaps %%xmm0, %%xmm9\n"
4607           "addps %%xmm1, %%xmm8\n"
4608           "subps %%xmm1, %%xmm9\n"
4609           "movaps %%xmm2, %%xmm10\n"
4610           "movaps %%xmm2, %%xmm11\n"
4611           "addps %%xmm3, %%xmm10\n"
4612           "subps %%xmm3, %%xmm11\n"
4613           "movaps %%xmm4, %%xmm12\n"
4614           "movaps %%xmm4, %%xmm13\n"
4615           "addps %%xmm5, %%xmm12\n"
4616           "subps %%xmm5, %%xmm13\n"
4617           "movaps %%xmm6, %%xmm14\n"
4618           "movaps %%xmm6, %%xmm15\n"
4619           "addps %%xmm7, %%xmm14\n"
4620           "subps %%xmm7, %%xmm15\n"
4621           "movaps %%xmm8, %%xmm0\n"
4622           "movaps %%xmm8, %%xmm2\n"
4623           "addps %%xmm10, %%xmm0\n"
4624           "subps %%xmm10, %%xmm2\n"
4625           "movaps %%xmm9, %%xmm1\n"
4626           "movaps %%xmm9, %%xmm3\n"
4627           "addps %%xmm11, %%xmm1\n"
4628           "subps %%xmm11, %%xmm3\n"
4629           "movaps %%xmm12, %%xmm4\n"
4630           "movaps %%xmm12, %%xmm6\n"
4631           "addps %%xmm14, %%xmm4\n"
4632           "subps %%xmm14, %%xmm6\n"
4633           "movaps %%xmm13, %%xmm5\n"
4634           "movaps %%xmm13, %%xmm7\n"
4635           "addps %%xmm15, %%xmm5\n"
4636           "subps %%xmm15, %%xmm7\n"
4637           "movaps %%xmm0, %%xmm8\n"
4638           "movaps %%xmm0, %%xmm12\n"
4639           "addps %%xmm4, %%xmm8\n"
4640           "subps %%xmm4, %%xmm12\n"
4641           "movaps %%xmm1, %%xmm9\n"
4642           "movaps %%xmm1, %%xmm13\n"
4643           "addps %%xmm5, %%xmm9\n"
4644           "subps %%xmm5, %%xmm13\n"
4645           "movaps %%xmm2, %%xmm10\n"
4646           "movaps %%xmm2, %%xmm14\n"
4647           "addps %%xmm6, %%xmm10\n"
4648           "subps %%xmm6, %%xmm14\n"
4649           "movaps %%xmm3, %%xmm11\n"
4650           "movaps %%xmm3, %%xmm15\n"
4651           "addps %%xmm7, %%xmm11\n"
4652           "subps %%xmm7, %%xmm15\n"
4653           "movups %%xmm8, (%0)\n"
4654           "movups %%xmm9, (%1)\n"
4655           "movups %%xmm10, (%2)\n"
4656           "movups %%xmm11, (%3)\n"
4657           "movups %%xmm12, (%4)\n"
4658           "movups %%xmm13, (%5)\n"
4659           "movups %%xmm14, (%6)\n"
4660           "movups %%xmm15, (%7)\n"
4661           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
4662         );
4663       }
4664     }
4665     return;
4666   }
4667   if (depth == 17) {
4668     helper_float_17_recursive(buf + 0, 14);
4669     helper_float_17_recursive(buf + 16384, 14);
4670     helper_float_17_recursive(buf + 32768, 14);
4671     helper_float_17_recursive(buf + 49152, 14);
4672     helper_float_17_recursive(buf + 65536, 14);
4673     helper_float_17_recursive(buf + 81920, 14);
4674     helper_float_17_recursive(buf + 98304, 14);
4675     helper_float_17_recursive(buf + 114688, 14);
4676     for (int j = 0; j < 131072; j += 131072) {
4677       for (int k = 0; k < 16384; k += 4) {
4678         __asm__ volatile (
4679           "movups (%0), %%xmm0\n"
4680           "movups (%1), %%xmm1\n"
4681           "movups (%2), %%xmm2\n"
4682           "movups (%3), %%xmm3\n"
4683           "movups (%4), %%xmm4\n"
4684           "movups (%5), %%xmm5\n"
4685           "movups (%6), %%xmm6\n"
4686           "movups (%7), %%xmm7\n"
4687           "movaps %%xmm0, %%xmm8\n"
4688           "movaps %%xmm0, %%xmm9\n"
4689           "addps %%xmm1, %%xmm8\n"
4690           "subps %%xmm1, %%xmm9\n"
4691           "movaps %%xmm2, %%xmm10\n"
4692           "movaps %%xmm2, %%xmm11\n"
4693           "addps %%xmm3, %%xmm10\n"
4694           "subps %%xmm3, %%xmm11\n"
4695           "movaps %%xmm4, %%xmm12\n"
4696           "movaps %%xmm4, %%xmm13\n"
4697           "addps %%xmm5, %%xmm12\n"
4698           "subps %%xmm5, %%xmm13\n"
4699           "movaps %%xmm6, %%xmm14\n"
4700           "movaps %%xmm6, %%xmm15\n"
4701           "addps %%xmm7, %%xmm14\n"
4702           "subps %%xmm7, %%xmm15\n"
4703           "movaps %%xmm8, %%xmm0\n"
4704           "movaps %%xmm8, %%xmm2\n"
4705           "addps %%xmm10, %%xmm0\n"
4706           "subps %%xmm10, %%xmm2\n"
4707           "movaps %%xmm9, %%xmm1\n"
4708           "movaps %%xmm9, %%xmm3\n"
4709           "addps %%xmm11, %%xmm1\n"
4710           "subps %%xmm11, %%xmm3\n"
4711           "movaps %%xmm12, %%xmm4\n"
4712           "movaps %%xmm12, %%xmm6\n"
4713           "addps %%xmm14, %%xmm4\n"
4714           "subps %%xmm14, %%xmm6\n"
4715           "movaps %%xmm13, %%xmm5\n"
4716           "movaps %%xmm13, %%xmm7\n"
4717           "addps %%xmm15, %%xmm5\n"
4718           "subps %%xmm15, %%xmm7\n"
4719           "movaps %%xmm0, %%xmm8\n"
4720           "movaps %%xmm0, %%xmm12\n"
4721           "addps %%xmm4, %%xmm8\n"
4722           "subps %%xmm4, %%xmm12\n"
4723           "movaps %%xmm1, %%xmm9\n"
4724           "movaps %%xmm1, %%xmm13\n"
4725           "addps %%xmm5, %%xmm9\n"
4726           "subps %%xmm5, %%xmm13\n"
4727           "movaps %%xmm2, %%xmm10\n"
4728           "movaps %%xmm2, %%xmm14\n"
4729           "addps %%xmm6, %%xmm10\n"
4730           "subps %%xmm6, %%xmm14\n"
4731           "movaps %%xmm3, %%xmm11\n"
4732           "movaps %%xmm3, %%xmm15\n"
4733           "addps %%xmm7, %%xmm11\n"
4734           "subps %%xmm7, %%xmm15\n"
4735           "movups %%xmm8, (%0)\n"
4736           "movups %%xmm9, (%1)\n"
4737           "movups %%xmm10, (%2)\n"
4738           "movups %%xmm11, (%3)\n"
4739           "movups %%xmm12, (%4)\n"
4740           "movups %%xmm13, (%5)\n"
4741           "movups %%xmm14, (%6)\n"
4742           "movups %%xmm15, (%7)\n"
4743           :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
4744         );
4745       }
4746     }
4747     return;
4748   }
4749 }
void helper_float_17(float *buf);
/*
 * Unnormalized fast Walsh-Hadamard transform of 2^17 = 131072 floats,
 * performed in place on buf. Thin dispatcher over the cache-blocked
 * recursive kernel.
 */
void helper_float_17(float *buf) {
  enum { LOG2_SIZE = 17 }; /* transform length is 1 << LOG2_SIZE */
  helper_float_17_recursive(buf, LOG2_SIZE);
}
void helper_float_18_recursive(float *buf, int depth);
/*
 * One level of the cache-blocked, unnormalized fast Walsh-Hadamard
 * transform for a 2^18-point float array. Only the depths actually
 * reached from helper_float_18 are handled: 13 (base case), 16, 18;
 * any other depth is a no-op.
 *
 * A butterfly stage of stride s replaces every pair (u, v) taken at
 * distance s by (u + v, u - v); running the stages for
 * s = 1, 2, 4, ..., N/2 yields the standard Hadamard transform of N
 * points. Pairs inside one stage are independent of each other, so
 * this scalar, stage-ordered formulation produces results that are
 * bit-identical to any vectorized schedule of the same stages.
 */
void helper_float_18_recursive(float *buf, int depth) {
  if (depth == 13) {
    /* Base case: complete 8192-point transform, stages s = 1 .. 4096. */
    for (int s = 1; s < 8192; s <<= 1) {
      for (int base = 0; base < 8192; base += 2 * s) {
        for (int t = 0; t < s; ++t) {
          float u = buf[base + t];
          float v = buf[base + t + s];
          buf[base + t] = u + v;
          buf[base + t + s] = u - v;
        }
      }
    }
    return;
  }
  if (depth == 16) {
    /* Transform eight 8192-point sub-blocks, then apply the three
       remaining cross-block stages (s = 8192, 16384, 32768). */
    for (int chunk = 0; chunk < 8; ++chunk) {
      helper_float_18_recursive(buf + chunk * 8192, 13);
    }
    for (int s = 8192; s < 65536; s <<= 1) {
      for (int base = 0; base < 65536; base += 2 * s) {
        for (int t = 0; t < s; ++t) {
          float u = buf[base + t];
          float v = buf[base + t + s];
          buf[base + t] = u + v;
          buf[base + t + s] = u - v;
        }
      }
    }
    return;
  }
  if (depth == 18) {
    /* Transform four 65536-point sub-blocks, then apply the final two
       cross-block stages (s = 65536, 131072). */
    for (int chunk = 0; chunk < 4; ++chunk) {
      helper_float_18_recursive(buf + chunk * 65536, 16);
    }
    for (int s = 65536; s < 262144; s <<= 1) {
      for (int base = 0; base < 262144; base += 2 * s) {
        for (int t = 0; t < s; ++t) {
          float u = buf[base + t];
          float v = buf[base + t + s];
          buf[base + t] = u + v;
          buf[base + t + s] = u - v;
        }
      }
    }
    return;
  }
}
void helper_float_18(float *buf);
/* Public entry point: in-place transform over a 2^18-element float buffer.
 * Delegates to the recursive kernel, starting at the full depth. */
void helper_float_18(float *buf) {
  const int log_dim = 18;  /* log2 of the transform length */
  helper_float_18_recursive(buf, log_dim);
}
void helper_float_19_recursive(float *buf, int depth);
/*
 * Recursive in-place kernel for the 2^19-point fast (Walsh-)Hadamard
 * transform, SSE implementation (generated code).
 *
 * buf   - pointer to the float sub-block to transform in place.
 * depth - log2 of the sub-block length; only the values this generator
 *         emitted (13, 16, 19) are handled. Any other depth falls
 *         through all branches and the call is a no-op.
 *
 * Decomposition used here: 19 = 16 + 3 and 16 = 13 + 3; depth 13 is the
 * fully unrolled base case (13 = 3 + 3 + 3 + 4 butterfly passes).
 */
void helper_float_19_recursive(float *buf, int depth) {
  if (depth == 13) {
    /* Base case, pass 1: strides 1, 2 and 4 fused. Each asm block loads
     * 8 consecutive xmm vectors (32 floats), does the stride-1 and
     * stride-2 butterflies inside each register via shufps/addsubps,
     * then the stride-4 cross-register butterflies, and stores back. */
    for (int j = 0; j < 8192; j += 32) {
      for (int k = 0; k < 4; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm0, %%xmm0\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm0, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm1, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm1, %%xmm1\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm1, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm1\n"
          "movaps %%xmm2, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm2, %%xmm2\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm2, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm2\n"
          "movaps %%xmm3, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm3, %%xmm3\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm3, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm3\n"
          "movaps %%xmm4, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm4, %%xmm4\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm4, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm4\n"
          "movaps %%xmm5, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm5, %%xmm5\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm5, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm5\n"
          "movaps %%xmm6, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm6, %%xmm6\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm6, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm6\n"
          "movaps %%xmm7, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm7, %%xmm7\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm7, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm0, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm0, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm0\n"
          "movaps %%xmm1, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm1, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm1, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm1\n"
          "movaps %%xmm2, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm2\n"
          "movaps %%xmm3, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm3, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm3, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm3\n"
          "movaps %%xmm4, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm4, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm4, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm4\n"
          "movaps %%xmm5, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm5, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm5, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm5\n"
          "movaps %%xmm6, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm6, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm6, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm6\n"
          "movaps %%xmm7, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm7, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm7, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 2: strides 8, 16 and 32 (radix-8 cross-register butterflies
     * on 8 vectors spaced 32 floats apart). */
    for (int j = 0; j < 8192; j += 256) {
      for (int k = 0; k < 32; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 3: strides 64, 128 and 256 (same radix-8 pattern, vectors
     * spaced 256 floats apart). */
    for (int j = 0; j < 8192; j += 2048) {
      for (int k = 0; k < 256; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 4: strides 512 and 1024 (radix-4 butterflies on 4 vectors
     * spaced 2048 floats apart), completing the 13 levels. */
    for (int j = 0; j < 8192; j += 8192) {
      for (int k = 0; k < 2048; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movups %%xmm0, (%0)\n"
          "movups %%xmm1, (%1)\n"
          "movups %%xmm2, (%2)\n"
          "movups %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 16) {
    /* 2^16 = 8 sub-transforms of 2^13, then 3 combining levels
     * (one radix-8 pass over vectors spaced 8192 floats apart). */
    helper_float_19_recursive(buf + 0, 13);
    helper_float_19_recursive(buf + 8192, 13);
    helper_float_19_recursive(buf + 16384, 13);
    helper_float_19_recursive(buf + 24576, 13);
    helper_float_19_recursive(buf + 32768, 13);
    helper_float_19_recursive(buf + 40960, 13);
    helper_float_19_recursive(buf + 49152, 13);
    helper_float_19_recursive(buf + 57344, 13);
    for (int j = 0; j < 65536; j += 65536) {
      for (int k = 0; k < 8192; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 19) {
    /* Top level: 2^19 = 8 sub-transforms of 2^16, then 3 combining
     * levels (radix-8 pass over vectors spaced 65536 floats apart). */
    helper_float_19_recursive(buf + 0, 16);
    helper_float_19_recursive(buf + 65536, 16);
    helper_float_19_recursive(buf + 131072, 16);
    helper_float_19_recursive(buf + 196608, 16);
    helper_float_19_recursive(buf + 262144, 16);
    helper_float_19_recursive(buf + 327680, 16);
    helper_float_19_recursive(buf + 393216, 16);
    helper_float_19_recursive(buf + 458752, 16);
    for (int j = 0; j < 524288; j += 524288) {
      for (int k = 0; k < 65536; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  /* Unhandled depth: deliberate no-op (generator only emits 13/16/19). */
}
void helper_float_19(float *buf);
/* Public entry point: in-place transform over a 2^19-element float buffer.
 * Delegates to the recursive kernel, starting at the full depth. */
void helper_float_19(float *buf) {
  const int log_dim = 19;  /* log2 of the transform length */
  helper_float_19_recursive(buf, log_dim);
}
5818 void helper_float_20_recursive(float *buf, int depth);
helper_float_20_recursive(float * buf,int depth)5819 void helper_float_20_recursive(float *buf, int depth) {
5820   if (depth == 8) {
5821     for (int j = 0; j < 256; j += 32) {
5822       for (int k = 0; k < 4; k += 4) {
5823         __asm__ volatile (
5824           "movups (%0), %%xmm0\n"
5825           "movups (%1), %%xmm1\n"
5826           "movups (%2), %%xmm2\n"
5827           "movups (%3), %%xmm3\n"
5828           "movups (%4), %%xmm4\n"
5829           "movups (%5), %%xmm5\n"
5830           "movups (%6), %%xmm6\n"
5831           "movups (%7), %%xmm7\n"
5832           "movaps %%xmm0, %%xmm8\n"
5833           "shufps $160, %%xmm8, %%xmm8\n"
5834           "shufps $245, %%xmm0, %%xmm0\n"
5835           "xorps %%xmm9, %%xmm9\n"
5836           "subps %%xmm0, %%xmm9\n"
5837           "addsubps %%xmm9, %%xmm8\n"
5838           "movaps %%xmm8, %%xmm0\n"
5839           "movaps %%xmm1, %%xmm8\n"
5840           "shufps $160, %%xmm8, %%xmm8\n"
5841           "shufps $245, %%xmm1, %%xmm1\n"
5842           "xorps %%xmm9, %%xmm9\n"
5843           "subps %%xmm1, %%xmm9\n"
5844           "addsubps %%xmm9, %%xmm8\n"
5845           "movaps %%xmm8, %%xmm1\n"
5846           "movaps %%xmm2, %%xmm8\n"
5847           "shufps $160, %%xmm8, %%xmm8\n"
5848           "shufps $245, %%xmm2, %%xmm2\n"
5849           "xorps %%xmm9, %%xmm9\n"
5850           "subps %%xmm2, %%xmm9\n"
5851           "addsubps %%xmm9, %%xmm8\n"
5852           "movaps %%xmm8, %%xmm2\n"
5853           "movaps %%xmm3, %%xmm8\n"
5854           "shufps $160, %%xmm8, %%xmm8\n"
5855           "shufps $245, %%xmm3, %%xmm3\n"
5856           "xorps %%xmm9, %%xmm9\n"
5857           "subps %%xmm3, %%xmm9\n"
5858           "addsubps %%xmm9, %%xmm8\n"
5859           "movaps %%xmm8, %%xmm3\n"
5860           "movaps %%xmm4, %%xmm8\n"
5861           "shufps $160, %%xmm8, %%xmm8\n"
5862           "shufps $245, %%xmm4, %%xmm4\n"
5863           "xorps %%xmm9, %%xmm9\n"
5864           "subps %%xmm4, %%xmm9\n"
5865           "addsubps %%xmm9, %%xmm8\n"
5866           "movaps %%xmm8, %%xmm4\n"
5867           "movaps %%xmm5, %%xmm8\n"
5868           "shufps $160, %%xmm8, %%xmm8\n"
5869           "shufps $245, %%xmm5, %%xmm5\n"
5870           "xorps %%xmm9, %%xmm9\n"
5871           "subps %%xmm5, %%xmm9\n"
5872           "addsubps %%xmm9, %%xmm8\n"
5873           "movaps %%xmm8, %%xmm5\n"
5874           "movaps %%xmm6, %%xmm8\n"
5875           "shufps $160, %%xmm8, %%xmm8\n"
5876           "shufps $245, %%xmm6, %%xmm6\n"
5877           "xorps %%xmm9, %%xmm9\n"
5878           "subps %%xmm6, %%xmm9\n"
5879           "addsubps %%xmm9, %%xmm8\n"
5880           "movaps %%xmm8, %%xmm6\n"
5881           "movaps %%xmm7, %%xmm8\n"
5882           "shufps $160, %%xmm8, %%xmm8\n"
5883           "shufps $245, %%xmm7, %%xmm7\n"
5884           "xorps %%xmm9, %%xmm9\n"
5885           "subps %%xmm7, %%xmm9\n"
5886           "addsubps %%xmm9, %%xmm8\n"
5887           "movaps %%xmm8, %%xmm7\n"
5888           "movaps %%xmm0, %%xmm8\n"
5889           "shufps $68, %%xmm8, %%xmm8\n"
5890           "xorps %%xmm9, %%xmm9\n"
5891           "movaps %%xmm0, %%xmm10\n"
5892           "shufps $14, %%xmm9, %%xmm10\n"
5893           "movaps %%xmm0, %%xmm11\n"
5894           "shufps $224, %%xmm11, %%xmm9\n"
5895           "addps %%xmm8, %%xmm10\n"
5896           "subps %%xmm9, %%xmm10\n"
5897           "movaps %%xmm10, %%xmm0\n"
5898           "movaps %%xmm1, %%xmm8\n"
5899           "shufps $68, %%xmm8, %%xmm8\n"
5900           "xorps %%xmm9, %%xmm9\n"
5901           "movaps %%xmm1, %%xmm10\n"
5902           "shufps $14, %%xmm9, %%xmm10\n"
5903           "movaps %%xmm1, %%xmm11\n"
5904           "shufps $224, %%xmm11, %%xmm9\n"
5905           "addps %%xmm8, %%xmm10\n"
5906           "subps %%xmm9, %%xmm10\n"
5907           "movaps %%xmm10, %%xmm1\n"
5908           "movaps %%xmm2, %%xmm8\n"
5909           "shufps $68, %%xmm8, %%xmm8\n"
5910           "xorps %%xmm9, %%xmm9\n"
5911           "movaps %%xmm2, %%xmm10\n"
5912           "shufps $14, %%xmm9, %%xmm10\n"
5913           "movaps %%xmm2, %%xmm11\n"
5914           "shufps $224, %%xmm11, %%xmm9\n"
5915           "addps %%xmm8, %%xmm10\n"
5916           "subps %%xmm9, %%xmm10\n"
5917           "movaps %%xmm10, %%xmm2\n"
5918           "movaps %%xmm3, %%xmm8\n"
5919           "shufps $68, %%xmm8, %%xmm8\n"
5920           "xorps %%xmm9, %%xmm9\n"
5921           "movaps %%xmm3, %%xmm10\n"
5922           "shufps $14, %%xmm9, %%xmm10\n"
5923           "movaps %%xmm3, %%xmm11\n"
5924           "shufps $224, %%xmm11, %%xmm9\n"
5925           "addps %%xmm8, %%xmm10\n"
5926           "subps %%xmm9, %%xmm10\n"
5927           "movaps %%xmm10, %%xmm3\n"
5928           "movaps %%xmm4, %%xmm8\n"
5929           "shufps $68, %%xmm8, %%xmm8\n"
5930           "xorps %%xmm9, %%xmm9\n"
5931           "movaps %%xmm4, %%xmm10\n"
5932           "shufps $14, %%xmm9, %%xmm10\n"
5933           "movaps %%xmm4, %%xmm11\n"
5934           "shufps $224, %%xmm11, %%xmm9\n"
5935           "addps %%xmm8, %%xmm10\n"
5936           "subps %%xmm9, %%xmm10\n"
5937           "movaps %%xmm10, %%xmm4\n"
5938           "movaps %%xmm5, %%xmm8\n"
5939           "shufps $68, %%xmm8, %%xmm8\n"
5940           "xorps %%xmm9, %%xmm9\n"
5941           "movaps %%xmm5, %%xmm10\n"
5942           "shufps $14, %%xmm9, %%xmm10\n"
5943           "movaps %%xmm5, %%xmm11\n"
5944           "shufps $224, %%xmm11, %%xmm9\n"
5945           "addps %%xmm8, %%xmm10\n"
5946           "subps %%xmm9, %%xmm10\n"
5947           "movaps %%xmm10, %%xmm5\n"
5948           "movaps %%xmm6, %%xmm8\n"
5949           "shufps $68, %%xmm8, %%xmm8\n"
5950           "xorps %%xmm9, %%xmm9\n"
5951           "movaps %%xmm6, %%xmm10\n"
5952           "shufps $14, %%xmm9, %%xmm10\n"
5953           "movaps %%xmm6, %%xmm11\n"
5954           "shufps $224, %%xmm11, %%xmm9\n"
5955           "addps %%xmm8, %%xmm10\n"
5956           "subps %%xmm9, %%xmm10\n"
5957           "movaps %%xmm10, %%xmm6\n"
5958           "movaps %%xmm7, %%xmm8\n"
5959           "shufps $68, %%xmm8, %%xmm8\n"
5960           "xorps %%xmm9, %%xmm9\n"
5961           "movaps %%xmm7, %%xmm10\n"
5962           "shufps $14, %%xmm9, %%xmm10\n"
5963           "movaps %%xmm7, %%xmm11\n"
5964           "shufps $224, %%xmm11, %%xmm9\n"
5965           "addps %%xmm8, %%xmm10\n"
5966           "subps %%xmm9, %%xmm10\n"
5967           "movaps %%xmm10, %%xmm7\n"
5968           "movaps %%xmm0, %%xmm8\n"
5969           "movaps %%xmm0, %%xmm9\n"
5970           "addps %%xmm1, %%xmm8\n"
5971           "subps %%xmm1, %%xmm9\n"
5972           "movaps %%xmm2, %%xmm10\n"
5973           "movaps %%xmm2, %%xmm11\n"
5974           "addps %%xmm3, %%xmm10\n"
5975           "subps %%xmm3, %%xmm11\n"
5976           "movaps %%xmm4, %%xmm12\n"
5977           "movaps %%xmm4, %%xmm13\n"
5978           "addps %%xmm5, %%xmm12\n"
5979           "subps %%xmm5, %%xmm13\n"
5980           "movaps %%xmm6, %%xmm14\n"
5981           "movaps %%xmm6, %%xmm15\n"
5982           "addps %%xmm7, %%xmm14\n"
5983           "subps %%xmm7, %%xmm15\n"
5984           "movaps %%xmm8, %%xmm0\n"
5985           "movaps %%xmm8, %%xmm2\n"
5986           "addps %%xmm10, %%xmm0\n"
5987           "subps %%xmm10, %%xmm2\n"
5988           "movaps %%xmm9, %%xmm1\n"
5989           "movaps %%xmm9, %%xmm3\n"
5990           "addps %%xmm11, %%xmm1\n"
5991           "subps %%xmm11, %%xmm3\n"
5992           "movaps %%xmm12, %%xmm4\n"
5993           "movaps %%xmm12, %%xmm6\n"
5994           "addps %%xmm14, %%xmm4\n"
5995           "subps %%xmm14, %%xmm6\n"
5996           "movaps %%xmm13, %%xmm5\n"
5997           "movaps %%xmm13, %%xmm7\n"
5998           "addps %%xmm15, %%xmm5\n"
5999           "subps %%xmm15, %%xmm7\n"
6000           "movaps %%xmm0, %%xmm8\n"
6001           "movaps %%xmm0, %%xmm12\n"
6002           "addps %%xmm4, %%xmm8\n"
6003           "subps %%xmm4, %%xmm12\n"
6004           "movaps %%xmm1, %%xmm9\n"
6005           "movaps %%xmm1, %%xmm13\n"
6006           "addps %%xmm5, %%xmm9\n"
6007           "subps %%xmm5, %%xmm13\n"
6008           "movaps %%xmm2, %%xmm10\n"
6009           "movaps %%xmm2, %%xmm14\n"
6010           "addps %%xmm6, %%xmm10\n"
6011           "subps %%xmm6, %%xmm14\n"
6012           "movaps %%xmm3, %%xmm11\n"
6013           "movaps %%xmm3, %%xmm15\n"
6014           "addps %%xmm7, %%xmm11\n"
6015           "subps %%xmm7, %%xmm15\n"
6016           "movups %%xmm8, (%0)\n"
6017           "movups %%xmm9, (%1)\n"
6018           "movups %%xmm10, (%2)\n"
6019           "movups %%xmm11, (%3)\n"
6020           "movups %%xmm12, (%4)\n"
6021           "movups %%xmm13, (%5)\n"
6022           "movups %%xmm14, (%6)\n"
6023           "movups %%xmm15, (%7)\n"
6024           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
6025         );
6026       }
6027     }
6028     for (int j = 0; j < 256; j += 256) {
6029       for (int k = 0; k < 32; k += 4) {
6030         __asm__ volatile (
6031           "movups (%0), %%xmm0\n"
6032           "movups (%1), %%xmm1\n"
6033           "movups (%2), %%xmm2\n"
6034           "movups (%3), %%xmm3\n"
6035           "movups (%4), %%xmm4\n"
6036           "movups (%5), %%xmm5\n"
6037           "movups (%6), %%xmm6\n"
6038           "movups (%7), %%xmm7\n"
6039           "movaps %%xmm0, %%xmm8\n"
6040           "movaps %%xmm0, %%xmm9\n"
6041           "addps %%xmm1, %%xmm8\n"
6042           "subps %%xmm1, %%xmm9\n"
6043           "movaps %%xmm2, %%xmm10\n"
6044           "movaps %%xmm2, %%xmm11\n"
6045           "addps %%xmm3, %%xmm10\n"
6046           "subps %%xmm3, %%xmm11\n"
6047           "movaps %%xmm4, %%xmm12\n"
6048           "movaps %%xmm4, %%xmm13\n"
6049           "addps %%xmm5, %%xmm12\n"
6050           "subps %%xmm5, %%xmm13\n"
6051           "movaps %%xmm6, %%xmm14\n"
6052           "movaps %%xmm6, %%xmm15\n"
6053           "addps %%xmm7, %%xmm14\n"
6054           "subps %%xmm7, %%xmm15\n"
6055           "movaps %%xmm8, %%xmm0\n"
6056           "movaps %%xmm8, %%xmm2\n"
6057           "addps %%xmm10, %%xmm0\n"
6058           "subps %%xmm10, %%xmm2\n"
6059           "movaps %%xmm9, %%xmm1\n"
6060           "movaps %%xmm9, %%xmm3\n"
6061           "addps %%xmm11, %%xmm1\n"
6062           "subps %%xmm11, %%xmm3\n"
6063           "movaps %%xmm12, %%xmm4\n"
6064           "movaps %%xmm12, %%xmm6\n"
6065           "addps %%xmm14, %%xmm4\n"
6066           "subps %%xmm14, %%xmm6\n"
6067           "movaps %%xmm13, %%xmm5\n"
6068           "movaps %%xmm13, %%xmm7\n"
6069           "addps %%xmm15, %%xmm5\n"
6070           "subps %%xmm15, %%xmm7\n"
6071           "movaps %%xmm0, %%xmm8\n"
6072           "movaps %%xmm0, %%xmm12\n"
6073           "addps %%xmm4, %%xmm8\n"
6074           "subps %%xmm4, %%xmm12\n"
6075           "movaps %%xmm1, %%xmm9\n"
6076           "movaps %%xmm1, %%xmm13\n"
6077           "addps %%xmm5, %%xmm9\n"
6078           "subps %%xmm5, %%xmm13\n"
6079           "movaps %%xmm2, %%xmm10\n"
6080           "movaps %%xmm2, %%xmm14\n"
6081           "addps %%xmm6, %%xmm10\n"
6082           "subps %%xmm6, %%xmm14\n"
6083           "movaps %%xmm3, %%xmm11\n"
6084           "movaps %%xmm3, %%xmm15\n"
6085           "addps %%xmm7, %%xmm11\n"
6086           "subps %%xmm7, %%xmm15\n"
6087           "movups %%xmm8, (%0)\n"
6088           "movups %%xmm9, (%1)\n"
6089           "movups %%xmm10, (%2)\n"
6090           "movups %%xmm11, (%3)\n"
6091           "movups %%xmm12, (%4)\n"
6092           "movups %%xmm13, (%5)\n"
6093           "movups %%xmm14, (%6)\n"
6094           "movups %%xmm15, (%7)\n"
6095           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
6096         );
6097       }
6098     }
6099     return;
6100   }
6101   if (depth == 11) {
6102     helper_float_20_recursive(buf + 0, 8);
6103     helper_float_20_recursive(buf + 256, 8);
6104     helper_float_20_recursive(buf + 512, 8);
6105     helper_float_20_recursive(buf + 768, 8);
6106     helper_float_20_recursive(buf + 1024, 8);
6107     helper_float_20_recursive(buf + 1280, 8);
6108     helper_float_20_recursive(buf + 1536, 8);
6109     helper_float_20_recursive(buf + 1792, 8);
6110     for (int j = 0; j < 2048; j += 2048) {
6111       for (int k = 0; k < 256; k += 4) {
6112         __asm__ volatile (
6113           "movups (%0), %%xmm0\n"
6114           "movups (%1), %%xmm1\n"
6115           "movups (%2), %%xmm2\n"
6116           "movups (%3), %%xmm3\n"
6117           "movups (%4), %%xmm4\n"
6118           "movups (%5), %%xmm5\n"
6119           "movups (%6), %%xmm6\n"
6120           "movups (%7), %%xmm7\n"
6121           "movaps %%xmm0, %%xmm8\n"
6122           "movaps %%xmm0, %%xmm9\n"
6123           "addps %%xmm1, %%xmm8\n"
6124           "subps %%xmm1, %%xmm9\n"
6125           "movaps %%xmm2, %%xmm10\n"
6126           "movaps %%xmm2, %%xmm11\n"
6127           "addps %%xmm3, %%xmm10\n"
6128           "subps %%xmm3, %%xmm11\n"
6129           "movaps %%xmm4, %%xmm12\n"
6130           "movaps %%xmm4, %%xmm13\n"
6131           "addps %%xmm5, %%xmm12\n"
6132           "subps %%xmm5, %%xmm13\n"
6133           "movaps %%xmm6, %%xmm14\n"
6134           "movaps %%xmm6, %%xmm15\n"
6135           "addps %%xmm7, %%xmm14\n"
6136           "subps %%xmm7, %%xmm15\n"
6137           "movaps %%xmm8, %%xmm0\n"
6138           "movaps %%xmm8, %%xmm2\n"
6139           "addps %%xmm10, %%xmm0\n"
6140           "subps %%xmm10, %%xmm2\n"
6141           "movaps %%xmm9, %%xmm1\n"
6142           "movaps %%xmm9, %%xmm3\n"
6143           "addps %%xmm11, %%xmm1\n"
6144           "subps %%xmm11, %%xmm3\n"
6145           "movaps %%xmm12, %%xmm4\n"
6146           "movaps %%xmm12, %%xmm6\n"
6147           "addps %%xmm14, %%xmm4\n"
6148           "subps %%xmm14, %%xmm6\n"
6149           "movaps %%xmm13, %%xmm5\n"
6150           "movaps %%xmm13, %%xmm7\n"
6151           "addps %%xmm15, %%xmm5\n"
6152           "subps %%xmm15, %%xmm7\n"
6153           "movaps %%xmm0, %%xmm8\n"
6154           "movaps %%xmm0, %%xmm12\n"
6155           "addps %%xmm4, %%xmm8\n"
6156           "subps %%xmm4, %%xmm12\n"
6157           "movaps %%xmm1, %%xmm9\n"
6158           "movaps %%xmm1, %%xmm13\n"
6159           "addps %%xmm5, %%xmm9\n"
6160           "subps %%xmm5, %%xmm13\n"
6161           "movaps %%xmm2, %%xmm10\n"
6162           "movaps %%xmm2, %%xmm14\n"
6163           "addps %%xmm6, %%xmm10\n"
6164           "subps %%xmm6, %%xmm14\n"
6165           "movaps %%xmm3, %%xmm11\n"
6166           "movaps %%xmm3, %%xmm15\n"
6167           "addps %%xmm7, %%xmm11\n"
6168           "subps %%xmm7, %%xmm15\n"
6169           "movups %%xmm8, (%0)\n"
6170           "movups %%xmm9, (%1)\n"
6171           "movups %%xmm10, (%2)\n"
6172           "movups %%xmm11, (%3)\n"
6173           "movups %%xmm12, (%4)\n"
6174           "movups %%xmm13, (%5)\n"
6175           "movups %%xmm14, (%6)\n"
6176           "movups %%xmm15, (%7)\n"
6177           :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
6178         );
6179       }
6180     }
6181     return;
6182   }
6183   if (depth == 14) {
6184     helper_float_20_recursive(buf + 0, 11);
6185     helper_float_20_recursive(buf + 2048, 11);
6186     helper_float_20_recursive(buf + 4096, 11);
6187     helper_float_20_recursive(buf + 6144, 11);
6188     helper_float_20_recursive(buf + 8192, 11);
6189     helper_float_20_recursive(buf + 10240, 11);
6190     helper_float_20_recursive(buf + 12288, 11);
6191     helper_float_20_recursive(buf + 14336, 11);
6192     for (int j = 0; j < 16384; j += 16384) {
6193       for (int k = 0; k < 2048; k += 4) {
6194         __asm__ volatile (
6195           "movups (%0), %%xmm0\n"
6196           "movups (%1), %%xmm1\n"
6197           "movups (%2), %%xmm2\n"
6198           "movups (%3), %%xmm3\n"
6199           "movups (%4), %%xmm4\n"
6200           "movups (%5), %%xmm5\n"
6201           "movups (%6), %%xmm6\n"
6202           "movups (%7), %%xmm7\n"
6203           "movaps %%xmm0, %%xmm8\n"
6204           "movaps %%xmm0, %%xmm9\n"
6205           "addps %%xmm1, %%xmm8\n"
6206           "subps %%xmm1, %%xmm9\n"
6207           "movaps %%xmm2, %%xmm10\n"
6208           "movaps %%xmm2, %%xmm11\n"
6209           "addps %%xmm3, %%xmm10\n"
6210           "subps %%xmm3, %%xmm11\n"
6211           "movaps %%xmm4, %%xmm12\n"
6212           "movaps %%xmm4, %%xmm13\n"
6213           "addps %%xmm5, %%xmm12\n"
6214           "subps %%xmm5, %%xmm13\n"
6215           "movaps %%xmm6, %%xmm14\n"
6216           "movaps %%xmm6, %%xmm15\n"
6217           "addps %%xmm7, %%xmm14\n"
6218           "subps %%xmm7, %%xmm15\n"
6219           "movaps %%xmm8, %%xmm0\n"
6220           "movaps %%xmm8, %%xmm2\n"
6221           "addps %%xmm10, %%xmm0\n"
6222           "subps %%xmm10, %%xmm2\n"
6223           "movaps %%xmm9, %%xmm1\n"
6224           "movaps %%xmm9, %%xmm3\n"
6225           "addps %%xmm11, %%xmm1\n"
6226           "subps %%xmm11, %%xmm3\n"
6227           "movaps %%xmm12, %%xmm4\n"
6228           "movaps %%xmm12, %%xmm6\n"
6229           "addps %%xmm14, %%xmm4\n"
6230           "subps %%xmm14, %%xmm6\n"
6231           "movaps %%xmm13, %%xmm5\n"
6232           "movaps %%xmm13, %%xmm7\n"
6233           "addps %%xmm15, %%xmm5\n"
6234           "subps %%xmm15, %%xmm7\n"
6235           "movaps %%xmm0, %%xmm8\n"
6236           "movaps %%xmm0, %%xmm12\n"
6237           "addps %%xmm4, %%xmm8\n"
6238           "subps %%xmm4, %%xmm12\n"
6239           "movaps %%xmm1, %%xmm9\n"
6240           "movaps %%xmm1, %%xmm13\n"
6241           "addps %%xmm5, %%xmm9\n"
6242           "subps %%xmm5, %%xmm13\n"
6243           "movaps %%xmm2, %%xmm10\n"
6244           "movaps %%xmm2, %%xmm14\n"
6245           "addps %%xmm6, %%xmm10\n"
6246           "subps %%xmm6, %%xmm14\n"
6247           "movaps %%xmm3, %%xmm11\n"
6248           "movaps %%xmm3, %%xmm15\n"
6249           "addps %%xmm7, %%xmm11\n"
6250           "subps %%xmm7, %%xmm15\n"
6251           "movups %%xmm8, (%0)\n"
6252           "movups %%xmm9, (%1)\n"
6253           "movups %%xmm10, (%2)\n"
6254           "movups %%xmm11, (%3)\n"
6255           "movups %%xmm12, (%4)\n"
6256           "movups %%xmm13, (%5)\n"
6257           "movups %%xmm14, (%6)\n"
6258           "movups %%xmm15, (%7)\n"
6259           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
6260         );
6261       }
6262     }
6263     return;
6264   }
6265   if (depth == 17) {
6266     helper_float_20_recursive(buf + 0, 14);
6267     helper_float_20_recursive(buf + 16384, 14);
6268     helper_float_20_recursive(buf + 32768, 14);
6269     helper_float_20_recursive(buf + 49152, 14);
6270     helper_float_20_recursive(buf + 65536, 14);
6271     helper_float_20_recursive(buf + 81920, 14);
6272     helper_float_20_recursive(buf + 98304, 14);
6273     helper_float_20_recursive(buf + 114688, 14);
6274     for (int j = 0; j < 131072; j += 131072) {
6275       for (int k = 0; k < 16384; k += 4) {
6276         __asm__ volatile (
6277           "movups (%0), %%xmm0\n"
6278           "movups (%1), %%xmm1\n"
6279           "movups (%2), %%xmm2\n"
6280           "movups (%3), %%xmm3\n"
6281           "movups (%4), %%xmm4\n"
6282           "movups (%5), %%xmm5\n"
6283           "movups (%6), %%xmm6\n"
6284           "movups (%7), %%xmm7\n"
6285           "movaps %%xmm0, %%xmm8\n"
6286           "movaps %%xmm0, %%xmm9\n"
6287           "addps %%xmm1, %%xmm8\n"
6288           "subps %%xmm1, %%xmm9\n"
6289           "movaps %%xmm2, %%xmm10\n"
6290           "movaps %%xmm2, %%xmm11\n"
6291           "addps %%xmm3, %%xmm10\n"
6292           "subps %%xmm3, %%xmm11\n"
6293           "movaps %%xmm4, %%xmm12\n"
6294           "movaps %%xmm4, %%xmm13\n"
6295           "addps %%xmm5, %%xmm12\n"
6296           "subps %%xmm5, %%xmm13\n"
6297           "movaps %%xmm6, %%xmm14\n"
6298           "movaps %%xmm6, %%xmm15\n"
6299           "addps %%xmm7, %%xmm14\n"
6300           "subps %%xmm7, %%xmm15\n"
6301           "movaps %%xmm8, %%xmm0\n"
6302           "movaps %%xmm8, %%xmm2\n"
6303           "addps %%xmm10, %%xmm0\n"
6304           "subps %%xmm10, %%xmm2\n"
6305           "movaps %%xmm9, %%xmm1\n"
6306           "movaps %%xmm9, %%xmm3\n"
6307           "addps %%xmm11, %%xmm1\n"
6308           "subps %%xmm11, %%xmm3\n"
6309           "movaps %%xmm12, %%xmm4\n"
6310           "movaps %%xmm12, %%xmm6\n"
6311           "addps %%xmm14, %%xmm4\n"
6312           "subps %%xmm14, %%xmm6\n"
6313           "movaps %%xmm13, %%xmm5\n"
6314           "movaps %%xmm13, %%xmm7\n"
6315           "addps %%xmm15, %%xmm5\n"
6316           "subps %%xmm15, %%xmm7\n"
6317           "movaps %%xmm0, %%xmm8\n"
6318           "movaps %%xmm0, %%xmm12\n"
6319           "addps %%xmm4, %%xmm8\n"
6320           "subps %%xmm4, %%xmm12\n"
6321           "movaps %%xmm1, %%xmm9\n"
6322           "movaps %%xmm1, %%xmm13\n"
6323           "addps %%xmm5, %%xmm9\n"
6324           "subps %%xmm5, %%xmm13\n"
6325           "movaps %%xmm2, %%xmm10\n"
6326           "movaps %%xmm2, %%xmm14\n"
6327           "addps %%xmm6, %%xmm10\n"
6328           "subps %%xmm6, %%xmm14\n"
6329           "movaps %%xmm3, %%xmm11\n"
6330           "movaps %%xmm3, %%xmm15\n"
6331           "addps %%xmm7, %%xmm11\n"
6332           "subps %%xmm7, %%xmm15\n"
6333           "movups %%xmm8, (%0)\n"
6334           "movups %%xmm9, (%1)\n"
6335           "movups %%xmm10, (%2)\n"
6336           "movups %%xmm11, (%3)\n"
6337           "movups %%xmm12, (%4)\n"
6338           "movups %%xmm13, (%5)\n"
6339           "movups %%xmm14, (%6)\n"
6340           "movups %%xmm15, (%7)\n"
6341           :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
6342         );
6343       }
6344     }
6345     return;
6346   }
6347   if (depth == 20) {
6348     helper_float_20_recursive(buf + 0, 17);
6349     helper_float_20_recursive(buf + 131072, 17);
6350     helper_float_20_recursive(buf + 262144, 17);
6351     helper_float_20_recursive(buf + 393216, 17);
6352     helper_float_20_recursive(buf + 524288, 17);
6353     helper_float_20_recursive(buf + 655360, 17);
6354     helper_float_20_recursive(buf + 786432, 17);
6355     helper_float_20_recursive(buf + 917504, 17);
6356     for (int j = 0; j < 1048576; j += 1048576) {
6357       for (int k = 0; k < 131072; k += 4) {
6358         __asm__ volatile (
6359           "movups (%0), %%xmm0\n"
6360           "movups (%1), %%xmm1\n"
6361           "movups (%2), %%xmm2\n"
6362           "movups (%3), %%xmm3\n"
6363           "movups (%4), %%xmm4\n"
6364           "movups (%5), %%xmm5\n"
6365           "movups (%6), %%xmm6\n"
6366           "movups (%7), %%xmm7\n"
6367           "movaps %%xmm0, %%xmm8\n"
6368           "movaps %%xmm0, %%xmm9\n"
6369           "addps %%xmm1, %%xmm8\n"
6370           "subps %%xmm1, %%xmm9\n"
6371           "movaps %%xmm2, %%xmm10\n"
6372           "movaps %%xmm2, %%xmm11\n"
6373           "addps %%xmm3, %%xmm10\n"
6374           "subps %%xmm3, %%xmm11\n"
6375           "movaps %%xmm4, %%xmm12\n"
6376           "movaps %%xmm4, %%xmm13\n"
6377           "addps %%xmm5, %%xmm12\n"
6378           "subps %%xmm5, %%xmm13\n"
6379           "movaps %%xmm6, %%xmm14\n"
6380           "movaps %%xmm6, %%xmm15\n"
6381           "addps %%xmm7, %%xmm14\n"
6382           "subps %%xmm7, %%xmm15\n"
6383           "movaps %%xmm8, %%xmm0\n"
6384           "movaps %%xmm8, %%xmm2\n"
6385           "addps %%xmm10, %%xmm0\n"
6386           "subps %%xmm10, %%xmm2\n"
6387           "movaps %%xmm9, %%xmm1\n"
6388           "movaps %%xmm9, %%xmm3\n"
6389           "addps %%xmm11, %%xmm1\n"
6390           "subps %%xmm11, %%xmm3\n"
6391           "movaps %%xmm12, %%xmm4\n"
6392           "movaps %%xmm12, %%xmm6\n"
6393           "addps %%xmm14, %%xmm4\n"
6394           "subps %%xmm14, %%xmm6\n"
6395           "movaps %%xmm13, %%xmm5\n"
6396           "movaps %%xmm13, %%xmm7\n"
6397           "addps %%xmm15, %%xmm5\n"
6398           "subps %%xmm15, %%xmm7\n"
6399           "movaps %%xmm0, %%xmm8\n"
6400           "movaps %%xmm0, %%xmm12\n"
6401           "addps %%xmm4, %%xmm8\n"
6402           "subps %%xmm4, %%xmm12\n"
6403           "movaps %%xmm1, %%xmm9\n"
6404           "movaps %%xmm1, %%xmm13\n"
6405           "addps %%xmm5, %%xmm9\n"
6406           "subps %%xmm5, %%xmm13\n"
6407           "movaps %%xmm2, %%xmm10\n"
6408           "movaps %%xmm2, %%xmm14\n"
6409           "addps %%xmm6, %%xmm10\n"
6410           "subps %%xmm6, %%xmm14\n"
6411           "movaps %%xmm3, %%xmm11\n"
6412           "movaps %%xmm3, %%xmm15\n"
6413           "addps %%xmm7, %%xmm11\n"
6414           "subps %%xmm7, %%xmm15\n"
6415           "movups %%xmm8, (%0)\n"
6416           "movups %%xmm9, (%1)\n"
6417           "movups %%xmm10, (%2)\n"
6418           "movups %%xmm11, (%3)\n"
6419           "movups %%xmm12, (%4)\n"
6420           "movups %%xmm13, (%5)\n"
6421           "movups %%xmm14, (%6)\n"
6422           "movups %%xmm15, (%7)\n"
6423           :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
6424         );
6425       }
6426     }
6427     return;
6428   }
6429 }
void helper_float_20(float *buf);
/* Public entry point for the size-2^20 Fast Hadamard Transform (float).
 * Transforms buf in place; buf must hold 1048576 (= 2^20) floats, as the
 * depth-20 branch of the recursive SSE kernel processes exactly that many
 * elements. Delegates to helper_float_20_recursive at full depth. */
void helper_float_20(float *buf) {
  helper_float_20_recursive(buf, 20);
}
6434 void helper_float_21_recursive(float *buf, int depth);
helper_float_21_recursive(float * buf,int depth)6435 void helper_float_21_recursive(float *buf, int depth) {
6436   if (depth == 13) {
6437     for (int j = 0; j < 8192; j += 32) {
6438       for (int k = 0; k < 4; k += 4) {
6439         __asm__ volatile (
6440           "movups (%0), %%xmm0\n"
6441           "movups (%1), %%xmm1\n"
6442           "movups (%2), %%xmm2\n"
6443           "movups (%3), %%xmm3\n"
6444           "movups (%4), %%xmm4\n"
6445           "movups (%5), %%xmm5\n"
6446           "movups (%6), %%xmm6\n"
6447           "movups (%7), %%xmm7\n"
6448           "movaps %%xmm0, %%xmm8\n"
6449           "shufps $160, %%xmm8, %%xmm8\n"
6450           "shufps $245, %%xmm0, %%xmm0\n"
6451           "xorps %%xmm9, %%xmm9\n"
6452           "subps %%xmm0, %%xmm9\n"
6453           "addsubps %%xmm9, %%xmm8\n"
6454           "movaps %%xmm8, %%xmm0\n"
6455           "movaps %%xmm1, %%xmm8\n"
6456           "shufps $160, %%xmm8, %%xmm8\n"
6457           "shufps $245, %%xmm1, %%xmm1\n"
6458           "xorps %%xmm9, %%xmm9\n"
6459           "subps %%xmm1, %%xmm9\n"
6460           "addsubps %%xmm9, %%xmm8\n"
6461           "movaps %%xmm8, %%xmm1\n"
6462           "movaps %%xmm2, %%xmm8\n"
6463           "shufps $160, %%xmm8, %%xmm8\n"
6464           "shufps $245, %%xmm2, %%xmm2\n"
6465           "xorps %%xmm9, %%xmm9\n"
6466           "subps %%xmm2, %%xmm9\n"
6467           "addsubps %%xmm9, %%xmm8\n"
6468           "movaps %%xmm8, %%xmm2\n"
6469           "movaps %%xmm3, %%xmm8\n"
6470           "shufps $160, %%xmm8, %%xmm8\n"
6471           "shufps $245, %%xmm3, %%xmm3\n"
6472           "xorps %%xmm9, %%xmm9\n"
6473           "subps %%xmm3, %%xmm9\n"
6474           "addsubps %%xmm9, %%xmm8\n"
6475           "movaps %%xmm8, %%xmm3\n"
6476           "movaps %%xmm4, %%xmm8\n"
6477           "shufps $160, %%xmm8, %%xmm8\n"
6478           "shufps $245, %%xmm4, %%xmm4\n"
6479           "xorps %%xmm9, %%xmm9\n"
6480           "subps %%xmm4, %%xmm9\n"
6481           "addsubps %%xmm9, %%xmm8\n"
6482           "movaps %%xmm8, %%xmm4\n"
6483           "movaps %%xmm5, %%xmm8\n"
6484           "shufps $160, %%xmm8, %%xmm8\n"
6485           "shufps $245, %%xmm5, %%xmm5\n"
6486           "xorps %%xmm9, %%xmm9\n"
6487           "subps %%xmm5, %%xmm9\n"
6488           "addsubps %%xmm9, %%xmm8\n"
6489           "movaps %%xmm8, %%xmm5\n"
6490           "movaps %%xmm6, %%xmm8\n"
6491           "shufps $160, %%xmm8, %%xmm8\n"
6492           "shufps $245, %%xmm6, %%xmm6\n"
6493           "xorps %%xmm9, %%xmm9\n"
6494           "subps %%xmm6, %%xmm9\n"
6495           "addsubps %%xmm9, %%xmm8\n"
6496           "movaps %%xmm8, %%xmm6\n"
6497           "movaps %%xmm7, %%xmm8\n"
6498           "shufps $160, %%xmm8, %%xmm8\n"
6499           "shufps $245, %%xmm7, %%xmm7\n"
6500           "xorps %%xmm9, %%xmm9\n"
6501           "subps %%xmm7, %%xmm9\n"
6502           "addsubps %%xmm9, %%xmm8\n"
6503           "movaps %%xmm8, %%xmm7\n"
6504           "movaps %%xmm0, %%xmm8\n"
6505           "shufps $68, %%xmm8, %%xmm8\n"
6506           "xorps %%xmm9, %%xmm9\n"
6507           "movaps %%xmm0, %%xmm10\n"
6508           "shufps $14, %%xmm9, %%xmm10\n"
6509           "movaps %%xmm0, %%xmm11\n"
6510           "shufps $224, %%xmm11, %%xmm9\n"
6511           "addps %%xmm8, %%xmm10\n"
6512           "subps %%xmm9, %%xmm10\n"
6513           "movaps %%xmm10, %%xmm0\n"
6514           "movaps %%xmm1, %%xmm8\n"
6515           "shufps $68, %%xmm8, %%xmm8\n"
6516           "xorps %%xmm9, %%xmm9\n"
6517           "movaps %%xmm1, %%xmm10\n"
6518           "shufps $14, %%xmm9, %%xmm10\n"
6519           "movaps %%xmm1, %%xmm11\n"
6520           "shufps $224, %%xmm11, %%xmm9\n"
6521           "addps %%xmm8, %%xmm10\n"
6522           "subps %%xmm9, %%xmm10\n"
6523           "movaps %%xmm10, %%xmm1\n"
6524           "movaps %%xmm2, %%xmm8\n"
6525           "shufps $68, %%xmm8, %%xmm8\n"
6526           "xorps %%xmm9, %%xmm9\n"
6527           "movaps %%xmm2, %%xmm10\n"
6528           "shufps $14, %%xmm9, %%xmm10\n"
6529           "movaps %%xmm2, %%xmm11\n"
6530           "shufps $224, %%xmm11, %%xmm9\n"
6531           "addps %%xmm8, %%xmm10\n"
6532           "subps %%xmm9, %%xmm10\n"
6533           "movaps %%xmm10, %%xmm2\n"
6534           "movaps %%xmm3, %%xmm8\n"
6535           "shufps $68, %%xmm8, %%xmm8\n"
6536           "xorps %%xmm9, %%xmm9\n"
6537           "movaps %%xmm3, %%xmm10\n"
6538           "shufps $14, %%xmm9, %%xmm10\n"
6539           "movaps %%xmm3, %%xmm11\n"
6540           "shufps $224, %%xmm11, %%xmm9\n"
6541           "addps %%xmm8, %%xmm10\n"
6542           "subps %%xmm9, %%xmm10\n"
6543           "movaps %%xmm10, %%xmm3\n"
6544           "movaps %%xmm4, %%xmm8\n"
6545           "shufps $68, %%xmm8, %%xmm8\n"
6546           "xorps %%xmm9, %%xmm9\n"
6547           "movaps %%xmm4, %%xmm10\n"
6548           "shufps $14, %%xmm9, %%xmm10\n"
6549           "movaps %%xmm4, %%xmm11\n"
6550           "shufps $224, %%xmm11, %%xmm9\n"
6551           "addps %%xmm8, %%xmm10\n"
6552           "subps %%xmm9, %%xmm10\n"
6553           "movaps %%xmm10, %%xmm4\n"
6554           "movaps %%xmm5, %%xmm8\n"
6555           "shufps $68, %%xmm8, %%xmm8\n"
6556           "xorps %%xmm9, %%xmm9\n"
6557           "movaps %%xmm5, %%xmm10\n"
6558           "shufps $14, %%xmm9, %%xmm10\n"
6559           "movaps %%xmm5, %%xmm11\n"
6560           "shufps $224, %%xmm11, %%xmm9\n"
6561           "addps %%xmm8, %%xmm10\n"
6562           "subps %%xmm9, %%xmm10\n"
6563           "movaps %%xmm10, %%xmm5\n"
6564           "movaps %%xmm6, %%xmm8\n"
6565           "shufps $68, %%xmm8, %%xmm8\n"
6566           "xorps %%xmm9, %%xmm9\n"
6567           "movaps %%xmm6, %%xmm10\n"
6568           "shufps $14, %%xmm9, %%xmm10\n"
6569           "movaps %%xmm6, %%xmm11\n"
6570           "shufps $224, %%xmm11, %%xmm9\n"
6571           "addps %%xmm8, %%xmm10\n"
6572           "subps %%xmm9, %%xmm10\n"
6573           "movaps %%xmm10, %%xmm6\n"
6574           "movaps %%xmm7, %%xmm8\n"
6575           "shufps $68, %%xmm8, %%xmm8\n"
6576           "xorps %%xmm9, %%xmm9\n"
6577           "movaps %%xmm7, %%xmm10\n"
6578           "shufps $14, %%xmm9, %%xmm10\n"
6579           "movaps %%xmm7, %%xmm11\n"
6580           "shufps $224, %%xmm11, %%xmm9\n"
6581           "addps %%xmm8, %%xmm10\n"
6582           "subps %%xmm9, %%xmm10\n"
6583           "movaps %%xmm10, %%xmm7\n"
6584           "movaps %%xmm0, %%xmm8\n"
6585           "movaps %%xmm0, %%xmm9\n"
6586           "addps %%xmm1, %%xmm8\n"
6587           "subps %%xmm1, %%xmm9\n"
6588           "movaps %%xmm2, %%xmm10\n"
6589           "movaps %%xmm2, %%xmm11\n"
6590           "addps %%xmm3, %%xmm10\n"
6591           "subps %%xmm3, %%xmm11\n"
6592           "movaps %%xmm4, %%xmm12\n"
6593           "movaps %%xmm4, %%xmm13\n"
6594           "addps %%xmm5, %%xmm12\n"
6595           "subps %%xmm5, %%xmm13\n"
6596           "movaps %%xmm6, %%xmm14\n"
6597           "movaps %%xmm6, %%xmm15\n"
6598           "addps %%xmm7, %%xmm14\n"
6599           "subps %%xmm7, %%xmm15\n"
6600           "movaps %%xmm8, %%xmm0\n"
6601           "movaps %%xmm8, %%xmm2\n"
6602           "addps %%xmm10, %%xmm0\n"
6603           "subps %%xmm10, %%xmm2\n"
6604           "movaps %%xmm9, %%xmm1\n"
6605           "movaps %%xmm9, %%xmm3\n"
6606           "addps %%xmm11, %%xmm1\n"
6607           "subps %%xmm11, %%xmm3\n"
6608           "movaps %%xmm12, %%xmm4\n"
6609           "movaps %%xmm12, %%xmm6\n"
6610           "addps %%xmm14, %%xmm4\n"
6611           "subps %%xmm14, %%xmm6\n"
6612           "movaps %%xmm13, %%xmm5\n"
6613           "movaps %%xmm13, %%xmm7\n"
6614           "addps %%xmm15, %%xmm5\n"
6615           "subps %%xmm15, %%xmm7\n"
6616           "movaps %%xmm0, %%xmm8\n"
6617           "movaps %%xmm0, %%xmm12\n"
6618           "addps %%xmm4, %%xmm8\n"
6619           "subps %%xmm4, %%xmm12\n"
6620           "movaps %%xmm1, %%xmm9\n"
6621           "movaps %%xmm1, %%xmm13\n"
6622           "addps %%xmm5, %%xmm9\n"
6623           "subps %%xmm5, %%xmm13\n"
6624           "movaps %%xmm2, %%xmm10\n"
6625           "movaps %%xmm2, %%xmm14\n"
6626           "addps %%xmm6, %%xmm10\n"
6627           "subps %%xmm6, %%xmm14\n"
6628           "movaps %%xmm3, %%xmm11\n"
6629           "movaps %%xmm3, %%xmm15\n"
6630           "addps %%xmm7, %%xmm11\n"
6631           "subps %%xmm7, %%xmm15\n"
6632           "movups %%xmm8, (%0)\n"
6633           "movups %%xmm9, (%1)\n"
6634           "movups %%xmm10, (%2)\n"
6635           "movups %%xmm11, (%3)\n"
6636           "movups %%xmm12, (%4)\n"
6637           "movups %%xmm13, (%5)\n"
6638           "movups %%xmm14, (%6)\n"
6639           "movups %%xmm15, (%7)\n"
6640           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
6641         );
6642       }
6643     }
6644     for (int j = 0; j < 8192; j += 256) {
6645       for (int k = 0; k < 32; k += 4) {
6646         __asm__ volatile (
6647           "movups (%0), %%xmm0\n"
6648           "movups (%1), %%xmm1\n"
6649           "movups (%2), %%xmm2\n"
6650           "movups (%3), %%xmm3\n"
6651           "movups (%4), %%xmm4\n"
6652           "movups (%5), %%xmm5\n"
6653           "movups (%6), %%xmm6\n"
6654           "movups (%7), %%xmm7\n"
6655           "movaps %%xmm0, %%xmm8\n"
6656           "movaps %%xmm0, %%xmm9\n"
6657           "addps %%xmm1, %%xmm8\n"
6658           "subps %%xmm1, %%xmm9\n"
6659           "movaps %%xmm2, %%xmm10\n"
6660           "movaps %%xmm2, %%xmm11\n"
6661           "addps %%xmm3, %%xmm10\n"
6662           "subps %%xmm3, %%xmm11\n"
6663           "movaps %%xmm4, %%xmm12\n"
6664           "movaps %%xmm4, %%xmm13\n"
6665           "addps %%xmm5, %%xmm12\n"
6666           "subps %%xmm5, %%xmm13\n"
6667           "movaps %%xmm6, %%xmm14\n"
6668           "movaps %%xmm6, %%xmm15\n"
6669           "addps %%xmm7, %%xmm14\n"
6670           "subps %%xmm7, %%xmm15\n"
6671           "movaps %%xmm8, %%xmm0\n"
6672           "movaps %%xmm8, %%xmm2\n"
6673           "addps %%xmm10, %%xmm0\n"
6674           "subps %%xmm10, %%xmm2\n"
6675           "movaps %%xmm9, %%xmm1\n"
6676           "movaps %%xmm9, %%xmm3\n"
6677           "addps %%xmm11, %%xmm1\n"
6678           "subps %%xmm11, %%xmm3\n"
6679           "movaps %%xmm12, %%xmm4\n"
6680           "movaps %%xmm12, %%xmm6\n"
6681           "addps %%xmm14, %%xmm4\n"
6682           "subps %%xmm14, %%xmm6\n"
6683           "movaps %%xmm13, %%xmm5\n"
6684           "movaps %%xmm13, %%xmm7\n"
6685           "addps %%xmm15, %%xmm5\n"
6686           "subps %%xmm15, %%xmm7\n"
6687           "movaps %%xmm0, %%xmm8\n"
6688           "movaps %%xmm0, %%xmm12\n"
6689           "addps %%xmm4, %%xmm8\n"
6690           "subps %%xmm4, %%xmm12\n"
6691           "movaps %%xmm1, %%xmm9\n"
6692           "movaps %%xmm1, %%xmm13\n"
6693           "addps %%xmm5, %%xmm9\n"
6694           "subps %%xmm5, %%xmm13\n"
6695           "movaps %%xmm2, %%xmm10\n"
6696           "movaps %%xmm2, %%xmm14\n"
6697           "addps %%xmm6, %%xmm10\n"
6698           "subps %%xmm6, %%xmm14\n"
6699           "movaps %%xmm3, %%xmm11\n"
6700           "movaps %%xmm3, %%xmm15\n"
6701           "addps %%xmm7, %%xmm11\n"
6702           "subps %%xmm7, %%xmm15\n"
6703           "movups %%xmm8, (%0)\n"
6704           "movups %%xmm9, (%1)\n"
6705           "movups %%xmm10, (%2)\n"
6706           "movups %%xmm11, (%3)\n"
6707           "movups %%xmm12, (%4)\n"
6708           "movups %%xmm13, (%5)\n"
6709           "movups %%xmm14, (%6)\n"
6710           "movups %%xmm15, (%7)\n"
6711           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
6712         );
6713       }
6714     }
6715     for (int j = 0; j < 8192; j += 2048) {
6716       for (int k = 0; k < 256; k += 4) {
6717         __asm__ volatile (
6718           "movups (%0), %%xmm0\n"
6719           "movups (%1), %%xmm1\n"
6720           "movups (%2), %%xmm2\n"
6721           "movups (%3), %%xmm3\n"
6722           "movups (%4), %%xmm4\n"
6723           "movups (%5), %%xmm5\n"
6724           "movups (%6), %%xmm6\n"
6725           "movups (%7), %%xmm7\n"
6726           "movaps %%xmm0, %%xmm8\n"
6727           "movaps %%xmm0, %%xmm9\n"
6728           "addps %%xmm1, %%xmm8\n"
6729           "subps %%xmm1, %%xmm9\n"
6730           "movaps %%xmm2, %%xmm10\n"
6731           "movaps %%xmm2, %%xmm11\n"
6732           "addps %%xmm3, %%xmm10\n"
6733           "subps %%xmm3, %%xmm11\n"
6734           "movaps %%xmm4, %%xmm12\n"
6735           "movaps %%xmm4, %%xmm13\n"
6736           "addps %%xmm5, %%xmm12\n"
6737           "subps %%xmm5, %%xmm13\n"
6738           "movaps %%xmm6, %%xmm14\n"
6739           "movaps %%xmm6, %%xmm15\n"
6740           "addps %%xmm7, %%xmm14\n"
6741           "subps %%xmm7, %%xmm15\n"
6742           "movaps %%xmm8, %%xmm0\n"
6743           "movaps %%xmm8, %%xmm2\n"
6744           "addps %%xmm10, %%xmm0\n"
6745           "subps %%xmm10, %%xmm2\n"
6746           "movaps %%xmm9, %%xmm1\n"
6747           "movaps %%xmm9, %%xmm3\n"
6748           "addps %%xmm11, %%xmm1\n"
6749           "subps %%xmm11, %%xmm3\n"
6750           "movaps %%xmm12, %%xmm4\n"
6751           "movaps %%xmm12, %%xmm6\n"
6752           "addps %%xmm14, %%xmm4\n"
6753           "subps %%xmm14, %%xmm6\n"
6754           "movaps %%xmm13, %%xmm5\n"
6755           "movaps %%xmm13, %%xmm7\n"
6756           "addps %%xmm15, %%xmm5\n"
6757           "subps %%xmm15, %%xmm7\n"
6758           "movaps %%xmm0, %%xmm8\n"
6759           "movaps %%xmm0, %%xmm12\n"
6760           "addps %%xmm4, %%xmm8\n"
6761           "subps %%xmm4, %%xmm12\n"
6762           "movaps %%xmm1, %%xmm9\n"
6763           "movaps %%xmm1, %%xmm13\n"
6764           "addps %%xmm5, %%xmm9\n"
6765           "subps %%xmm5, %%xmm13\n"
6766           "movaps %%xmm2, %%xmm10\n"
6767           "movaps %%xmm2, %%xmm14\n"
6768           "addps %%xmm6, %%xmm10\n"
6769           "subps %%xmm6, %%xmm14\n"
6770           "movaps %%xmm3, %%xmm11\n"
6771           "movaps %%xmm3, %%xmm15\n"
6772           "addps %%xmm7, %%xmm11\n"
6773           "subps %%xmm7, %%xmm15\n"
6774           "movups %%xmm8, (%0)\n"
6775           "movups %%xmm9, (%1)\n"
6776           "movups %%xmm10, (%2)\n"
6777           "movups %%xmm11, (%3)\n"
6778           "movups %%xmm12, (%4)\n"
6779           "movups %%xmm13, (%5)\n"
6780           "movups %%xmm14, (%6)\n"
6781           "movups %%xmm15, (%7)\n"
6782           :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
6783         );
6784       }
6785     }
6786     for (int j = 0; j < 8192; j += 8192) {
6787       for (int k = 0; k < 2048; k += 4) {
6788         __asm__ volatile (
6789           "movups (%0), %%xmm0\n"
6790           "movups (%1), %%xmm1\n"
6791           "movups (%2), %%xmm2\n"
6792           "movups (%3), %%xmm3\n"
6793           "movaps %%xmm0, %%xmm8\n"
6794           "movaps %%xmm0, %%xmm9\n"
6795           "addps %%xmm1, %%xmm8\n"
6796           "subps %%xmm1, %%xmm9\n"
6797           "movaps %%xmm2, %%xmm10\n"
6798           "movaps %%xmm2, %%xmm11\n"
6799           "addps %%xmm3, %%xmm10\n"
6800           "subps %%xmm3, %%xmm11\n"
6801           "movaps %%xmm8, %%xmm0\n"
6802           "movaps %%xmm8, %%xmm2\n"
6803           "addps %%xmm10, %%xmm0\n"
6804           "subps %%xmm10, %%xmm2\n"
6805           "movaps %%xmm9, %%xmm1\n"
6806           "movaps %%xmm9, %%xmm3\n"
6807           "addps %%xmm11, %%xmm1\n"
6808           "subps %%xmm11, %%xmm3\n"
6809           "movups %%xmm0, (%0)\n"
6810           "movups %%xmm1, (%1)\n"
6811           "movups %%xmm2, (%2)\n"
6812           "movups %%xmm3, (%3)\n"
6813           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
6814         );
6815       }
6816     }
6817     return;
6818   }
6819   if (depth == 16) {
6820     helper_float_21_recursive(buf + 0, 13);
6821     helper_float_21_recursive(buf + 8192, 13);
6822     helper_float_21_recursive(buf + 16384, 13);
6823     helper_float_21_recursive(buf + 24576, 13);
6824     helper_float_21_recursive(buf + 32768, 13);
6825     helper_float_21_recursive(buf + 40960, 13);
6826     helper_float_21_recursive(buf + 49152, 13);
6827     helper_float_21_recursive(buf + 57344, 13);
6828     for (int j = 0; j < 65536; j += 65536) {
6829       for (int k = 0; k < 8192; k += 4) {
6830         __asm__ volatile (
6831           "movups (%0), %%xmm0\n"
6832           "movups (%1), %%xmm1\n"
6833           "movups (%2), %%xmm2\n"
6834           "movups (%3), %%xmm3\n"
6835           "movups (%4), %%xmm4\n"
6836           "movups (%5), %%xmm5\n"
6837           "movups (%6), %%xmm6\n"
6838           "movups (%7), %%xmm7\n"
6839           "movaps %%xmm0, %%xmm8\n"
6840           "movaps %%xmm0, %%xmm9\n"
6841           "addps %%xmm1, %%xmm8\n"
6842           "subps %%xmm1, %%xmm9\n"
6843           "movaps %%xmm2, %%xmm10\n"
6844           "movaps %%xmm2, %%xmm11\n"
6845           "addps %%xmm3, %%xmm10\n"
6846           "subps %%xmm3, %%xmm11\n"
6847           "movaps %%xmm4, %%xmm12\n"
6848           "movaps %%xmm4, %%xmm13\n"
6849           "addps %%xmm5, %%xmm12\n"
6850           "subps %%xmm5, %%xmm13\n"
6851           "movaps %%xmm6, %%xmm14\n"
6852           "movaps %%xmm6, %%xmm15\n"
6853           "addps %%xmm7, %%xmm14\n"
6854           "subps %%xmm7, %%xmm15\n"
6855           "movaps %%xmm8, %%xmm0\n"
6856           "movaps %%xmm8, %%xmm2\n"
6857           "addps %%xmm10, %%xmm0\n"
6858           "subps %%xmm10, %%xmm2\n"
6859           "movaps %%xmm9, %%xmm1\n"
6860           "movaps %%xmm9, %%xmm3\n"
6861           "addps %%xmm11, %%xmm1\n"
6862           "subps %%xmm11, %%xmm3\n"
6863           "movaps %%xmm12, %%xmm4\n"
6864           "movaps %%xmm12, %%xmm6\n"
6865           "addps %%xmm14, %%xmm4\n"
6866           "subps %%xmm14, %%xmm6\n"
6867           "movaps %%xmm13, %%xmm5\n"
6868           "movaps %%xmm13, %%xmm7\n"
6869           "addps %%xmm15, %%xmm5\n"
6870           "subps %%xmm15, %%xmm7\n"
6871           "movaps %%xmm0, %%xmm8\n"
6872           "movaps %%xmm0, %%xmm12\n"
6873           "addps %%xmm4, %%xmm8\n"
6874           "subps %%xmm4, %%xmm12\n"
6875           "movaps %%xmm1, %%xmm9\n"
6876           "movaps %%xmm1, %%xmm13\n"
6877           "addps %%xmm5, %%xmm9\n"
6878           "subps %%xmm5, %%xmm13\n"
6879           "movaps %%xmm2, %%xmm10\n"
6880           "movaps %%xmm2, %%xmm14\n"
6881           "addps %%xmm6, %%xmm10\n"
6882           "subps %%xmm6, %%xmm14\n"
6883           "movaps %%xmm3, %%xmm11\n"
6884           "movaps %%xmm3, %%xmm15\n"
6885           "addps %%xmm7, %%xmm11\n"
6886           "subps %%xmm7, %%xmm15\n"
6887           "movups %%xmm8, (%0)\n"
6888           "movups %%xmm9, (%1)\n"
6889           "movups %%xmm10, (%2)\n"
6890           "movups %%xmm11, (%3)\n"
6891           "movups %%xmm12, (%4)\n"
6892           "movups %%xmm13, (%5)\n"
6893           "movups %%xmm14, (%6)\n"
6894           "movups %%xmm15, (%7)\n"
6895           :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
6896         );
6897       }
6898     }
6899     return;
6900   }
6901   if (depth == 19) {
6902     helper_float_21_recursive(buf + 0, 16);
6903     helper_float_21_recursive(buf + 65536, 16);
6904     helper_float_21_recursive(buf + 131072, 16);
6905     helper_float_21_recursive(buf + 196608, 16);
6906     helper_float_21_recursive(buf + 262144, 16);
6907     helper_float_21_recursive(buf + 327680, 16);
6908     helper_float_21_recursive(buf + 393216, 16);
6909     helper_float_21_recursive(buf + 458752, 16);
6910     for (int j = 0; j < 524288; j += 524288) {
6911       for (int k = 0; k < 65536; k += 4) {
6912         __asm__ volatile (
6913           "movups (%0), %%xmm0\n"
6914           "movups (%1), %%xmm1\n"
6915           "movups (%2), %%xmm2\n"
6916           "movups (%3), %%xmm3\n"
6917           "movups (%4), %%xmm4\n"
6918           "movups (%5), %%xmm5\n"
6919           "movups (%6), %%xmm6\n"
6920           "movups (%7), %%xmm7\n"
6921           "movaps %%xmm0, %%xmm8\n"
6922           "movaps %%xmm0, %%xmm9\n"
6923           "addps %%xmm1, %%xmm8\n"
6924           "subps %%xmm1, %%xmm9\n"
6925           "movaps %%xmm2, %%xmm10\n"
6926           "movaps %%xmm2, %%xmm11\n"
6927           "addps %%xmm3, %%xmm10\n"
6928           "subps %%xmm3, %%xmm11\n"
6929           "movaps %%xmm4, %%xmm12\n"
6930           "movaps %%xmm4, %%xmm13\n"
6931           "addps %%xmm5, %%xmm12\n"
6932           "subps %%xmm5, %%xmm13\n"
6933           "movaps %%xmm6, %%xmm14\n"
6934           "movaps %%xmm6, %%xmm15\n"
6935           "addps %%xmm7, %%xmm14\n"
6936           "subps %%xmm7, %%xmm15\n"
6937           "movaps %%xmm8, %%xmm0\n"
6938           "movaps %%xmm8, %%xmm2\n"
6939           "addps %%xmm10, %%xmm0\n"
6940           "subps %%xmm10, %%xmm2\n"
6941           "movaps %%xmm9, %%xmm1\n"
6942           "movaps %%xmm9, %%xmm3\n"
6943           "addps %%xmm11, %%xmm1\n"
6944           "subps %%xmm11, %%xmm3\n"
6945           "movaps %%xmm12, %%xmm4\n"
6946           "movaps %%xmm12, %%xmm6\n"
6947           "addps %%xmm14, %%xmm4\n"
6948           "subps %%xmm14, %%xmm6\n"
6949           "movaps %%xmm13, %%xmm5\n"
6950           "movaps %%xmm13, %%xmm7\n"
6951           "addps %%xmm15, %%xmm5\n"
6952           "subps %%xmm15, %%xmm7\n"
6953           "movaps %%xmm0, %%xmm8\n"
6954           "movaps %%xmm0, %%xmm12\n"
6955           "addps %%xmm4, %%xmm8\n"
6956           "subps %%xmm4, %%xmm12\n"
6957           "movaps %%xmm1, %%xmm9\n"
6958           "movaps %%xmm1, %%xmm13\n"
6959           "addps %%xmm5, %%xmm9\n"
6960           "subps %%xmm5, %%xmm13\n"
6961           "movaps %%xmm2, %%xmm10\n"
6962           "movaps %%xmm2, %%xmm14\n"
6963           "addps %%xmm6, %%xmm10\n"
6964           "subps %%xmm6, %%xmm14\n"
6965           "movaps %%xmm3, %%xmm11\n"
6966           "movaps %%xmm3, %%xmm15\n"
6967           "addps %%xmm7, %%xmm11\n"
6968           "subps %%xmm7, %%xmm15\n"
6969           "movups %%xmm8, (%0)\n"
6970           "movups %%xmm9, (%1)\n"
6971           "movups %%xmm10, (%2)\n"
6972           "movups %%xmm11, (%3)\n"
6973           "movups %%xmm12, (%4)\n"
6974           "movups %%xmm13, (%5)\n"
6975           "movups %%xmm14, (%6)\n"
6976           "movups %%xmm15, (%7)\n"
6977           :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
6978         );
6979       }
6980     }
6981     return;
6982   }
6983   if (depth == 21) {
6984     helper_float_21_recursive(buf + 0, 19);
6985     helper_float_21_recursive(buf + 524288, 19);
6986     helper_float_21_recursive(buf + 1048576, 19);
6987     helper_float_21_recursive(buf + 1572864, 19);
6988     for (int j = 0; j < 2097152; j += 2097152) {
6989       for (int k = 0; k < 524288; k += 4) {
6990         __asm__ volatile (
6991           "movups (%0), %%xmm0\n"
6992           "movups (%1), %%xmm1\n"
6993           "movups (%2), %%xmm2\n"
6994           "movups (%3), %%xmm3\n"
6995           "movaps %%xmm0, %%xmm8\n"
6996           "movaps %%xmm0, %%xmm9\n"
6997           "addps %%xmm1, %%xmm8\n"
6998           "subps %%xmm1, %%xmm9\n"
6999           "movaps %%xmm2, %%xmm10\n"
7000           "movaps %%xmm2, %%xmm11\n"
7001           "addps %%xmm3, %%xmm10\n"
7002           "subps %%xmm3, %%xmm11\n"
7003           "movaps %%xmm8, %%xmm0\n"
7004           "movaps %%xmm8, %%xmm2\n"
7005           "addps %%xmm10, %%xmm0\n"
7006           "subps %%xmm10, %%xmm2\n"
7007           "movaps %%xmm9, %%xmm1\n"
7008           "movaps %%xmm9, %%xmm3\n"
7009           "addps %%xmm11, %%xmm1\n"
7010           "subps %%xmm11, %%xmm3\n"
7011           "movups %%xmm0, (%0)\n"
7012           "movups %%xmm1, (%1)\n"
7013           "movups %%xmm2, (%2)\n"
7014           "movups %%xmm3, (%3)\n"
7015           :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
7016         );
7017       }
7018     }
7019     return;
7020   }
7021 }
void helper_float_21(float *buf);
/*
 * Public entry point for the depth-21 transform: applies the in-place
 * butterfly network to the 2^21 (= 2097152) floats starting at buf by
 * handing the full depth to the recursive worker, which dispatches on
 * depth and tears the problem into cache-sized sub-transforms.
 *
 * NOTE(review): buf is presumed to hold at least 2097152 floats; the
 * recursive helper indexes up to buf + 2097151 at this depth.
 */
void helper_float_21(float *buf) {
  /* Depth 21 selects the top-level branch of the recursive kernel. */
  helper_float_21_recursive(buf, 21);
}
7026 void helper_float_22_recursive(float *buf, int depth);
helper_float_22_recursive(float * buf,int depth)7027 void helper_float_22_recursive(float *buf, int depth) {
7028   if (depth == 11) {
7029     for (int j = 0; j < 2048; j += 32) {
7030       for (int k = 0; k < 4; k += 4) {
7031         __asm__ volatile (
7032           "movups (%0), %%xmm0\n"
7033           "movups (%1), %%xmm1\n"
7034           "movups (%2), %%xmm2\n"
7035           "movups (%3), %%xmm3\n"
7036           "movups (%4), %%xmm4\n"
7037           "movups (%5), %%xmm5\n"
7038           "movups (%6), %%xmm6\n"
7039           "movups (%7), %%xmm7\n"
7040           "movaps %%xmm0, %%xmm8\n"
7041           "shufps $160, %%xmm8, %%xmm8\n"
7042           "shufps $245, %%xmm0, %%xmm0\n"
7043           "xorps %%xmm9, %%xmm9\n"
7044           "subps %%xmm0, %%xmm9\n"
7045           "addsubps %%xmm9, %%xmm8\n"
7046           "movaps %%xmm8, %%xmm0\n"
7047           "movaps %%xmm1, %%xmm8\n"
7048           "shufps $160, %%xmm8, %%xmm8\n"
7049           "shufps $245, %%xmm1, %%xmm1\n"
7050           "xorps %%xmm9, %%xmm9\n"
7051           "subps %%xmm1, %%xmm9\n"
7052           "addsubps %%xmm9, %%xmm8\n"
7053           "movaps %%xmm8, %%xmm1\n"
7054           "movaps %%xmm2, %%xmm8\n"
7055           "shufps $160, %%xmm8, %%xmm8\n"
7056           "shufps $245, %%xmm2, %%xmm2\n"
7057           "xorps %%xmm9, %%xmm9\n"
7058           "subps %%xmm2, %%xmm9\n"
7059           "addsubps %%xmm9, %%xmm8\n"
7060           "movaps %%xmm8, %%xmm2\n"
7061           "movaps %%xmm3, %%xmm8\n"
7062           "shufps $160, %%xmm8, %%xmm8\n"
7063           "shufps $245, %%xmm3, %%xmm3\n"
7064           "xorps %%xmm9, %%xmm9\n"
7065           "subps %%xmm3, %%xmm9\n"
7066           "addsubps %%xmm9, %%xmm8\n"
7067           "movaps %%xmm8, %%xmm3\n"
7068           "movaps %%xmm4, %%xmm8\n"
7069           "shufps $160, %%xmm8, %%xmm8\n"
7070           "shufps $245, %%xmm4, %%xmm4\n"
7071           "xorps %%xmm9, %%xmm9\n"
7072           "subps %%xmm4, %%xmm9\n"
7073           "addsubps %%xmm9, %%xmm8\n"
7074           "movaps %%xmm8, %%xmm4\n"
7075           "movaps %%xmm5, %%xmm8\n"
7076           "shufps $160, %%xmm8, %%xmm8\n"
7077           "shufps $245, %%xmm5, %%xmm5\n"
7078           "xorps %%xmm9, %%xmm9\n"
7079           "subps %%xmm5, %%xmm9\n"
7080           "addsubps %%xmm9, %%xmm8\n"
7081           "movaps %%xmm8, %%xmm5\n"
7082           "movaps %%xmm6, %%xmm8\n"
7083           "shufps $160, %%xmm8, %%xmm8\n"
7084           "shufps $245, %%xmm6, %%xmm6\n"
7085           "xorps %%xmm9, %%xmm9\n"
7086           "subps %%xmm6, %%xmm9\n"
7087           "addsubps %%xmm9, %%xmm8\n"
7088           "movaps %%xmm8, %%xmm6\n"
7089           "movaps %%xmm7, %%xmm8\n"
7090           "shufps $160, %%xmm8, %%xmm8\n"
7091           "shufps $245, %%xmm7, %%xmm7\n"
7092           "xorps %%xmm9, %%xmm9\n"
7093           "subps %%xmm7, %%xmm9\n"
7094           "addsubps %%xmm9, %%xmm8\n"
7095           "movaps %%xmm8, %%xmm7\n"
7096           "movaps %%xmm0, %%xmm8\n"
7097           "shufps $68, %%xmm8, %%xmm8\n"
7098           "xorps %%xmm9, %%xmm9\n"
7099           "movaps %%xmm0, %%xmm10\n"
7100           "shufps $14, %%xmm9, %%xmm10\n"
7101           "movaps %%xmm0, %%xmm11\n"
7102           "shufps $224, %%xmm11, %%xmm9\n"
7103           "addps %%xmm8, %%xmm10\n"
7104           "subps %%xmm9, %%xmm10\n"
7105           "movaps %%xmm10, %%xmm0\n"
7106           "movaps %%xmm1, %%xmm8\n"
7107           "shufps $68, %%xmm8, %%xmm8\n"
7108           "xorps %%xmm9, %%xmm9\n"
7109           "movaps %%xmm1, %%xmm10\n"
7110           "shufps $14, %%xmm9, %%xmm10\n"
7111           "movaps %%xmm1, %%xmm11\n"
7112           "shufps $224, %%xmm11, %%xmm9\n"
7113           "addps %%xmm8, %%xmm10\n"
7114           "subps %%xmm9, %%xmm10\n"
7115           "movaps %%xmm10, %%xmm1\n"
7116           "movaps %%xmm2, %%xmm8\n"
7117           "shufps $68, %%xmm8, %%xmm8\n"
7118           "xorps %%xmm9, %%xmm9\n"
7119           "movaps %%xmm2, %%xmm10\n"
7120           "shufps $14, %%xmm9, %%xmm10\n"
7121           "movaps %%xmm2, %%xmm11\n"
7122           "shufps $224, %%xmm11, %%xmm9\n"
7123           "addps %%xmm8, %%xmm10\n"
7124           "subps %%xmm9, %%xmm10\n"
7125           "movaps %%xmm10, %%xmm2\n"
7126           "movaps %%xmm3, %%xmm8\n"
7127           "shufps $68, %%xmm8, %%xmm8\n"
7128           "xorps %%xmm9, %%xmm9\n"
7129           "movaps %%xmm3, %%xmm10\n"
7130           "shufps $14, %%xmm9, %%xmm10\n"
7131           "movaps %%xmm3, %%xmm11\n"
7132           "shufps $224, %%xmm11, %%xmm9\n"
7133           "addps %%xmm8, %%xmm10\n"
7134           "subps %%xmm9, %%xmm10\n"
7135           "movaps %%xmm10, %%xmm3\n"
7136           "movaps %%xmm4, %%xmm8\n"
7137           "shufps $68, %%xmm8, %%xmm8\n"
7138           "xorps %%xmm9, %%xmm9\n"
7139           "movaps %%xmm4, %%xmm10\n"
7140           "shufps $14, %%xmm9, %%xmm10\n"
7141           "movaps %%xmm4, %%xmm11\n"
7142           "shufps $224, %%xmm11, %%xmm9\n"
7143           "addps %%xmm8, %%xmm10\n"
7144           "subps %%xmm9, %%xmm10\n"
7145           "movaps %%xmm10, %%xmm4\n"
7146           "movaps %%xmm5, %%xmm8\n"
7147           "shufps $68, %%xmm8, %%xmm8\n"
7148           "xorps %%xmm9, %%xmm9\n"
7149           "movaps %%xmm5, %%xmm10\n"
7150           "shufps $14, %%xmm9, %%xmm10\n"
7151           "movaps %%xmm5, %%xmm11\n"
7152           "shufps $224, %%xmm11, %%xmm9\n"
7153           "addps %%xmm8, %%xmm10\n"
7154           "subps %%xmm9, %%xmm10\n"
7155           "movaps %%xmm10, %%xmm5\n"
7156           "movaps %%xmm6, %%xmm8\n"
7157           "shufps $68, %%xmm8, %%xmm8\n"
7158           "xorps %%xmm9, %%xmm9\n"
7159           "movaps %%xmm6, %%xmm10\n"
7160           "shufps $14, %%xmm9, %%xmm10\n"
7161           "movaps %%xmm6, %%xmm11\n"
7162           "shufps $224, %%xmm11, %%xmm9\n"
7163           "addps %%xmm8, %%xmm10\n"
7164           "subps %%xmm9, %%xmm10\n"
7165           "movaps %%xmm10, %%xmm6\n"
7166           "movaps %%xmm7, %%xmm8\n"
7167           "shufps $68, %%xmm8, %%xmm8\n"
7168           "xorps %%xmm9, %%xmm9\n"
7169           "movaps %%xmm7, %%xmm10\n"
7170           "shufps $14, %%xmm9, %%xmm10\n"
7171           "movaps %%xmm7, %%xmm11\n"
7172           "shufps $224, %%xmm11, %%xmm9\n"
7173           "addps %%xmm8, %%xmm10\n"
7174           "subps %%xmm9, %%xmm10\n"
7175           "movaps %%xmm10, %%xmm7\n"
7176           "movaps %%xmm0, %%xmm8\n"
7177           "movaps %%xmm0, %%xmm9\n"
7178           "addps %%xmm1, %%xmm8\n"
7179           "subps %%xmm1, %%xmm9\n"
7180           "movaps %%xmm2, %%xmm10\n"
7181           "movaps %%xmm2, %%xmm11\n"
7182           "addps %%xmm3, %%xmm10\n"
7183           "subps %%xmm3, %%xmm11\n"
7184           "movaps %%xmm4, %%xmm12\n"
7185           "movaps %%xmm4, %%xmm13\n"
7186           "addps %%xmm5, %%xmm12\n"
7187           "subps %%xmm5, %%xmm13\n"
7188           "movaps %%xmm6, %%xmm14\n"
7189           "movaps %%xmm6, %%xmm15\n"
7190           "addps %%xmm7, %%xmm14\n"
7191           "subps %%xmm7, %%xmm15\n"
7192           "movaps %%xmm8, %%xmm0\n"
7193           "movaps %%xmm8, %%xmm2\n"
7194           "addps %%xmm10, %%xmm0\n"
7195           "subps %%xmm10, %%xmm2\n"
7196           "movaps %%xmm9, %%xmm1\n"
7197           "movaps %%xmm9, %%xmm3\n"
7198           "addps %%xmm11, %%xmm1\n"
7199           "subps %%xmm11, %%xmm3\n"
7200           "movaps %%xmm12, %%xmm4\n"
7201           "movaps %%xmm12, %%xmm6\n"
7202           "addps %%xmm14, %%xmm4\n"
7203           "subps %%xmm14, %%xmm6\n"
7204           "movaps %%xmm13, %%xmm5\n"
7205           "movaps %%xmm13, %%xmm7\n"
7206           "addps %%xmm15, %%xmm5\n"
7207           "subps %%xmm15, %%xmm7\n"
7208           "movaps %%xmm0, %%xmm8\n"
7209           "movaps %%xmm0, %%xmm12\n"
7210           "addps %%xmm4, %%xmm8\n"
7211           "subps %%xmm4, %%xmm12\n"
7212           "movaps %%xmm1, %%xmm9\n"
7213           "movaps %%xmm1, %%xmm13\n"
7214           "addps %%xmm5, %%xmm9\n"
7215           "subps %%xmm5, %%xmm13\n"
7216           "movaps %%xmm2, %%xmm10\n"
7217           "movaps %%xmm2, %%xmm14\n"
7218           "addps %%xmm6, %%xmm10\n"
7219           "subps %%xmm6, %%xmm14\n"
7220           "movaps %%xmm3, %%xmm11\n"
7221           "movaps %%xmm3, %%xmm15\n"
7222           "addps %%xmm7, %%xmm11\n"
7223           "subps %%xmm7, %%xmm15\n"
7224           "movups %%xmm8, (%0)\n"
7225           "movups %%xmm9, (%1)\n"
7226           "movups %%xmm10, (%2)\n"
7227           "movups %%xmm11, (%3)\n"
7228           "movups %%xmm12, (%4)\n"
7229           "movups %%xmm13, (%5)\n"
7230           "movups %%xmm14, (%6)\n"
7231           "movups %%xmm15, (%7)\n"
7232           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
7233         );
7234       }
7235     }
7236     for (int j = 0; j < 2048; j += 256) {
7237       for (int k = 0; k < 32; k += 4) {
7238         __asm__ volatile (
7239           "movups (%0), %%xmm0\n"
7240           "movups (%1), %%xmm1\n"
7241           "movups (%2), %%xmm2\n"
7242           "movups (%3), %%xmm3\n"
7243           "movups (%4), %%xmm4\n"
7244           "movups (%5), %%xmm5\n"
7245           "movups (%6), %%xmm6\n"
7246           "movups (%7), %%xmm7\n"
7247           "movaps %%xmm0, %%xmm8\n"
7248           "movaps %%xmm0, %%xmm9\n"
7249           "addps %%xmm1, %%xmm8\n"
7250           "subps %%xmm1, %%xmm9\n"
7251           "movaps %%xmm2, %%xmm10\n"
7252           "movaps %%xmm2, %%xmm11\n"
7253           "addps %%xmm3, %%xmm10\n"
7254           "subps %%xmm3, %%xmm11\n"
7255           "movaps %%xmm4, %%xmm12\n"
7256           "movaps %%xmm4, %%xmm13\n"
7257           "addps %%xmm5, %%xmm12\n"
7258           "subps %%xmm5, %%xmm13\n"
7259           "movaps %%xmm6, %%xmm14\n"
7260           "movaps %%xmm6, %%xmm15\n"
7261           "addps %%xmm7, %%xmm14\n"
7262           "subps %%xmm7, %%xmm15\n"
7263           "movaps %%xmm8, %%xmm0\n"
7264           "movaps %%xmm8, %%xmm2\n"
7265           "addps %%xmm10, %%xmm0\n"
7266           "subps %%xmm10, %%xmm2\n"
7267           "movaps %%xmm9, %%xmm1\n"
7268           "movaps %%xmm9, %%xmm3\n"
7269           "addps %%xmm11, %%xmm1\n"
7270           "subps %%xmm11, %%xmm3\n"
7271           "movaps %%xmm12, %%xmm4\n"
7272           "movaps %%xmm12, %%xmm6\n"
7273           "addps %%xmm14, %%xmm4\n"
7274           "subps %%xmm14, %%xmm6\n"
7275           "movaps %%xmm13, %%xmm5\n"
7276           "movaps %%xmm13, %%xmm7\n"
7277           "addps %%xmm15, %%xmm5\n"
7278           "subps %%xmm15, %%xmm7\n"
7279           "movaps %%xmm0, %%xmm8\n"
7280           "movaps %%xmm0, %%xmm12\n"
7281           "addps %%xmm4, %%xmm8\n"
7282           "subps %%xmm4, %%xmm12\n"
7283           "movaps %%xmm1, %%xmm9\n"
7284           "movaps %%xmm1, %%xmm13\n"
7285           "addps %%xmm5, %%xmm9\n"
7286           "subps %%xmm5, %%xmm13\n"
7287           "movaps %%xmm2, %%xmm10\n"
7288           "movaps %%xmm2, %%xmm14\n"
7289           "addps %%xmm6, %%xmm10\n"
7290           "subps %%xmm6, %%xmm14\n"
7291           "movaps %%xmm3, %%xmm11\n"
7292           "movaps %%xmm3, %%xmm15\n"
7293           "addps %%xmm7, %%xmm11\n"
7294           "subps %%xmm7, %%xmm15\n"
7295           "movups %%xmm8, (%0)\n"
7296           "movups %%xmm9, (%1)\n"
7297           "movups %%xmm10, (%2)\n"
7298           "movups %%xmm11, (%3)\n"
7299           "movups %%xmm12, (%4)\n"
7300           "movups %%xmm13, (%5)\n"
7301           "movups %%xmm14, (%6)\n"
7302           "movups %%xmm15, (%7)\n"
7303           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
7304         );
7305       }
7306     }
7307     for (int j = 0; j < 2048; j += 2048) {
7308       for (int k = 0; k < 256; k += 4) {
7309         __asm__ volatile (
7310           "movups (%0), %%xmm0\n"
7311           "movups (%1), %%xmm1\n"
7312           "movups (%2), %%xmm2\n"
7313           "movups (%3), %%xmm3\n"
7314           "movups (%4), %%xmm4\n"
7315           "movups (%5), %%xmm5\n"
7316           "movups (%6), %%xmm6\n"
7317           "movups (%7), %%xmm7\n"
7318           "movaps %%xmm0, %%xmm8\n"
7319           "movaps %%xmm0, %%xmm9\n"
7320           "addps %%xmm1, %%xmm8\n"
7321           "subps %%xmm1, %%xmm9\n"
7322           "movaps %%xmm2, %%xmm10\n"
7323           "movaps %%xmm2, %%xmm11\n"
7324           "addps %%xmm3, %%xmm10\n"
7325           "subps %%xmm3, %%xmm11\n"
7326           "movaps %%xmm4, %%xmm12\n"
7327           "movaps %%xmm4, %%xmm13\n"
7328           "addps %%xmm5, %%xmm12\n"
7329           "subps %%xmm5, %%xmm13\n"
7330           "movaps %%xmm6, %%xmm14\n"
7331           "movaps %%xmm6, %%xmm15\n"
7332           "addps %%xmm7, %%xmm14\n"
7333           "subps %%xmm7, %%xmm15\n"
7334           "movaps %%xmm8, %%xmm0\n"
7335           "movaps %%xmm8, %%xmm2\n"
7336           "addps %%xmm10, %%xmm0\n"
7337           "subps %%xmm10, %%xmm2\n"
7338           "movaps %%xmm9, %%xmm1\n"
7339           "movaps %%xmm9, %%xmm3\n"
7340           "addps %%xmm11, %%xmm1\n"
7341           "subps %%xmm11, %%xmm3\n"
7342           "movaps %%xmm12, %%xmm4\n"
7343           "movaps %%xmm12, %%xmm6\n"
7344           "addps %%xmm14, %%xmm4\n"
7345           "subps %%xmm14, %%xmm6\n"
7346           "movaps %%xmm13, %%xmm5\n"
7347           "movaps %%xmm13, %%xmm7\n"
7348           "addps %%xmm15, %%xmm5\n"
7349           "subps %%xmm15, %%xmm7\n"
7350           "movaps %%xmm0, %%xmm8\n"
7351           "movaps %%xmm0, %%xmm12\n"
7352           "addps %%xmm4, %%xmm8\n"
7353           "subps %%xmm4, %%xmm12\n"
7354           "movaps %%xmm1, %%xmm9\n"
7355           "movaps %%xmm1, %%xmm13\n"
7356           "addps %%xmm5, %%xmm9\n"
7357           "subps %%xmm5, %%xmm13\n"
7358           "movaps %%xmm2, %%xmm10\n"
7359           "movaps %%xmm2, %%xmm14\n"
7360           "addps %%xmm6, %%xmm10\n"
7361           "subps %%xmm6, %%xmm14\n"
7362           "movaps %%xmm3, %%xmm11\n"
7363           "movaps %%xmm3, %%xmm15\n"
7364           "addps %%xmm7, %%xmm11\n"
7365           "subps %%xmm7, %%xmm15\n"
7366           "movups %%xmm8, (%0)\n"
7367           "movups %%xmm9, (%1)\n"
7368           "movups %%xmm10, (%2)\n"
7369           "movups %%xmm11, (%3)\n"
7370           "movups %%xmm12, (%4)\n"
7371           "movups %%xmm13, (%5)\n"
7372           "movups %%xmm14, (%6)\n"
7373           "movups %%xmm15, (%7)\n"
7374           :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
7375         );
7376       }
7377     }
7378     return;
7379   }
7380   if (depth == 14) {
7381     helper_float_22_recursive(buf + 0, 11);
7382     helper_float_22_recursive(buf + 2048, 11);
7383     helper_float_22_recursive(buf + 4096, 11);
7384     helper_float_22_recursive(buf + 6144, 11);
7385     helper_float_22_recursive(buf + 8192, 11);
7386     helper_float_22_recursive(buf + 10240, 11);
7387     helper_float_22_recursive(buf + 12288, 11);
7388     helper_float_22_recursive(buf + 14336, 11);
7389     for (int j = 0; j < 16384; j += 16384) {
7390       for (int k = 0; k < 2048; k += 4) {
7391         __asm__ volatile (
7392           "movups (%0), %%xmm0\n"
7393           "movups (%1), %%xmm1\n"
7394           "movups (%2), %%xmm2\n"
7395           "movups (%3), %%xmm3\n"
7396           "movups (%4), %%xmm4\n"
7397           "movups (%5), %%xmm5\n"
7398           "movups (%6), %%xmm6\n"
7399           "movups (%7), %%xmm7\n"
7400           "movaps %%xmm0, %%xmm8\n"
7401           "movaps %%xmm0, %%xmm9\n"
7402           "addps %%xmm1, %%xmm8\n"
7403           "subps %%xmm1, %%xmm9\n"
7404           "movaps %%xmm2, %%xmm10\n"
7405           "movaps %%xmm2, %%xmm11\n"
7406           "addps %%xmm3, %%xmm10\n"
7407           "subps %%xmm3, %%xmm11\n"
7408           "movaps %%xmm4, %%xmm12\n"
7409           "movaps %%xmm4, %%xmm13\n"
7410           "addps %%xmm5, %%xmm12\n"
7411           "subps %%xmm5, %%xmm13\n"
7412           "movaps %%xmm6, %%xmm14\n"
7413           "movaps %%xmm6, %%xmm15\n"
7414           "addps %%xmm7, %%xmm14\n"
7415           "subps %%xmm7, %%xmm15\n"
7416           "movaps %%xmm8, %%xmm0\n"
7417           "movaps %%xmm8, %%xmm2\n"
7418           "addps %%xmm10, %%xmm0\n"
7419           "subps %%xmm10, %%xmm2\n"
7420           "movaps %%xmm9, %%xmm1\n"
7421           "movaps %%xmm9, %%xmm3\n"
7422           "addps %%xmm11, %%xmm1\n"
7423           "subps %%xmm11, %%xmm3\n"
7424           "movaps %%xmm12, %%xmm4\n"
7425           "movaps %%xmm12, %%xmm6\n"
7426           "addps %%xmm14, %%xmm4\n"
7427           "subps %%xmm14, %%xmm6\n"
7428           "movaps %%xmm13, %%xmm5\n"
7429           "movaps %%xmm13, %%xmm7\n"
7430           "addps %%xmm15, %%xmm5\n"
7431           "subps %%xmm15, %%xmm7\n"
7432           "movaps %%xmm0, %%xmm8\n"
7433           "movaps %%xmm0, %%xmm12\n"
7434           "addps %%xmm4, %%xmm8\n"
7435           "subps %%xmm4, %%xmm12\n"
7436           "movaps %%xmm1, %%xmm9\n"
7437           "movaps %%xmm1, %%xmm13\n"
7438           "addps %%xmm5, %%xmm9\n"
7439           "subps %%xmm5, %%xmm13\n"
7440           "movaps %%xmm2, %%xmm10\n"
7441           "movaps %%xmm2, %%xmm14\n"
7442           "addps %%xmm6, %%xmm10\n"
7443           "subps %%xmm6, %%xmm14\n"
7444           "movaps %%xmm3, %%xmm11\n"
7445           "movaps %%xmm3, %%xmm15\n"
7446           "addps %%xmm7, %%xmm11\n"
7447           "subps %%xmm7, %%xmm15\n"
7448           "movups %%xmm8, (%0)\n"
7449           "movups %%xmm9, (%1)\n"
7450           "movups %%xmm10, (%2)\n"
7451           "movups %%xmm11, (%3)\n"
7452           "movups %%xmm12, (%4)\n"
7453           "movups %%xmm13, (%5)\n"
7454           "movups %%xmm14, (%6)\n"
7455           "movups %%xmm15, (%7)\n"
7456           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
7457         );
7458       }
7459     }
7460     return;
7461   }
7462   if (depth == 17) {
7463     helper_float_22_recursive(buf + 0, 14);
7464     helper_float_22_recursive(buf + 16384, 14);
7465     helper_float_22_recursive(buf + 32768, 14);
7466     helper_float_22_recursive(buf + 49152, 14);
7467     helper_float_22_recursive(buf + 65536, 14);
7468     helper_float_22_recursive(buf + 81920, 14);
7469     helper_float_22_recursive(buf + 98304, 14);
7470     helper_float_22_recursive(buf + 114688, 14);
7471     for (int j = 0; j < 131072; j += 131072) {
7472       for (int k = 0; k < 16384; k += 4) {
7473         __asm__ volatile (
7474           "movups (%0), %%xmm0\n"
7475           "movups (%1), %%xmm1\n"
7476           "movups (%2), %%xmm2\n"
7477           "movups (%3), %%xmm3\n"
7478           "movups (%4), %%xmm4\n"
7479           "movups (%5), %%xmm5\n"
7480           "movups (%6), %%xmm6\n"
7481           "movups (%7), %%xmm7\n"
7482           "movaps %%xmm0, %%xmm8\n"
7483           "movaps %%xmm0, %%xmm9\n"
7484           "addps %%xmm1, %%xmm8\n"
7485           "subps %%xmm1, %%xmm9\n"
7486           "movaps %%xmm2, %%xmm10\n"
7487           "movaps %%xmm2, %%xmm11\n"
7488           "addps %%xmm3, %%xmm10\n"
7489           "subps %%xmm3, %%xmm11\n"
7490           "movaps %%xmm4, %%xmm12\n"
7491           "movaps %%xmm4, %%xmm13\n"
7492           "addps %%xmm5, %%xmm12\n"
7493           "subps %%xmm5, %%xmm13\n"
7494           "movaps %%xmm6, %%xmm14\n"
7495           "movaps %%xmm6, %%xmm15\n"
7496           "addps %%xmm7, %%xmm14\n"
7497           "subps %%xmm7, %%xmm15\n"
7498           "movaps %%xmm8, %%xmm0\n"
7499           "movaps %%xmm8, %%xmm2\n"
7500           "addps %%xmm10, %%xmm0\n"
7501           "subps %%xmm10, %%xmm2\n"
7502           "movaps %%xmm9, %%xmm1\n"
7503           "movaps %%xmm9, %%xmm3\n"
7504           "addps %%xmm11, %%xmm1\n"
7505           "subps %%xmm11, %%xmm3\n"
7506           "movaps %%xmm12, %%xmm4\n"
7507           "movaps %%xmm12, %%xmm6\n"
7508           "addps %%xmm14, %%xmm4\n"
7509           "subps %%xmm14, %%xmm6\n"
7510           "movaps %%xmm13, %%xmm5\n"
7511           "movaps %%xmm13, %%xmm7\n"
7512           "addps %%xmm15, %%xmm5\n"
7513           "subps %%xmm15, %%xmm7\n"
7514           "movaps %%xmm0, %%xmm8\n"
7515           "movaps %%xmm0, %%xmm12\n"
7516           "addps %%xmm4, %%xmm8\n"
7517           "subps %%xmm4, %%xmm12\n"
7518           "movaps %%xmm1, %%xmm9\n"
7519           "movaps %%xmm1, %%xmm13\n"
7520           "addps %%xmm5, %%xmm9\n"
7521           "subps %%xmm5, %%xmm13\n"
7522           "movaps %%xmm2, %%xmm10\n"
7523           "movaps %%xmm2, %%xmm14\n"
7524           "addps %%xmm6, %%xmm10\n"
7525           "subps %%xmm6, %%xmm14\n"
7526           "movaps %%xmm3, %%xmm11\n"
7527           "movaps %%xmm3, %%xmm15\n"
7528           "addps %%xmm7, %%xmm11\n"
7529           "subps %%xmm7, %%xmm15\n"
7530           "movups %%xmm8, (%0)\n"
7531           "movups %%xmm9, (%1)\n"
7532           "movups %%xmm10, (%2)\n"
7533           "movups %%xmm11, (%3)\n"
7534           "movups %%xmm12, (%4)\n"
7535           "movups %%xmm13, (%5)\n"
7536           "movups %%xmm14, (%6)\n"
7537           "movups %%xmm15, (%7)\n"
7538           :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
7539         );
7540       }
7541     }
7542     return;
7543   }
7544   if (depth == 20) {
7545     helper_float_22_recursive(buf + 0, 17);
7546     helper_float_22_recursive(buf + 131072, 17);
7547     helper_float_22_recursive(buf + 262144, 17);
7548     helper_float_22_recursive(buf + 393216, 17);
7549     helper_float_22_recursive(buf + 524288, 17);
7550     helper_float_22_recursive(buf + 655360, 17);
7551     helper_float_22_recursive(buf + 786432, 17);
7552     helper_float_22_recursive(buf + 917504, 17);
7553     for (int j = 0; j < 1048576; j += 1048576) {
7554       for (int k = 0; k < 131072; k += 4) {
7555         __asm__ volatile (
7556           "movups (%0), %%xmm0\n"
7557           "movups (%1), %%xmm1\n"
7558           "movups (%2), %%xmm2\n"
7559           "movups (%3), %%xmm3\n"
7560           "movups (%4), %%xmm4\n"
7561           "movups (%5), %%xmm5\n"
7562           "movups (%6), %%xmm6\n"
7563           "movups (%7), %%xmm7\n"
7564           "movaps %%xmm0, %%xmm8\n"
7565           "movaps %%xmm0, %%xmm9\n"
7566           "addps %%xmm1, %%xmm8\n"
7567           "subps %%xmm1, %%xmm9\n"
7568           "movaps %%xmm2, %%xmm10\n"
7569           "movaps %%xmm2, %%xmm11\n"
7570           "addps %%xmm3, %%xmm10\n"
7571           "subps %%xmm3, %%xmm11\n"
7572           "movaps %%xmm4, %%xmm12\n"
7573           "movaps %%xmm4, %%xmm13\n"
7574           "addps %%xmm5, %%xmm12\n"
7575           "subps %%xmm5, %%xmm13\n"
7576           "movaps %%xmm6, %%xmm14\n"
7577           "movaps %%xmm6, %%xmm15\n"
7578           "addps %%xmm7, %%xmm14\n"
7579           "subps %%xmm7, %%xmm15\n"
7580           "movaps %%xmm8, %%xmm0\n"
7581           "movaps %%xmm8, %%xmm2\n"
7582           "addps %%xmm10, %%xmm0\n"
7583           "subps %%xmm10, %%xmm2\n"
7584           "movaps %%xmm9, %%xmm1\n"
7585           "movaps %%xmm9, %%xmm3\n"
7586           "addps %%xmm11, %%xmm1\n"
7587           "subps %%xmm11, %%xmm3\n"
7588           "movaps %%xmm12, %%xmm4\n"
7589           "movaps %%xmm12, %%xmm6\n"
7590           "addps %%xmm14, %%xmm4\n"
7591           "subps %%xmm14, %%xmm6\n"
7592           "movaps %%xmm13, %%xmm5\n"
7593           "movaps %%xmm13, %%xmm7\n"
7594           "addps %%xmm15, %%xmm5\n"
7595           "subps %%xmm15, %%xmm7\n"
7596           "movaps %%xmm0, %%xmm8\n"
7597           "movaps %%xmm0, %%xmm12\n"
7598           "addps %%xmm4, %%xmm8\n"
7599           "subps %%xmm4, %%xmm12\n"
7600           "movaps %%xmm1, %%xmm9\n"
7601           "movaps %%xmm1, %%xmm13\n"
7602           "addps %%xmm5, %%xmm9\n"
7603           "subps %%xmm5, %%xmm13\n"
7604           "movaps %%xmm2, %%xmm10\n"
7605           "movaps %%xmm2, %%xmm14\n"
7606           "addps %%xmm6, %%xmm10\n"
7607           "subps %%xmm6, %%xmm14\n"
7608           "movaps %%xmm3, %%xmm11\n"
7609           "movaps %%xmm3, %%xmm15\n"
7610           "addps %%xmm7, %%xmm11\n"
7611           "subps %%xmm7, %%xmm15\n"
7612           "movups %%xmm8, (%0)\n"
7613           "movups %%xmm9, (%1)\n"
7614           "movups %%xmm10, (%2)\n"
7615           "movups %%xmm11, (%3)\n"
7616           "movups %%xmm12, (%4)\n"
7617           "movups %%xmm13, (%5)\n"
7618           "movups %%xmm14, (%6)\n"
7619           "movups %%xmm15, (%7)\n"
7620           :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
7621         );
7622       }
7623     }
7624     return;
7625   }
7626   if (depth == 22) {
7627     helper_float_22_recursive(buf + 0, 20);
7628     helper_float_22_recursive(buf + 1048576, 20);
7629     helper_float_22_recursive(buf + 2097152, 20);
7630     helper_float_22_recursive(buf + 3145728, 20);
7631     for (int j = 0; j < 4194304; j += 4194304) {
7632       for (int k = 0; k < 1048576; k += 4) {
7633         __asm__ volatile (
7634           "movups (%0), %%xmm0\n"
7635           "movups (%1), %%xmm1\n"
7636           "movups (%2), %%xmm2\n"
7637           "movups (%3), %%xmm3\n"
7638           "movaps %%xmm0, %%xmm8\n"
7639           "movaps %%xmm0, %%xmm9\n"
7640           "addps %%xmm1, %%xmm8\n"
7641           "subps %%xmm1, %%xmm9\n"
7642           "movaps %%xmm2, %%xmm10\n"
7643           "movaps %%xmm2, %%xmm11\n"
7644           "addps %%xmm3, %%xmm10\n"
7645           "subps %%xmm3, %%xmm11\n"
7646           "movaps %%xmm8, %%xmm0\n"
7647           "movaps %%xmm8, %%xmm2\n"
7648           "addps %%xmm10, %%xmm0\n"
7649           "subps %%xmm10, %%xmm2\n"
7650           "movaps %%xmm9, %%xmm1\n"
7651           "movaps %%xmm9, %%xmm3\n"
7652           "addps %%xmm11, %%xmm1\n"
7653           "subps %%xmm11, %%xmm3\n"
7654           "movups %%xmm0, (%0)\n"
7655           "movups %%xmm1, (%1)\n"
7656           "movups %%xmm2, (%2)\n"
7657           "movups %%xmm3, (%3)\n"
7658           :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
7659         );
7660       }
7661     }
7662     return;
7663   }
7664 }
7665 void helper_float_22(float *buf);
/*
 * Public entry point for the size-2^22 (4,194,304-element) in-place float
 * transform; part of the FFHT (fast Hadamard transform) kernel family
 * generated in this file.  It simply dispatches to the recursive worker at
 * full depth 22, which performs the butterfly passes with SSE inline asm.
 *
 * buf: pointer to 2^22 contiguous floats, transformed in place.
 *      NOTE(review): the recursive helper loads/stores with movups, so no
 *      16-byte alignment appears to be required of buf — confirm against
 *      the library's public API contract.
 */
helper_float_22(float * buf)7666 void helper_float_22(float *buf) {
7667   helper_float_22_recursive(buf, 22);
7668 }
7669 void helper_float_23_recursive(float *buf, int depth);
helper_float_23_recursive(float * buf,int depth)7670 void helper_float_23_recursive(float *buf, int depth) {
7671   if (depth == 6) {
7672     for (int j = 0; j < 64; j += 32) {
7673       for (int k = 0; k < 4; k += 4) {
7674         __asm__ volatile (
7675           "movups (%0), %%xmm0\n"
7676           "movups (%1), %%xmm1\n"
7677           "movups (%2), %%xmm2\n"
7678           "movups (%3), %%xmm3\n"
7679           "movups (%4), %%xmm4\n"
7680           "movups (%5), %%xmm5\n"
7681           "movups (%6), %%xmm6\n"
7682           "movups (%7), %%xmm7\n"
7683           "movaps %%xmm0, %%xmm8\n"
7684           "shufps $160, %%xmm8, %%xmm8\n"
7685           "shufps $245, %%xmm0, %%xmm0\n"
7686           "xorps %%xmm9, %%xmm9\n"
7687           "subps %%xmm0, %%xmm9\n"
7688           "addsubps %%xmm9, %%xmm8\n"
7689           "movaps %%xmm8, %%xmm0\n"
7690           "movaps %%xmm1, %%xmm8\n"
7691           "shufps $160, %%xmm8, %%xmm8\n"
7692           "shufps $245, %%xmm1, %%xmm1\n"
7693           "xorps %%xmm9, %%xmm9\n"
7694           "subps %%xmm1, %%xmm9\n"
7695           "addsubps %%xmm9, %%xmm8\n"
7696           "movaps %%xmm8, %%xmm1\n"
7697           "movaps %%xmm2, %%xmm8\n"
7698           "shufps $160, %%xmm8, %%xmm8\n"
7699           "shufps $245, %%xmm2, %%xmm2\n"
7700           "xorps %%xmm9, %%xmm9\n"
7701           "subps %%xmm2, %%xmm9\n"
7702           "addsubps %%xmm9, %%xmm8\n"
7703           "movaps %%xmm8, %%xmm2\n"
7704           "movaps %%xmm3, %%xmm8\n"
7705           "shufps $160, %%xmm8, %%xmm8\n"
7706           "shufps $245, %%xmm3, %%xmm3\n"
7707           "xorps %%xmm9, %%xmm9\n"
7708           "subps %%xmm3, %%xmm9\n"
7709           "addsubps %%xmm9, %%xmm8\n"
7710           "movaps %%xmm8, %%xmm3\n"
7711           "movaps %%xmm4, %%xmm8\n"
7712           "shufps $160, %%xmm8, %%xmm8\n"
7713           "shufps $245, %%xmm4, %%xmm4\n"
7714           "xorps %%xmm9, %%xmm9\n"
7715           "subps %%xmm4, %%xmm9\n"
7716           "addsubps %%xmm9, %%xmm8\n"
7717           "movaps %%xmm8, %%xmm4\n"
7718           "movaps %%xmm5, %%xmm8\n"
7719           "shufps $160, %%xmm8, %%xmm8\n"
7720           "shufps $245, %%xmm5, %%xmm5\n"
7721           "xorps %%xmm9, %%xmm9\n"
7722           "subps %%xmm5, %%xmm9\n"
7723           "addsubps %%xmm9, %%xmm8\n"
7724           "movaps %%xmm8, %%xmm5\n"
7725           "movaps %%xmm6, %%xmm8\n"
7726           "shufps $160, %%xmm8, %%xmm8\n"
7727           "shufps $245, %%xmm6, %%xmm6\n"
7728           "xorps %%xmm9, %%xmm9\n"
7729           "subps %%xmm6, %%xmm9\n"
7730           "addsubps %%xmm9, %%xmm8\n"
7731           "movaps %%xmm8, %%xmm6\n"
7732           "movaps %%xmm7, %%xmm8\n"
7733           "shufps $160, %%xmm8, %%xmm8\n"
7734           "shufps $245, %%xmm7, %%xmm7\n"
7735           "xorps %%xmm9, %%xmm9\n"
7736           "subps %%xmm7, %%xmm9\n"
7737           "addsubps %%xmm9, %%xmm8\n"
7738           "movaps %%xmm8, %%xmm7\n"
7739           "movaps %%xmm0, %%xmm8\n"
7740           "shufps $68, %%xmm8, %%xmm8\n"
7741           "xorps %%xmm9, %%xmm9\n"
7742           "movaps %%xmm0, %%xmm10\n"
7743           "shufps $14, %%xmm9, %%xmm10\n"
7744           "movaps %%xmm0, %%xmm11\n"
7745           "shufps $224, %%xmm11, %%xmm9\n"
7746           "addps %%xmm8, %%xmm10\n"
7747           "subps %%xmm9, %%xmm10\n"
7748           "movaps %%xmm10, %%xmm0\n"
7749           "movaps %%xmm1, %%xmm8\n"
7750           "shufps $68, %%xmm8, %%xmm8\n"
7751           "xorps %%xmm9, %%xmm9\n"
7752           "movaps %%xmm1, %%xmm10\n"
7753           "shufps $14, %%xmm9, %%xmm10\n"
7754           "movaps %%xmm1, %%xmm11\n"
7755           "shufps $224, %%xmm11, %%xmm9\n"
7756           "addps %%xmm8, %%xmm10\n"
7757           "subps %%xmm9, %%xmm10\n"
7758           "movaps %%xmm10, %%xmm1\n"
7759           "movaps %%xmm2, %%xmm8\n"
7760           "shufps $68, %%xmm8, %%xmm8\n"
7761           "xorps %%xmm9, %%xmm9\n"
7762           "movaps %%xmm2, %%xmm10\n"
7763           "shufps $14, %%xmm9, %%xmm10\n"
7764           "movaps %%xmm2, %%xmm11\n"
7765           "shufps $224, %%xmm11, %%xmm9\n"
7766           "addps %%xmm8, %%xmm10\n"
7767           "subps %%xmm9, %%xmm10\n"
7768           "movaps %%xmm10, %%xmm2\n"
7769           "movaps %%xmm3, %%xmm8\n"
7770           "shufps $68, %%xmm8, %%xmm8\n"
7771           "xorps %%xmm9, %%xmm9\n"
7772           "movaps %%xmm3, %%xmm10\n"
7773           "shufps $14, %%xmm9, %%xmm10\n"
7774           "movaps %%xmm3, %%xmm11\n"
7775           "shufps $224, %%xmm11, %%xmm9\n"
7776           "addps %%xmm8, %%xmm10\n"
7777           "subps %%xmm9, %%xmm10\n"
7778           "movaps %%xmm10, %%xmm3\n"
7779           "movaps %%xmm4, %%xmm8\n"
7780           "shufps $68, %%xmm8, %%xmm8\n"
7781           "xorps %%xmm9, %%xmm9\n"
7782           "movaps %%xmm4, %%xmm10\n"
7783           "shufps $14, %%xmm9, %%xmm10\n"
7784           "movaps %%xmm4, %%xmm11\n"
7785           "shufps $224, %%xmm11, %%xmm9\n"
7786           "addps %%xmm8, %%xmm10\n"
7787           "subps %%xmm9, %%xmm10\n"
7788           "movaps %%xmm10, %%xmm4\n"
7789           "movaps %%xmm5, %%xmm8\n"
7790           "shufps $68, %%xmm8, %%xmm8\n"
7791           "xorps %%xmm9, %%xmm9\n"
7792           "movaps %%xmm5, %%xmm10\n"
7793           "shufps $14, %%xmm9, %%xmm10\n"
7794           "movaps %%xmm5, %%xmm11\n"
7795           "shufps $224, %%xmm11, %%xmm9\n"
7796           "addps %%xmm8, %%xmm10\n"
7797           "subps %%xmm9, %%xmm10\n"
7798           "movaps %%xmm10, %%xmm5\n"
7799           "movaps %%xmm6, %%xmm8\n"
7800           "shufps $68, %%xmm8, %%xmm8\n"
7801           "xorps %%xmm9, %%xmm9\n"
7802           "movaps %%xmm6, %%xmm10\n"
7803           "shufps $14, %%xmm9, %%xmm10\n"
7804           "movaps %%xmm6, %%xmm11\n"
7805           "shufps $224, %%xmm11, %%xmm9\n"
7806           "addps %%xmm8, %%xmm10\n"
7807           "subps %%xmm9, %%xmm10\n"
7808           "movaps %%xmm10, %%xmm6\n"
7809           "movaps %%xmm7, %%xmm8\n"
7810           "shufps $68, %%xmm8, %%xmm8\n"
7811           "xorps %%xmm9, %%xmm9\n"
7812           "movaps %%xmm7, %%xmm10\n"
7813           "shufps $14, %%xmm9, %%xmm10\n"
7814           "movaps %%xmm7, %%xmm11\n"
7815           "shufps $224, %%xmm11, %%xmm9\n"
7816           "addps %%xmm8, %%xmm10\n"
7817           "subps %%xmm9, %%xmm10\n"
7818           "movaps %%xmm10, %%xmm7\n"
7819           "movaps %%xmm0, %%xmm8\n"
7820           "movaps %%xmm0, %%xmm9\n"
7821           "addps %%xmm1, %%xmm8\n"
7822           "subps %%xmm1, %%xmm9\n"
7823           "movaps %%xmm2, %%xmm10\n"
7824           "movaps %%xmm2, %%xmm11\n"
7825           "addps %%xmm3, %%xmm10\n"
7826           "subps %%xmm3, %%xmm11\n"
7827           "movaps %%xmm4, %%xmm12\n"
7828           "movaps %%xmm4, %%xmm13\n"
7829           "addps %%xmm5, %%xmm12\n"
7830           "subps %%xmm5, %%xmm13\n"
7831           "movaps %%xmm6, %%xmm14\n"
7832           "movaps %%xmm6, %%xmm15\n"
7833           "addps %%xmm7, %%xmm14\n"
7834           "subps %%xmm7, %%xmm15\n"
7835           "movaps %%xmm8, %%xmm0\n"
7836           "movaps %%xmm8, %%xmm2\n"
7837           "addps %%xmm10, %%xmm0\n"
7838           "subps %%xmm10, %%xmm2\n"
7839           "movaps %%xmm9, %%xmm1\n"
7840           "movaps %%xmm9, %%xmm3\n"
7841           "addps %%xmm11, %%xmm1\n"
7842           "subps %%xmm11, %%xmm3\n"
7843           "movaps %%xmm12, %%xmm4\n"
7844           "movaps %%xmm12, %%xmm6\n"
7845           "addps %%xmm14, %%xmm4\n"
7846           "subps %%xmm14, %%xmm6\n"
7847           "movaps %%xmm13, %%xmm5\n"
7848           "movaps %%xmm13, %%xmm7\n"
7849           "addps %%xmm15, %%xmm5\n"
7850           "subps %%xmm15, %%xmm7\n"
7851           "movaps %%xmm0, %%xmm8\n"
7852           "movaps %%xmm0, %%xmm12\n"
7853           "addps %%xmm4, %%xmm8\n"
7854           "subps %%xmm4, %%xmm12\n"
7855           "movaps %%xmm1, %%xmm9\n"
7856           "movaps %%xmm1, %%xmm13\n"
7857           "addps %%xmm5, %%xmm9\n"
7858           "subps %%xmm5, %%xmm13\n"
7859           "movaps %%xmm2, %%xmm10\n"
7860           "movaps %%xmm2, %%xmm14\n"
7861           "addps %%xmm6, %%xmm10\n"
7862           "subps %%xmm6, %%xmm14\n"
7863           "movaps %%xmm3, %%xmm11\n"
7864           "movaps %%xmm3, %%xmm15\n"
7865           "addps %%xmm7, %%xmm11\n"
7866           "subps %%xmm7, %%xmm15\n"
7867           "movups %%xmm8, (%0)\n"
7868           "movups %%xmm9, (%1)\n"
7869           "movups %%xmm10, (%2)\n"
7870           "movups %%xmm11, (%3)\n"
7871           "movups %%xmm12, (%4)\n"
7872           "movups %%xmm13, (%5)\n"
7873           "movups %%xmm14, (%6)\n"
7874           "movups %%xmm15, (%7)\n"
7875           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
7876         );
7877       }
7878     }
7879     for (int j = 0; j < 64; j += 64) {
7880       for (int k = 0; k < 32; k += 4) {
7881         __asm__ volatile (
7882           "movups (%0), %%xmm0\n"
7883           "movups (%1), %%xmm1\n"
7884           "movaps %%xmm0, %%xmm8\n"
7885           "movaps %%xmm0, %%xmm9\n"
7886           "addps %%xmm1, %%xmm8\n"
7887           "subps %%xmm1, %%xmm9\n"
7888           "movups %%xmm8, (%0)\n"
7889           "movups %%xmm9, (%1)\n"
7890           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
7891         );
7892       }
7893     }
7894     return;
7895   }
7896   if (depth == 9) {
7897     helper_float_23_recursive(buf + 0, 6);
7898     helper_float_23_recursive(buf + 64, 6);
7899     helper_float_23_recursive(buf + 128, 6);
7900     helper_float_23_recursive(buf + 192, 6);
7901     helper_float_23_recursive(buf + 256, 6);
7902     helper_float_23_recursive(buf + 320, 6);
7903     helper_float_23_recursive(buf + 384, 6);
7904     helper_float_23_recursive(buf + 448, 6);
7905     for (int j = 0; j < 512; j += 512) {
7906       for (int k = 0; k < 64; k += 4) {
7907         __asm__ volatile (
7908           "movups (%0), %%xmm0\n"
7909           "movups (%1), %%xmm1\n"
7910           "movups (%2), %%xmm2\n"
7911           "movups (%3), %%xmm3\n"
7912           "movups (%4), %%xmm4\n"
7913           "movups (%5), %%xmm5\n"
7914           "movups (%6), %%xmm6\n"
7915           "movups (%7), %%xmm7\n"
7916           "movaps %%xmm0, %%xmm8\n"
7917           "movaps %%xmm0, %%xmm9\n"
7918           "addps %%xmm1, %%xmm8\n"
7919           "subps %%xmm1, %%xmm9\n"
7920           "movaps %%xmm2, %%xmm10\n"
7921           "movaps %%xmm2, %%xmm11\n"
7922           "addps %%xmm3, %%xmm10\n"
7923           "subps %%xmm3, %%xmm11\n"
7924           "movaps %%xmm4, %%xmm12\n"
7925           "movaps %%xmm4, %%xmm13\n"
7926           "addps %%xmm5, %%xmm12\n"
7927           "subps %%xmm5, %%xmm13\n"
7928           "movaps %%xmm6, %%xmm14\n"
7929           "movaps %%xmm6, %%xmm15\n"
7930           "addps %%xmm7, %%xmm14\n"
7931           "subps %%xmm7, %%xmm15\n"
7932           "movaps %%xmm8, %%xmm0\n"
7933           "movaps %%xmm8, %%xmm2\n"
7934           "addps %%xmm10, %%xmm0\n"
7935           "subps %%xmm10, %%xmm2\n"
7936           "movaps %%xmm9, %%xmm1\n"
7937           "movaps %%xmm9, %%xmm3\n"
7938           "addps %%xmm11, %%xmm1\n"
7939           "subps %%xmm11, %%xmm3\n"
7940           "movaps %%xmm12, %%xmm4\n"
7941           "movaps %%xmm12, %%xmm6\n"
7942           "addps %%xmm14, %%xmm4\n"
7943           "subps %%xmm14, %%xmm6\n"
7944           "movaps %%xmm13, %%xmm5\n"
7945           "movaps %%xmm13, %%xmm7\n"
7946           "addps %%xmm15, %%xmm5\n"
7947           "subps %%xmm15, %%xmm7\n"
7948           "movaps %%xmm0, %%xmm8\n"
7949           "movaps %%xmm0, %%xmm12\n"
7950           "addps %%xmm4, %%xmm8\n"
7951           "subps %%xmm4, %%xmm12\n"
7952           "movaps %%xmm1, %%xmm9\n"
7953           "movaps %%xmm1, %%xmm13\n"
7954           "addps %%xmm5, %%xmm9\n"
7955           "subps %%xmm5, %%xmm13\n"
7956           "movaps %%xmm2, %%xmm10\n"
7957           "movaps %%xmm2, %%xmm14\n"
7958           "addps %%xmm6, %%xmm10\n"
7959           "subps %%xmm6, %%xmm14\n"
7960           "movaps %%xmm3, %%xmm11\n"
7961           "movaps %%xmm3, %%xmm15\n"
7962           "addps %%xmm7, %%xmm11\n"
7963           "subps %%xmm7, %%xmm15\n"
7964           "movups %%xmm8, (%0)\n"
7965           "movups %%xmm9, (%1)\n"
7966           "movups %%xmm10, (%2)\n"
7967           "movups %%xmm11, (%3)\n"
7968           "movups %%xmm12, (%4)\n"
7969           "movups %%xmm13, (%5)\n"
7970           "movups %%xmm14, (%6)\n"
7971           "movups %%xmm15, (%7)\n"
7972           :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
7973         );
7974       }
7975     }
7976     return;
7977   }
7978   if (depth == 12) {
7979     helper_float_23_recursive(buf + 0, 9);
7980     helper_float_23_recursive(buf + 512, 9);
7981     helper_float_23_recursive(buf + 1024, 9);
7982     helper_float_23_recursive(buf + 1536, 9);
7983     helper_float_23_recursive(buf + 2048, 9);
7984     helper_float_23_recursive(buf + 2560, 9);
7985     helper_float_23_recursive(buf + 3072, 9);
7986     helper_float_23_recursive(buf + 3584, 9);
7987     for (int j = 0; j < 4096; j += 4096) {
7988       for (int k = 0; k < 512; k += 4) {
7989         __asm__ volatile (
7990           "movups (%0), %%xmm0\n"
7991           "movups (%1), %%xmm1\n"
7992           "movups (%2), %%xmm2\n"
7993           "movups (%3), %%xmm3\n"
7994           "movups (%4), %%xmm4\n"
7995           "movups (%5), %%xmm5\n"
7996           "movups (%6), %%xmm6\n"
7997           "movups (%7), %%xmm7\n"
7998           "movaps %%xmm0, %%xmm8\n"
7999           "movaps %%xmm0, %%xmm9\n"
8000           "addps %%xmm1, %%xmm8\n"
8001           "subps %%xmm1, %%xmm9\n"
8002           "movaps %%xmm2, %%xmm10\n"
8003           "movaps %%xmm2, %%xmm11\n"
8004           "addps %%xmm3, %%xmm10\n"
8005           "subps %%xmm3, %%xmm11\n"
8006           "movaps %%xmm4, %%xmm12\n"
8007           "movaps %%xmm4, %%xmm13\n"
8008           "addps %%xmm5, %%xmm12\n"
8009           "subps %%xmm5, %%xmm13\n"
8010           "movaps %%xmm6, %%xmm14\n"
8011           "movaps %%xmm6, %%xmm15\n"
8012           "addps %%xmm7, %%xmm14\n"
8013           "subps %%xmm7, %%xmm15\n"
8014           "movaps %%xmm8, %%xmm0\n"
8015           "movaps %%xmm8, %%xmm2\n"
8016           "addps %%xmm10, %%xmm0\n"
8017           "subps %%xmm10, %%xmm2\n"
8018           "movaps %%xmm9, %%xmm1\n"
8019           "movaps %%xmm9, %%xmm3\n"
8020           "addps %%xmm11, %%xmm1\n"
8021           "subps %%xmm11, %%xmm3\n"
8022           "movaps %%xmm12, %%xmm4\n"
8023           "movaps %%xmm12, %%xmm6\n"
8024           "addps %%xmm14, %%xmm4\n"
8025           "subps %%xmm14, %%xmm6\n"
8026           "movaps %%xmm13, %%xmm5\n"
8027           "movaps %%xmm13, %%xmm7\n"
8028           "addps %%xmm15, %%xmm5\n"
8029           "subps %%xmm15, %%xmm7\n"
8030           "movaps %%xmm0, %%xmm8\n"
8031           "movaps %%xmm0, %%xmm12\n"
8032           "addps %%xmm4, %%xmm8\n"
8033           "subps %%xmm4, %%xmm12\n"
8034           "movaps %%xmm1, %%xmm9\n"
8035           "movaps %%xmm1, %%xmm13\n"
8036           "addps %%xmm5, %%xmm9\n"
8037           "subps %%xmm5, %%xmm13\n"
8038           "movaps %%xmm2, %%xmm10\n"
8039           "movaps %%xmm2, %%xmm14\n"
8040           "addps %%xmm6, %%xmm10\n"
8041           "subps %%xmm6, %%xmm14\n"
8042           "movaps %%xmm3, %%xmm11\n"
8043           "movaps %%xmm3, %%xmm15\n"
8044           "addps %%xmm7, %%xmm11\n"
8045           "subps %%xmm7, %%xmm15\n"
8046           "movups %%xmm8, (%0)\n"
8047           "movups %%xmm9, (%1)\n"
8048           "movups %%xmm10, (%2)\n"
8049           "movups %%xmm11, (%3)\n"
8050           "movups %%xmm12, (%4)\n"
8051           "movups %%xmm13, (%5)\n"
8052           "movups %%xmm14, (%6)\n"
8053           "movups %%xmm15, (%7)\n"
8054           :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
8055         );
8056       }
8057     }
8058     return;
8059   }
8060   if (depth == 15) {
8061     helper_float_23_recursive(buf + 0, 12);
8062     helper_float_23_recursive(buf + 4096, 12);
8063     helper_float_23_recursive(buf + 8192, 12);
8064     helper_float_23_recursive(buf + 12288, 12);
8065     helper_float_23_recursive(buf + 16384, 12);
8066     helper_float_23_recursive(buf + 20480, 12);
8067     helper_float_23_recursive(buf + 24576, 12);
8068     helper_float_23_recursive(buf + 28672, 12);
8069     for (int j = 0; j < 32768; j += 32768) {
8070       for (int k = 0; k < 4096; k += 4) {
8071         __asm__ volatile (
8072           "movups (%0), %%xmm0\n"
8073           "movups (%1), %%xmm1\n"
8074           "movups (%2), %%xmm2\n"
8075           "movups (%3), %%xmm3\n"
8076           "movups (%4), %%xmm4\n"
8077           "movups (%5), %%xmm5\n"
8078           "movups (%6), %%xmm6\n"
8079           "movups (%7), %%xmm7\n"
8080           "movaps %%xmm0, %%xmm8\n"
8081           "movaps %%xmm0, %%xmm9\n"
8082           "addps %%xmm1, %%xmm8\n"
8083           "subps %%xmm1, %%xmm9\n"
8084           "movaps %%xmm2, %%xmm10\n"
8085           "movaps %%xmm2, %%xmm11\n"
8086           "addps %%xmm3, %%xmm10\n"
8087           "subps %%xmm3, %%xmm11\n"
8088           "movaps %%xmm4, %%xmm12\n"
8089           "movaps %%xmm4, %%xmm13\n"
8090           "addps %%xmm5, %%xmm12\n"
8091           "subps %%xmm5, %%xmm13\n"
8092           "movaps %%xmm6, %%xmm14\n"
8093           "movaps %%xmm6, %%xmm15\n"
8094           "addps %%xmm7, %%xmm14\n"
8095           "subps %%xmm7, %%xmm15\n"
8096           "movaps %%xmm8, %%xmm0\n"
8097           "movaps %%xmm8, %%xmm2\n"
8098           "addps %%xmm10, %%xmm0\n"
8099           "subps %%xmm10, %%xmm2\n"
8100           "movaps %%xmm9, %%xmm1\n"
8101           "movaps %%xmm9, %%xmm3\n"
8102           "addps %%xmm11, %%xmm1\n"
8103           "subps %%xmm11, %%xmm3\n"
8104           "movaps %%xmm12, %%xmm4\n"
8105           "movaps %%xmm12, %%xmm6\n"
8106           "addps %%xmm14, %%xmm4\n"
8107           "subps %%xmm14, %%xmm6\n"
8108           "movaps %%xmm13, %%xmm5\n"
8109           "movaps %%xmm13, %%xmm7\n"
8110           "addps %%xmm15, %%xmm5\n"
8111           "subps %%xmm15, %%xmm7\n"
8112           "movaps %%xmm0, %%xmm8\n"
8113           "movaps %%xmm0, %%xmm12\n"
8114           "addps %%xmm4, %%xmm8\n"
8115           "subps %%xmm4, %%xmm12\n"
8116           "movaps %%xmm1, %%xmm9\n"
8117           "movaps %%xmm1, %%xmm13\n"
8118           "addps %%xmm5, %%xmm9\n"
8119           "subps %%xmm5, %%xmm13\n"
8120           "movaps %%xmm2, %%xmm10\n"
8121           "movaps %%xmm2, %%xmm14\n"
8122           "addps %%xmm6, %%xmm10\n"
8123           "subps %%xmm6, %%xmm14\n"
8124           "movaps %%xmm3, %%xmm11\n"
8125           "movaps %%xmm3, %%xmm15\n"
8126           "addps %%xmm7, %%xmm11\n"
8127           "subps %%xmm7, %%xmm15\n"
8128           "movups %%xmm8, (%0)\n"
8129           "movups %%xmm9, (%1)\n"
8130           "movups %%xmm10, (%2)\n"
8131           "movups %%xmm11, (%3)\n"
8132           "movups %%xmm12, (%4)\n"
8133           "movups %%xmm13, (%5)\n"
8134           "movups %%xmm14, (%6)\n"
8135           "movups %%xmm15, (%7)\n"
8136           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
8137         );
8138       }
8139     }
8140     return;
8141   }
8142   if (depth == 18) {
8143     helper_float_23_recursive(buf + 0, 15);
8144     helper_float_23_recursive(buf + 32768, 15);
8145     helper_float_23_recursive(buf + 65536, 15);
8146     helper_float_23_recursive(buf + 98304, 15);
8147     helper_float_23_recursive(buf + 131072, 15);
8148     helper_float_23_recursive(buf + 163840, 15);
8149     helper_float_23_recursive(buf + 196608, 15);
8150     helper_float_23_recursive(buf + 229376, 15);
8151     for (int j = 0; j < 262144; j += 262144) {
8152       for (int k = 0; k < 32768; k += 4) {
8153         __asm__ volatile (
8154           "movups (%0), %%xmm0\n"
8155           "movups (%1), %%xmm1\n"
8156           "movups (%2), %%xmm2\n"
8157           "movups (%3), %%xmm3\n"
8158           "movups (%4), %%xmm4\n"
8159           "movups (%5), %%xmm5\n"
8160           "movups (%6), %%xmm6\n"
8161           "movups (%7), %%xmm7\n"
8162           "movaps %%xmm0, %%xmm8\n"
8163           "movaps %%xmm0, %%xmm9\n"
8164           "addps %%xmm1, %%xmm8\n"
8165           "subps %%xmm1, %%xmm9\n"
8166           "movaps %%xmm2, %%xmm10\n"
8167           "movaps %%xmm2, %%xmm11\n"
8168           "addps %%xmm3, %%xmm10\n"
8169           "subps %%xmm3, %%xmm11\n"
8170           "movaps %%xmm4, %%xmm12\n"
8171           "movaps %%xmm4, %%xmm13\n"
8172           "addps %%xmm5, %%xmm12\n"
8173           "subps %%xmm5, %%xmm13\n"
8174           "movaps %%xmm6, %%xmm14\n"
8175           "movaps %%xmm6, %%xmm15\n"
8176           "addps %%xmm7, %%xmm14\n"
8177           "subps %%xmm7, %%xmm15\n"
8178           "movaps %%xmm8, %%xmm0\n"
8179           "movaps %%xmm8, %%xmm2\n"
8180           "addps %%xmm10, %%xmm0\n"
8181           "subps %%xmm10, %%xmm2\n"
8182           "movaps %%xmm9, %%xmm1\n"
8183           "movaps %%xmm9, %%xmm3\n"
8184           "addps %%xmm11, %%xmm1\n"
8185           "subps %%xmm11, %%xmm3\n"
8186           "movaps %%xmm12, %%xmm4\n"
8187           "movaps %%xmm12, %%xmm6\n"
8188           "addps %%xmm14, %%xmm4\n"
8189           "subps %%xmm14, %%xmm6\n"
8190           "movaps %%xmm13, %%xmm5\n"
8191           "movaps %%xmm13, %%xmm7\n"
8192           "addps %%xmm15, %%xmm5\n"
8193           "subps %%xmm15, %%xmm7\n"
8194           "movaps %%xmm0, %%xmm8\n"
8195           "movaps %%xmm0, %%xmm12\n"
8196           "addps %%xmm4, %%xmm8\n"
8197           "subps %%xmm4, %%xmm12\n"
8198           "movaps %%xmm1, %%xmm9\n"
8199           "movaps %%xmm1, %%xmm13\n"
8200           "addps %%xmm5, %%xmm9\n"
8201           "subps %%xmm5, %%xmm13\n"
8202           "movaps %%xmm2, %%xmm10\n"
8203           "movaps %%xmm2, %%xmm14\n"
8204           "addps %%xmm6, %%xmm10\n"
8205           "subps %%xmm6, %%xmm14\n"
8206           "movaps %%xmm3, %%xmm11\n"
8207           "movaps %%xmm3, %%xmm15\n"
8208           "addps %%xmm7, %%xmm11\n"
8209           "subps %%xmm7, %%xmm15\n"
8210           "movups %%xmm8, (%0)\n"
8211           "movups %%xmm9, (%1)\n"
8212           "movups %%xmm10, (%2)\n"
8213           "movups %%xmm11, (%3)\n"
8214           "movups %%xmm12, (%4)\n"
8215           "movups %%xmm13, (%5)\n"
8216           "movups %%xmm14, (%6)\n"
8217           "movups %%xmm15, (%7)\n"
8218           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
8219         );
8220       }
8221     }
8222     return;
8223   }
8224   if (depth == 21) {
8225     helper_float_23_recursive(buf + 0, 18);
8226     helper_float_23_recursive(buf + 262144, 18);
8227     helper_float_23_recursive(buf + 524288, 18);
8228     helper_float_23_recursive(buf + 786432, 18);
8229     helper_float_23_recursive(buf + 1048576, 18);
8230     helper_float_23_recursive(buf + 1310720, 18);
8231     helper_float_23_recursive(buf + 1572864, 18);
8232     helper_float_23_recursive(buf + 1835008, 18);
8233     for (int j = 0; j < 2097152; j += 2097152) {
8234       for (int k = 0; k < 262144; k += 4) {
8235         __asm__ volatile (
8236           "movups (%0), %%xmm0\n"
8237           "movups (%1), %%xmm1\n"
8238           "movups (%2), %%xmm2\n"
8239           "movups (%3), %%xmm3\n"
8240           "movups (%4), %%xmm4\n"
8241           "movups (%5), %%xmm5\n"
8242           "movups (%6), %%xmm6\n"
8243           "movups (%7), %%xmm7\n"
8244           "movaps %%xmm0, %%xmm8\n"
8245           "movaps %%xmm0, %%xmm9\n"
8246           "addps %%xmm1, %%xmm8\n"
8247           "subps %%xmm1, %%xmm9\n"
8248           "movaps %%xmm2, %%xmm10\n"
8249           "movaps %%xmm2, %%xmm11\n"
8250           "addps %%xmm3, %%xmm10\n"
8251           "subps %%xmm3, %%xmm11\n"
8252           "movaps %%xmm4, %%xmm12\n"
8253           "movaps %%xmm4, %%xmm13\n"
8254           "addps %%xmm5, %%xmm12\n"
8255           "subps %%xmm5, %%xmm13\n"
8256           "movaps %%xmm6, %%xmm14\n"
8257           "movaps %%xmm6, %%xmm15\n"
8258           "addps %%xmm7, %%xmm14\n"
8259           "subps %%xmm7, %%xmm15\n"
8260           "movaps %%xmm8, %%xmm0\n"
8261           "movaps %%xmm8, %%xmm2\n"
8262           "addps %%xmm10, %%xmm0\n"
8263           "subps %%xmm10, %%xmm2\n"
8264           "movaps %%xmm9, %%xmm1\n"
8265           "movaps %%xmm9, %%xmm3\n"
8266           "addps %%xmm11, %%xmm1\n"
8267           "subps %%xmm11, %%xmm3\n"
8268           "movaps %%xmm12, %%xmm4\n"
8269           "movaps %%xmm12, %%xmm6\n"
8270           "addps %%xmm14, %%xmm4\n"
8271           "subps %%xmm14, %%xmm6\n"
8272           "movaps %%xmm13, %%xmm5\n"
8273           "movaps %%xmm13, %%xmm7\n"
8274           "addps %%xmm15, %%xmm5\n"
8275           "subps %%xmm15, %%xmm7\n"
8276           "movaps %%xmm0, %%xmm8\n"
8277           "movaps %%xmm0, %%xmm12\n"
8278           "addps %%xmm4, %%xmm8\n"
8279           "subps %%xmm4, %%xmm12\n"
8280           "movaps %%xmm1, %%xmm9\n"
8281           "movaps %%xmm1, %%xmm13\n"
8282           "addps %%xmm5, %%xmm9\n"
8283           "subps %%xmm5, %%xmm13\n"
8284           "movaps %%xmm2, %%xmm10\n"
8285           "movaps %%xmm2, %%xmm14\n"
8286           "addps %%xmm6, %%xmm10\n"
8287           "subps %%xmm6, %%xmm14\n"
8288           "movaps %%xmm3, %%xmm11\n"
8289           "movaps %%xmm3, %%xmm15\n"
8290           "addps %%xmm7, %%xmm11\n"
8291           "subps %%xmm7, %%xmm15\n"
8292           "movups %%xmm8, (%0)\n"
8293           "movups %%xmm9, (%1)\n"
8294           "movups %%xmm10, (%2)\n"
8295           "movups %%xmm11, (%3)\n"
8296           "movups %%xmm12, (%4)\n"
8297           "movups %%xmm13, (%5)\n"
8298           "movups %%xmm14, (%6)\n"
8299           "movups %%xmm15, (%7)\n"
8300           :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
8301         );
8302       }
8303     }
8304     return;
8305   }
8306   if (depth == 23) {
8307     helper_float_23_recursive(buf + 0, 21);
8308     helper_float_23_recursive(buf + 2097152, 21);
8309     helper_float_23_recursive(buf + 4194304, 21);
8310     helper_float_23_recursive(buf + 6291456, 21);
8311     for (int j = 0; j < 8388608; j += 8388608) {
8312       for (int k = 0; k < 2097152; k += 4) {
8313         __asm__ volatile (
8314           "movups (%0), %%xmm0\n"
8315           "movups (%1), %%xmm1\n"
8316           "movups (%2), %%xmm2\n"
8317           "movups (%3), %%xmm3\n"
8318           "movaps %%xmm0, %%xmm8\n"
8319           "movaps %%xmm0, %%xmm9\n"
8320           "addps %%xmm1, %%xmm8\n"
8321           "subps %%xmm1, %%xmm9\n"
8322           "movaps %%xmm2, %%xmm10\n"
8323           "movaps %%xmm2, %%xmm11\n"
8324           "addps %%xmm3, %%xmm10\n"
8325           "subps %%xmm3, %%xmm11\n"
8326           "movaps %%xmm8, %%xmm0\n"
8327           "movaps %%xmm8, %%xmm2\n"
8328           "addps %%xmm10, %%xmm0\n"
8329           "subps %%xmm10, %%xmm2\n"
8330           "movaps %%xmm9, %%xmm1\n"
8331           "movaps %%xmm9, %%xmm3\n"
8332           "addps %%xmm11, %%xmm1\n"
8333           "subps %%xmm11, %%xmm3\n"
8334           "movups %%xmm0, (%0)\n"
8335           "movups %%xmm1, (%1)\n"
8336           "movups %%xmm2, (%2)\n"
8337           "movups %%xmm3, (%3)\n"
8338           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
8339         );
8340       }
8341     }
8342     return;
8343   }
8344 }
void helper_float_23(float *buf);
/* Entry point for the unnormalized in-place Fast Hadamard Transform of
 * 2^23 floats. `buf` must point to 8388608 contiguous floats; the transform
 * is performed in place by the depth-driven recursive SSE kernel.
 * NOTE(review): alignment of `buf` appears unconstrained — the kernels use
 * unaligned movups loads/stores throughout — confirm against fht.h contract. */
void helper_float_23(float *buf) {
  helper_float_23_recursive(buf, 23);
}
8349 void helper_float_24_recursive(float *buf, int depth);
helper_float_24_recursive(float * buf,int depth)8350 void helper_float_24_recursive(float *buf, int depth) {
8351   if (depth == 15) {
8352     for (int j = 0; j < 32768; j += 32) {
8353       for (int k = 0; k < 4; k += 4) {
8354         __asm__ volatile (
8355           "movups (%0), %%xmm0\n"
8356           "movups (%1), %%xmm1\n"
8357           "movups (%2), %%xmm2\n"
8358           "movups (%3), %%xmm3\n"
8359           "movups (%4), %%xmm4\n"
8360           "movups (%5), %%xmm5\n"
8361           "movups (%6), %%xmm6\n"
8362           "movups (%7), %%xmm7\n"
8363           "movaps %%xmm0, %%xmm8\n"
8364           "shufps $160, %%xmm8, %%xmm8\n"
8365           "shufps $245, %%xmm0, %%xmm0\n"
8366           "xorps %%xmm9, %%xmm9\n"
8367           "subps %%xmm0, %%xmm9\n"
8368           "addsubps %%xmm9, %%xmm8\n"
8369           "movaps %%xmm8, %%xmm0\n"
8370           "movaps %%xmm1, %%xmm8\n"
8371           "shufps $160, %%xmm8, %%xmm8\n"
8372           "shufps $245, %%xmm1, %%xmm1\n"
8373           "xorps %%xmm9, %%xmm9\n"
8374           "subps %%xmm1, %%xmm9\n"
8375           "addsubps %%xmm9, %%xmm8\n"
8376           "movaps %%xmm8, %%xmm1\n"
8377           "movaps %%xmm2, %%xmm8\n"
8378           "shufps $160, %%xmm8, %%xmm8\n"
8379           "shufps $245, %%xmm2, %%xmm2\n"
8380           "xorps %%xmm9, %%xmm9\n"
8381           "subps %%xmm2, %%xmm9\n"
8382           "addsubps %%xmm9, %%xmm8\n"
8383           "movaps %%xmm8, %%xmm2\n"
8384           "movaps %%xmm3, %%xmm8\n"
8385           "shufps $160, %%xmm8, %%xmm8\n"
8386           "shufps $245, %%xmm3, %%xmm3\n"
8387           "xorps %%xmm9, %%xmm9\n"
8388           "subps %%xmm3, %%xmm9\n"
8389           "addsubps %%xmm9, %%xmm8\n"
8390           "movaps %%xmm8, %%xmm3\n"
8391           "movaps %%xmm4, %%xmm8\n"
8392           "shufps $160, %%xmm8, %%xmm8\n"
8393           "shufps $245, %%xmm4, %%xmm4\n"
8394           "xorps %%xmm9, %%xmm9\n"
8395           "subps %%xmm4, %%xmm9\n"
8396           "addsubps %%xmm9, %%xmm8\n"
8397           "movaps %%xmm8, %%xmm4\n"
8398           "movaps %%xmm5, %%xmm8\n"
8399           "shufps $160, %%xmm8, %%xmm8\n"
8400           "shufps $245, %%xmm5, %%xmm5\n"
8401           "xorps %%xmm9, %%xmm9\n"
8402           "subps %%xmm5, %%xmm9\n"
8403           "addsubps %%xmm9, %%xmm8\n"
8404           "movaps %%xmm8, %%xmm5\n"
8405           "movaps %%xmm6, %%xmm8\n"
8406           "shufps $160, %%xmm8, %%xmm8\n"
8407           "shufps $245, %%xmm6, %%xmm6\n"
8408           "xorps %%xmm9, %%xmm9\n"
8409           "subps %%xmm6, %%xmm9\n"
8410           "addsubps %%xmm9, %%xmm8\n"
8411           "movaps %%xmm8, %%xmm6\n"
8412           "movaps %%xmm7, %%xmm8\n"
8413           "shufps $160, %%xmm8, %%xmm8\n"
8414           "shufps $245, %%xmm7, %%xmm7\n"
8415           "xorps %%xmm9, %%xmm9\n"
8416           "subps %%xmm7, %%xmm9\n"
8417           "addsubps %%xmm9, %%xmm8\n"
8418           "movaps %%xmm8, %%xmm7\n"
8419           "movaps %%xmm0, %%xmm8\n"
8420           "shufps $68, %%xmm8, %%xmm8\n"
8421           "xorps %%xmm9, %%xmm9\n"
8422           "movaps %%xmm0, %%xmm10\n"
8423           "shufps $14, %%xmm9, %%xmm10\n"
8424           "movaps %%xmm0, %%xmm11\n"
8425           "shufps $224, %%xmm11, %%xmm9\n"
8426           "addps %%xmm8, %%xmm10\n"
8427           "subps %%xmm9, %%xmm10\n"
8428           "movaps %%xmm10, %%xmm0\n"
8429           "movaps %%xmm1, %%xmm8\n"
8430           "shufps $68, %%xmm8, %%xmm8\n"
8431           "xorps %%xmm9, %%xmm9\n"
8432           "movaps %%xmm1, %%xmm10\n"
8433           "shufps $14, %%xmm9, %%xmm10\n"
8434           "movaps %%xmm1, %%xmm11\n"
8435           "shufps $224, %%xmm11, %%xmm9\n"
8436           "addps %%xmm8, %%xmm10\n"
8437           "subps %%xmm9, %%xmm10\n"
8438           "movaps %%xmm10, %%xmm1\n"
8439           "movaps %%xmm2, %%xmm8\n"
8440           "shufps $68, %%xmm8, %%xmm8\n"
8441           "xorps %%xmm9, %%xmm9\n"
8442           "movaps %%xmm2, %%xmm10\n"
8443           "shufps $14, %%xmm9, %%xmm10\n"
8444           "movaps %%xmm2, %%xmm11\n"
8445           "shufps $224, %%xmm11, %%xmm9\n"
8446           "addps %%xmm8, %%xmm10\n"
8447           "subps %%xmm9, %%xmm10\n"
8448           "movaps %%xmm10, %%xmm2\n"
8449           "movaps %%xmm3, %%xmm8\n"
8450           "shufps $68, %%xmm8, %%xmm8\n"
8451           "xorps %%xmm9, %%xmm9\n"
8452           "movaps %%xmm3, %%xmm10\n"
8453           "shufps $14, %%xmm9, %%xmm10\n"
8454           "movaps %%xmm3, %%xmm11\n"
8455           "shufps $224, %%xmm11, %%xmm9\n"
8456           "addps %%xmm8, %%xmm10\n"
8457           "subps %%xmm9, %%xmm10\n"
8458           "movaps %%xmm10, %%xmm3\n"
8459           "movaps %%xmm4, %%xmm8\n"
8460           "shufps $68, %%xmm8, %%xmm8\n"
8461           "xorps %%xmm9, %%xmm9\n"
8462           "movaps %%xmm4, %%xmm10\n"
8463           "shufps $14, %%xmm9, %%xmm10\n"
8464           "movaps %%xmm4, %%xmm11\n"
8465           "shufps $224, %%xmm11, %%xmm9\n"
8466           "addps %%xmm8, %%xmm10\n"
8467           "subps %%xmm9, %%xmm10\n"
8468           "movaps %%xmm10, %%xmm4\n"
8469           "movaps %%xmm5, %%xmm8\n"
8470           "shufps $68, %%xmm8, %%xmm8\n"
8471           "xorps %%xmm9, %%xmm9\n"
8472           "movaps %%xmm5, %%xmm10\n"
8473           "shufps $14, %%xmm9, %%xmm10\n"
8474           "movaps %%xmm5, %%xmm11\n"
8475           "shufps $224, %%xmm11, %%xmm9\n"
8476           "addps %%xmm8, %%xmm10\n"
8477           "subps %%xmm9, %%xmm10\n"
8478           "movaps %%xmm10, %%xmm5\n"
8479           "movaps %%xmm6, %%xmm8\n"
8480           "shufps $68, %%xmm8, %%xmm8\n"
8481           "xorps %%xmm9, %%xmm9\n"
8482           "movaps %%xmm6, %%xmm10\n"
8483           "shufps $14, %%xmm9, %%xmm10\n"
8484           "movaps %%xmm6, %%xmm11\n"
8485           "shufps $224, %%xmm11, %%xmm9\n"
8486           "addps %%xmm8, %%xmm10\n"
8487           "subps %%xmm9, %%xmm10\n"
8488           "movaps %%xmm10, %%xmm6\n"
8489           "movaps %%xmm7, %%xmm8\n"
8490           "shufps $68, %%xmm8, %%xmm8\n"
8491           "xorps %%xmm9, %%xmm9\n"
8492           "movaps %%xmm7, %%xmm10\n"
8493           "shufps $14, %%xmm9, %%xmm10\n"
8494           "movaps %%xmm7, %%xmm11\n"
8495           "shufps $224, %%xmm11, %%xmm9\n"
8496           "addps %%xmm8, %%xmm10\n"
8497           "subps %%xmm9, %%xmm10\n"
8498           "movaps %%xmm10, %%xmm7\n"
8499           "movaps %%xmm0, %%xmm8\n"
8500           "movaps %%xmm0, %%xmm9\n"
8501           "addps %%xmm1, %%xmm8\n"
8502           "subps %%xmm1, %%xmm9\n"
8503           "movaps %%xmm2, %%xmm10\n"
8504           "movaps %%xmm2, %%xmm11\n"
8505           "addps %%xmm3, %%xmm10\n"
8506           "subps %%xmm3, %%xmm11\n"
8507           "movaps %%xmm4, %%xmm12\n"
8508           "movaps %%xmm4, %%xmm13\n"
8509           "addps %%xmm5, %%xmm12\n"
8510           "subps %%xmm5, %%xmm13\n"
8511           "movaps %%xmm6, %%xmm14\n"
8512           "movaps %%xmm6, %%xmm15\n"
8513           "addps %%xmm7, %%xmm14\n"
8514           "subps %%xmm7, %%xmm15\n"
8515           "movaps %%xmm8, %%xmm0\n"
8516           "movaps %%xmm8, %%xmm2\n"
8517           "addps %%xmm10, %%xmm0\n"
8518           "subps %%xmm10, %%xmm2\n"
8519           "movaps %%xmm9, %%xmm1\n"
8520           "movaps %%xmm9, %%xmm3\n"
8521           "addps %%xmm11, %%xmm1\n"
8522           "subps %%xmm11, %%xmm3\n"
8523           "movaps %%xmm12, %%xmm4\n"
8524           "movaps %%xmm12, %%xmm6\n"
8525           "addps %%xmm14, %%xmm4\n"
8526           "subps %%xmm14, %%xmm6\n"
8527           "movaps %%xmm13, %%xmm5\n"
8528           "movaps %%xmm13, %%xmm7\n"
8529           "addps %%xmm15, %%xmm5\n"
8530           "subps %%xmm15, %%xmm7\n"
8531           "movaps %%xmm0, %%xmm8\n"
8532           "movaps %%xmm0, %%xmm12\n"
8533           "addps %%xmm4, %%xmm8\n"
8534           "subps %%xmm4, %%xmm12\n"
8535           "movaps %%xmm1, %%xmm9\n"
8536           "movaps %%xmm1, %%xmm13\n"
8537           "addps %%xmm5, %%xmm9\n"
8538           "subps %%xmm5, %%xmm13\n"
8539           "movaps %%xmm2, %%xmm10\n"
8540           "movaps %%xmm2, %%xmm14\n"
8541           "addps %%xmm6, %%xmm10\n"
8542           "subps %%xmm6, %%xmm14\n"
8543           "movaps %%xmm3, %%xmm11\n"
8544           "movaps %%xmm3, %%xmm15\n"
8545           "addps %%xmm7, %%xmm11\n"
8546           "subps %%xmm7, %%xmm15\n"
8547           "movups %%xmm8, (%0)\n"
8548           "movups %%xmm9, (%1)\n"
8549           "movups %%xmm10, (%2)\n"
8550           "movups %%xmm11, (%3)\n"
8551           "movups %%xmm12, (%4)\n"
8552           "movups %%xmm13, (%5)\n"
8553           "movups %%xmm14, (%6)\n"
8554           "movups %%xmm15, (%7)\n"
8555           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
8556         );
8557       }
8558     }
8559     for (int j = 0; j < 32768; j += 256) {
8560       for (int k = 0; k < 32; k += 4) {
8561         __asm__ volatile (
8562           "movups (%0), %%xmm0\n"
8563           "movups (%1), %%xmm1\n"
8564           "movups (%2), %%xmm2\n"
8565           "movups (%3), %%xmm3\n"
8566           "movups (%4), %%xmm4\n"
8567           "movups (%5), %%xmm5\n"
8568           "movups (%6), %%xmm6\n"
8569           "movups (%7), %%xmm7\n"
8570           "movaps %%xmm0, %%xmm8\n"
8571           "movaps %%xmm0, %%xmm9\n"
8572           "addps %%xmm1, %%xmm8\n"
8573           "subps %%xmm1, %%xmm9\n"
8574           "movaps %%xmm2, %%xmm10\n"
8575           "movaps %%xmm2, %%xmm11\n"
8576           "addps %%xmm3, %%xmm10\n"
8577           "subps %%xmm3, %%xmm11\n"
8578           "movaps %%xmm4, %%xmm12\n"
8579           "movaps %%xmm4, %%xmm13\n"
8580           "addps %%xmm5, %%xmm12\n"
8581           "subps %%xmm5, %%xmm13\n"
8582           "movaps %%xmm6, %%xmm14\n"
8583           "movaps %%xmm6, %%xmm15\n"
8584           "addps %%xmm7, %%xmm14\n"
8585           "subps %%xmm7, %%xmm15\n"
8586           "movaps %%xmm8, %%xmm0\n"
8587           "movaps %%xmm8, %%xmm2\n"
8588           "addps %%xmm10, %%xmm0\n"
8589           "subps %%xmm10, %%xmm2\n"
8590           "movaps %%xmm9, %%xmm1\n"
8591           "movaps %%xmm9, %%xmm3\n"
8592           "addps %%xmm11, %%xmm1\n"
8593           "subps %%xmm11, %%xmm3\n"
8594           "movaps %%xmm12, %%xmm4\n"
8595           "movaps %%xmm12, %%xmm6\n"
8596           "addps %%xmm14, %%xmm4\n"
8597           "subps %%xmm14, %%xmm6\n"
8598           "movaps %%xmm13, %%xmm5\n"
8599           "movaps %%xmm13, %%xmm7\n"
8600           "addps %%xmm15, %%xmm5\n"
8601           "subps %%xmm15, %%xmm7\n"
8602           "movaps %%xmm0, %%xmm8\n"
8603           "movaps %%xmm0, %%xmm12\n"
8604           "addps %%xmm4, %%xmm8\n"
8605           "subps %%xmm4, %%xmm12\n"
8606           "movaps %%xmm1, %%xmm9\n"
8607           "movaps %%xmm1, %%xmm13\n"
8608           "addps %%xmm5, %%xmm9\n"
8609           "subps %%xmm5, %%xmm13\n"
8610           "movaps %%xmm2, %%xmm10\n"
8611           "movaps %%xmm2, %%xmm14\n"
8612           "addps %%xmm6, %%xmm10\n"
8613           "subps %%xmm6, %%xmm14\n"
8614           "movaps %%xmm3, %%xmm11\n"
8615           "movaps %%xmm3, %%xmm15\n"
8616           "addps %%xmm7, %%xmm11\n"
8617           "subps %%xmm7, %%xmm15\n"
8618           "movups %%xmm8, (%0)\n"
8619           "movups %%xmm9, (%1)\n"
8620           "movups %%xmm10, (%2)\n"
8621           "movups %%xmm11, (%3)\n"
8622           "movups %%xmm12, (%4)\n"
8623           "movups %%xmm13, (%5)\n"
8624           "movups %%xmm14, (%6)\n"
8625           "movups %%xmm15, (%7)\n"
8626           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
8627         );
8628       }
8629     }
8630     for (int j = 0; j < 32768; j += 2048) {
8631       for (int k = 0; k < 256; k += 4) {
8632         __asm__ volatile (
8633           "movups (%0), %%xmm0\n"
8634           "movups (%1), %%xmm1\n"
8635           "movups (%2), %%xmm2\n"
8636           "movups (%3), %%xmm3\n"
8637           "movups (%4), %%xmm4\n"
8638           "movups (%5), %%xmm5\n"
8639           "movups (%6), %%xmm6\n"
8640           "movups (%7), %%xmm7\n"
8641           "movaps %%xmm0, %%xmm8\n"
8642           "movaps %%xmm0, %%xmm9\n"
8643           "addps %%xmm1, %%xmm8\n"
8644           "subps %%xmm1, %%xmm9\n"
8645           "movaps %%xmm2, %%xmm10\n"
8646           "movaps %%xmm2, %%xmm11\n"
8647           "addps %%xmm3, %%xmm10\n"
8648           "subps %%xmm3, %%xmm11\n"
8649           "movaps %%xmm4, %%xmm12\n"
8650           "movaps %%xmm4, %%xmm13\n"
8651           "addps %%xmm5, %%xmm12\n"
8652           "subps %%xmm5, %%xmm13\n"
8653           "movaps %%xmm6, %%xmm14\n"
8654           "movaps %%xmm6, %%xmm15\n"
8655           "addps %%xmm7, %%xmm14\n"
8656           "subps %%xmm7, %%xmm15\n"
8657           "movaps %%xmm8, %%xmm0\n"
8658           "movaps %%xmm8, %%xmm2\n"
8659           "addps %%xmm10, %%xmm0\n"
8660           "subps %%xmm10, %%xmm2\n"
8661           "movaps %%xmm9, %%xmm1\n"
8662           "movaps %%xmm9, %%xmm3\n"
8663           "addps %%xmm11, %%xmm1\n"
8664           "subps %%xmm11, %%xmm3\n"
8665           "movaps %%xmm12, %%xmm4\n"
8666           "movaps %%xmm12, %%xmm6\n"
8667           "addps %%xmm14, %%xmm4\n"
8668           "subps %%xmm14, %%xmm6\n"
8669           "movaps %%xmm13, %%xmm5\n"
8670           "movaps %%xmm13, %%xmm7\n"
8671           "addps %%xmm15, %%xmm5\n"
8672           "subps %%xmm15, %%xmm7\n"
8673           "movaps %%xmm0, %%xmm8\n"
8674           "movaps %%xmm0, %%xmm12\n"
8675           "addps %%xmm4, %%xmm8\n"
8676           "subps %%xmm4, %%xmm12\n"
8677           "movaps %%xmm1, %%xmm9\n"
8678           "movaps %%xmm1, %%xmm13\n"
8679           "addps %%xmm5, %%xmm9\n"
8680           "subps %%xmm5, %%xmm13\n"
8681           "movaps %%xmm2, %%xmm10\n"
8682           "movaps %%xmm2, %%xmm14\n"
8683           "addps %%xmm6, %%xmm10\n"
8684           "subps %%xmm6, %%xmm14\n"
8685           "movaps %%xmm3, %%xmm11\n"
8686           "movaps %%xmm3, %%xmm15\n"
8687           "addps %%xmm7, %%xmm11\n"
8688           "subps %%xmm7, %%xmm15\n"
8689           "movups %%xmm8, (%0)\n"
8690           "movups %%xmm9, (%1)\n"
8691           "movups %%xmm10, (%2)\n"
8692           "movups %%xmm11, (%3)\n"
8693           "movups %%xmm12, (%4)\n"
8694           "movups %%xmm13, (%5)\n"
8695           "movups %%xmm14, (%6)\n"
8696           "movups %%xmm15, (%7)\n"
8697           :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
8698         );
8699       }
8700     }
8701     for (int j = 0; j < 32768; j += 16384) {
8702       for (int k = 0; k < 2048; k += 4) {
8703         __asm__ volatile (
8704           "movups (%0), %%xmm0\n"
8705           "movups (%1), %%xmm1\n"
8706           "movups (%2), %%xmm2\n"
8707           "movups (%3), %%xmm3\n"
8708           "movups (%4), %%xmm4\n"
8709           "movups (%5), %%xmm5\n"
8710           "movups (%6), %%xmm6\n"
8711           "movups (%7), %%xmm7\n"
8712           "movaps %%xmm0, %%xmm8\n"
8713           "movaps %%xmm0, %%xmm9\n"
8714           "addps %%xmm1, %%xmm8\n"
8715           "subps %%xmm1, %%xmm9\n"
8716           "movaps %%xmm2, %%xmm10\n"
8717           "movaps %%xmm2, %%xmm11\n"
8718           "addps %%xmm3, %%xmm10\n"
8719           "subps %%xmm3, %%xmm11\n"
8720           "movaps %%xmm4, %%xmm12\n"
8721           "movaps %%xmm4, %%xmm13\n"
8722           "addps %%xmm5, %%xmm12\n"
8723           "subps %%xmm5, %%xmm13\n"
8724           "movaps %%xmm6, %%xmm14\n"
8725           "movaps %%xmm6, %%xmm15\n"
8726           "addps %%xmm7, %%xmm14\n"
8727           "subps %%xmm7, %%xmm15\n"
8728           "movaps %%xmm8, %%xmm0\n"
8729           "movaps %%xmm8, %%xmm2\n"
8730           "addps %%xmm10, %%xmm0\n"
8731           "subps %%xmm10, %%xmm2\n"
8732           "movaps %%xmm9, %%xmm1\n"
8733           "movaps %%xmm9, %%xmm3\n"
8734           "addps %%xmm11, %%xmm1\n"
8735           "subps %%xmm11, %%xmm3\n"
8736           "movaps %%xmm12, %%xmm4\n"
8737           "movaps %%xmm12, %%xmm6\n"
8738           "addps %%xmm14, %%xmm4\n"
8739           "subps %%xmm14, %%xmm6\n"
8740           "movaps %%xmm13, %%xmm5\n"
8741           "movaps %%xmm13, %%xmm7\n"
8742           "addps %%xmm15, %%xmm5\n"
8743           "subps %%xmm15, %%xmm7\n"
8744           "movaps %%xmm0, %%xmm8\n"
8745           "movaps %%xmm0, %%xmm12\n"
8746           "addps %%xmm4, %%xmm8\n"
8747           "subps %%xmm4, %%xmm12\n"
8748           "movaps %%xmm1, %%xmm9\n"
8749           "movaps %%xmm1, %%xmm13\n"
8750           "addps %%xmm5, %%xmm9\n"
8751           "subps %%xmm5, %%xmm13\n"
8752           "movaps %%xmm2, %%xmm10\n"
8753           "movaps %%xmm2, %%xmm14\n"
8754           "addps %%xmm6, %%xmm10\n"
8755           "subps %%xmm6, %%xmm14\n"
8756           "movaps %%xmm3, %%xmm11\n"
8757           "movaps %%xmm3, %%xmm15\n"
8758           "addps %%xmm7, %%xmm11\n"
8759           "subps %%xmm7, %%xmm15\n"
8760           "movups %%xmm8, (%0)\n"
8761           "movups %%xmm9, (%1)\n"
8762           "movups %%xmm10, (%2)\n"
8763           "movups %%xmm11, (%3)\n"
8764           "movups %%xmm12, (%4)\n"
8765           "movups %%xmm13, (%5)\n"
8766           "movups %%xmm14, (%6)\n"
8767           "movups %%xmm15, (%7)\n"
8768           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
8769         );
8770       }
8771     }
8772     for (int j = 0; j < 32768; j += 32768) {
8773       for (int k = 0; k < 16384; k += 4) {
8774         __asm__ volatile (
8775           "movups (%0), %%xmm0\n"
8776           "movups (%1), %%xmm1\n"
8777           "movaps %%xmm0, %%xmm8\n"
8778           "movaps %%xmm0, %%xmm9\n"
8779           "addps %%xmm1, %%xmm8\n"
8780           "subps %%xmm1, %%xmm9\n"
8781           "movups %%xmm8, (%0)\n"
8782           "movups %%xmm9, (%1)\n"
8783           :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
8784         );
8785       }
8786     }
8787     return;
8788   }
8789   if (depth == 18) {
8790     helper_float_24_recursive(buf + 0, 15);
8791     helper_float_24_recursive(buf + 32768, 15);
8792     helper_float_24_recursive(buf + 65536, 15);
8793     helper_float_24_recursive(buf + 98304, 15);
8794     helper_float_24_recursive(buf + 131072, 15);
8795     helper_float_24_recursive(buf + 163840, 15);
8796     helper_float_24_recursive(buf + 196608, 15);
8797     helper_float_24_recursive(buf + 229376, 15);
8798     for (int j = 0; j < 262144; j += 262144) {
8799       for (int k = 0; k < 32768; k += 4) {
8800         __asm__ volatile (
8801           "movups (%0), %%xmm0\n"
8802           "movups (%1), %%xmm1\n"
8803           "movups (%2), %%xmm2\n"
8804           "movups (%3), %%xmm3\n"
8805           "movups (%4), %%xmm4\n"
8806           "movups (%5), %%xmm5\n"
8807           "movups (%6), %%xmm6\n"
8808           "movups (%7), %%xmm7\n"
8809           "movaps %%xmm0, %%xmm8\n"
8810           "movaps %%xmm0, %%xmm9\n"
8811           "addps %%xmm1, %%xmm8\n"
8812           "subps %%xmm1, %%xmm9\n"
8813           "movaps %%xmm2, %%xmm10\n"
8814           "movaps %%xmm2, %%xmm11\n"
8815           "addps %%xmm3, %%xmm10\n"
8816           "subps %%xmm3, %%xmm11\n"
8817           "movaps %%xmm4, %%xmm12\n"
8818           "movaps %%xmm4, %%xmm13\n"
8819           "addps %%xmm5, %%xmm12\n"
8820           "subps %%xmm5, %%xmm13\n"
8821           "movaps %%xmm6, %%xmm14\n"
8822           "movaps %%xmm6, %%xmm15\n"
8823           "addps %%xmm7, %%xmm14\n"
8824           "subps %%xmm7, %%xmm15\n"
8825           "movaps %%xmm8, %%xmm0\n"
8826           "movaps %%xmm8, %%xmm2\n"
8827           "addps %%xmm10, %%xmm0\n"
8828           "subps %%xmm10, %%xmm2\n"
8829           "movaps %%xmm9, %%xmm1\n"
8830           "movaps %%xmm9, %%xmm3\n"
8831           "addps %%xmm11, %%xmm1\n"
8832           "subps %%xmm11, %%xmm3\n"
8833           "movaps %%xmm12, %%xmm4\n"
8834           "movaps %%xmm12, %%xmm6\n"
8835           "addps %%xmm14, %%xmm4\n"
8836           "subps %%xmm14, %%xmm6\n"
8837           "movaps %%xmm13, %%xmm5\n"
8838           "movaps %%xmm13, %%xmm7\n"
8839           "addps %%xmm15, %%xmm5\n"
8840           "subps %%xmm15, %%xmm7\n"
8841           "movaps %%xmm0, %%xmm8\n"
8842           "movaps %%xmm0, %%xmm12\n"
8843           "addps %%xmm4, %%xmm8\n"
8844           "subps %%xmm4, %%xmm12\n"
8845           "movaps %%xmm1, %%xmm9\n"
8846           "movaps %%xmm1, %%xmm13\n"
8847           "addps %%xmm5, %%xmm9\n"
8848           "subps %%xmm5, %%xmm13\n"
8849           "movaps %%xmm2, %%xmm10\n"
8850           "movaps %%xmm2, %%xmm14\n"
8851           "addps %%xmm6, %%xmm10\n"
8852           "subps %%xmm6, %%xmm14\n"
8853           "movaps %%xmm3, %%xmm11\n"
8854           "movaps %%xmm3, %%xmm15\n"
8855           "addps %%xmm7, %%xmm11\n"
8856           "subps %%xmm7, %%xmm15\n"
8857           "movups %%xmm8, (%0)\n"
8858           "movups %%xmm9, (%1)\n"
8859           "movups %%xmm10, (%2)\n"
8860           "movups %%xmm11, (%3)\n"
8861           "movups %%xmm12, (%4)\n"
8862           "movups %%xmm13, (%5)\n"
8863           "movups %%xmm14, (%6)\n"
8864           "movups %%xmm15, (%7)\n"
8865           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
8866         );
8867       }
8868     }
8869     return;
8870   }
8871   if (depth == 21) {
8872     helper_float_24_recursive(buf + 0, 18);
8873     helper_float_24_recursive(buf + 262144, 18);
8874     helper_float_24_recursive(buf + 524288, 18);
8875     helper_float_24_recursive(buf + 786432, 18);
8876     helper_float_24_recursive(buf + 1048576, 18);
8877     helper_float_24_recursive(buf + 1310720, 18);
8878     helper_float_24_recursive(buf + 1572864, 18);
8879     helper_float_24_recursive(buf + 1835008, 18);
8880     for (int j = 0; j < 2097152; j += 2097152) {
8881       for (int k = 0; k < 262144; k += 4) {
8882         __asm__ volatile (
8883           "movups (%0), %%xmm0\n"
8884           "movups (%1), %%xmm1\n"
8885           "movups (%2), %%xmm2\n"
8886           "movups (%3), %%xmm3\n"
8887           "movups (%4), %%xmm4\n"
8888           "movups (%5), %%xmm5\n"
8889           "movups (%6), %%xmm6\n"
8890           "movups (%7), %%xmm7\n"
8891           "movaps %%xmm0, %%xmm8\n"
8892           "movaps %%xmm0, %%xmm9\n"
8893           "addps %%xmm1, %%xmm8\n"
8894           "subps %%xmm1, %%xmm9\n"
8895           "movaps %%xmm2, %%xmm10\n"
8896           "movaps %%xmm2, %%xmm11\n"
8897           "addps %%xmm3, %%xmm10\n"
8898           "subps %%xmm3, %%xmm11\n"
8899           "movaps %%xmm4, %%xmm12\n"
8900           "movaps %%xmm4, %%xmm13\n"
8901           "addps %%xmm5, %%xmm12\n"
8902           "subps %%xmm5, %%xmm13\n"
8903           "movaps %%xmm6, %%xmm14\n"
8904           "movaps %%xmm6, %%xmm15\n"
8905           "addps %%xmm7, %%xmm14\n"
8906           "subps %%xmm7, %%xmm15\n"
8907           "movaps %%xmm8, %%xmm0\n"
8908           "movaps %%xmm8, %%xmm2\n"
8909           "addps %%xmm10, %%xmm0\n"
8910           "subps %%xmm10, %%xmm2\n"
8911           "movaps %%xmm9, %%xmm1\n"
8912           "movaps %%xmm9, %%xmm3\n"
8913           "addps %%xmm11, %%xmm1\n"
8914           "subps %%xmm11, %%xmm3\n"
8915           "movaps %%xmm12, %%xmm4\n"
8916           "movaps %%xmm12, %%xmm6\n"
8917           "addps %%xmm14, %%xmm4\n"
8918           "subps %%xmm14, %%xmm6\n"
8919           "movaps %%xmm13, %%xmm5\n"
8920           "movaps %%xmm13, %%xmm7\n"
8921           "addps %%xmm15, %%xmm5\n"
8922           "subps %%xmm15, %%xmm7\n"
8923           "movaps %%xmm0, %%xmm8\n"
8924           "movaps %%xmm0, %%xmm12\n"
8925           "addps %%xmm4, %%xmm8\n"
8926           "subps %%xmm4, %%xmm12\n"
8927           "movaps %%xmm1, %%xmm9\n"
8928           "movaps %%xmm1, %%xmm13\n"
8929           "addps %%xmm5, %%xmm9\n"
8930           "subps %%xmm5, %%xmm13\n"
8931           "movaps %%xmm2, %%xmm10\n"
8932           "movaps %%xmm2, %%xmm14\n"
8933           "addps %%xmm6, %%xmm10\n"
8934           "subps %%xmm6, %%xmm14\n"
8935           "movaps %%xmm3, %%xmm11\n"
8936           "movaps %%xmm3, %%xmm15\n"
8937           "addps %%xmm7, %%xmm11\n"
8938           "subps %%xmm7, %%xmm15\n"
8939           "movups %%xmm8, (%0)\n"
8940           "movups %%xmm9, (%1)\n"
8941           "movups %%xmm10, (%2)\n"
8942           "movups %%xmm11, (%3)\n"
8943           "movups %%xmm12, (%4)\n"
8944           "movups %%xmm13, (%5)\n"
8945           "movups %%xmm14, (%6)\n"
8946           "movups %%xmm15, (%7)\n"
8947           :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
8948         );
8949       }
8950     }
8951     return;
8952   }
8953   if (depth == 24) {
8954     helper_float_24_recursive(buf + 0, 21);
8955     helper_float_24_recursive(buf + 2097152, 21);
8956     helper_float_24_recursive(buf + 4194304, 21);
8957     helper_float_24_recursive(buf + 6291456, 21);
8958     helper_float_24_recursive(buf + 8388608, 21);
8959     helper_float_24_recursive(buf + 10485760, 21);
8960     helper_float_24_recursive(buf + 12582912, 21);
8961     helper_float_24_recursive(buf + 14680064, 21);
8962     for (int j = 0; j < 16777216; j += 16777216) {
8963       for (int k = 0; k < 2097152; k += 4) {
8964         __asm__ volatile (
8965           "movups (%0), %%xmm0\n"
8966           "movups (%1), %%xmm1\n"
8967           "movups (%2), %%xmm2\n"
8968           "movups (%3), %%xmm3\n"
8969           "movups (%4), %%xmm4\n"
8970           "movups (%5), %%xmm5\n"
8971           "movups (%6), %%xmm6\n"
8972           "movups (%7), %%xmm7\n"
8973           "movaps %%xmm0, %%xmm8\n"
8974           "movaps %%xmm0, %%xmm9\n"
8975           "addps %%xmm1, %%xmm8\n"
8976           "subps %%xmm1, %%xmm9\n"
8977           "movaps %%xmm2, %%xmm10\n"
8978           "movaps %%xmm2, %%xmm11\n"
8979           "addps %%xmm3, %%xmm10\n"
8980           "subps %%xmm3, %%xmm11\n"
8981           "movaps %%xmm4, %%xmm12\n"
8982           "movaps %%xmm4, %%xmm13\n"
8983           "addps %%xmm5, %%xmm12\n"
8984           "subps %%xmm5, %%xmm13\n"
8985           "movaps %%xmm6, %%xmm14\n"
8986           "movaps %%xmm6, %%xmm15\n"
8987           "addps %%xmm7, %%xmm14\n"
8988           "subps %%xmm7, %%xmm15\n"
8989           "movaps %%xmm8, %%xmm0\n"
8990           "movaps %%xmm8, %%xmm2\n"
8991           "addps %%xmm10, %%xmm0\n"
8992           "subps %%xmm10, %%xmm2\n"
8993           "movaps %%xmm9, %%xmm1\n"
8994           "movaps %%xmm9, %%xmm3\n"
8995           "addps %%xmm11, %%xmm1\n"
8996           "subps %%xmm11, %%xmm3\n"
8997           "movaps %%xmm12, %%xmm4\n"
8998           "movaps %%xmm12, %%xmm6\n"
8999           "addps %%xmm14, %%xmm4\n"
9000           "subps %%xmm14, %%xmm6\n"
9001           "movaps %%xmm13, %%xmm5\n"
9002           "movaps %%xmm13, %%xmm7\n"
9003           "addps %%xmm15, %%xmm5\n"
9004           "subps %%xmm15, %%xmm7\n"
9005           "movaps %%xmm0, %%xmm8\n"
9006           "movaps %%xmm0, %%xmm12\n"
9007           "addps %%xmm4, %%xmm8\n"
9008           "subps %%xmm4, %%xmm12\n"
9009           "movaps %%xmm1, %%xmm9\n"
9010           "movaps %%xmm1, %%xmm13\n"
9011           "addps %%xmm5, %%xmm9\n"
9012           "subps %%xmm5, %%xmm13\n"
9013           "movaps %%xmm2, %%xmm10\n"
9014           "movaps %%xmm2, %%xmm14\n"
9015           "addps %%xmm6, %%xmm10\n"
9016           "subps %%xmm6, %%xmm14\n"
9017           "movaps %%xmm3, %%xmm11\n"
9018           "movaps %%xmm3, %%xmm15\n"
9019           "addps %%xmm7, %%xmm11\n"
9020           "subps %%xmm7, %%xmm15\n"
9021           "movups %%xmm8, (%0)\n"
9022           "movups %%xmm9, (%1)\n"
9023           "movups %%xmm10, (%2)\n"
9024           "movups %%xmm11, (%3)\n"
9025           "movups %%xmm12, (%4)\n"
9026           "movups %%xmm13, (%5)\n"
9027           "movups %%xmm14, (%6)\n"
9028           "movups %%xmm15, (%7)\n"
9029           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
9030         );
9031       }
9032     }
9033     return;
9034   }
9035 }
void helper_float_24(float *buf);
/*
 * Entry point for the size-2^24 float Hadamard transform.
 *
 * buf: pointer to 16777216 contiguous floats, transformed in place.
 *
 * Delegates to the recursive helper, starting at the full depth of 24
 * (i.e. the whole 1 << 24 element buffer).
 */
void helper_float_24(float *buf) {
  const int full_depth = 24; /* log2 of the transform length */
  helper_float_24_recursive(buf, full_depth);
}
9040 void helper_float_25_recursive(float *buf, int depth);
helper_float_25_recursive(float * buf,int depth)9041 void helper_float_25_recursive(float *buf, int depth) {
9042   if (depth == 8) {
9043     for (int j = 0; j < 256; j += 32) {
9044       for (int k = 0; k < 4; k += 4) {
9045         __asm__ volatile (
9046           "movups (%0), %%xmm0\n"
9047           "movups (%1), %%xmm1\n"
9048           "movups (%2), %%xmm2\n"
9049           "movups (%3), %%xmm3\n"
9050           "movups (%4), %%xmm4\n"
9051           "movups (%5), %%xmm5\n"
9052           "movups (%6), %%xmm6\n"
9053           "movups (%7), %%xmm7\n"
9054           "movaps %%xmm0, %%xmm8\n"
9055           "shufps $160, %%xmm8, %%xmm8\n"
9056           "shufps $245, %%xmm0, %%xmm0\n"
9057           "xorps %%xmm9, %%xmm9\n"
9058           "subps %%xmm0, %%xmm9\n"
9059           "addsubps %%xmm9, %%xmm8\n"
9060           "movaps %%xmm8, %%xmm0\n"
9061           "movaps %%xmm1, %%xmm8\n"
9062           "shufps $160, %%xmm8, %%xmm8\n"
9063           "shufps $245, %%xmm1, %%xmm1\n"
9064           "xorps %%xmm9, %%xmm9\n"
9065           "subps %%xmm1, %%xmm9\n"
9066           "addsubps %%xmm9, %%xmm8\n"
9067           "movaps %%xmm8, %%xmm1\n"
9068           "movaps %%xmm2, %%xmm8\n"
9069           "shufps $160, %%xmm8, %%xmm8\n"
9070           "shufps $245, %%xmm2, %%xmm2\n"
9071           "xorps %%xmm9, %%xmm9\n"
9072           "subps %%xmm2, %%xmm9\n"
9073           "addsubps %%xmm9, %%xmm8\n"
9074           "movaps %%xmm8, %%xmm2\n"
9075           "movaps %%xmm3, %%xmm8\n"
9076           "shufps $160, %%xmm8, %%xmm8\n"
9077           "shufps $245, %%xmm3, %%xmm3\n"
9078           "xorps %%xmm9, %%xmm9\n"
9079           "subps %%xmm3, %%xmm9\n"
9080           "addsubps %%xmm9, %%xmm8\n"
9081           "movaps %%xmm8, %%xmm3\n"
9082           "movaps %%xmm4, %%xmm8\n"
9083           "shufps $160, %%xmm8, %%xmm8\n"
9084           "shufps $245, %%xmm4, %%xmm4\n"
9085           "xorps %%xmm9, %%xmm9\n"
9086           "subps %%xmm4, %%xmm9\n"
9087           "addsubps %%xmm9, %%xmm8\n"
9088           "movaps %%xmm8, %%xmm4\n"
9089           "movaps %%xmm5, %%xmm8\n"
9090           "shufps $160, %%xmm8, %%xmm8\n"
9091           "shufps $245, %%xmm5, %%xmm5\n"
9092           "xorps %%xmm9, %%xmm9\n"
9093           "subps %%xmm5, %%xmm9\n"
9094           "addsubps %%xmm9, %%xmm8\n"
9095           "movaps %%xmm8, %%xmm5\n"
9096           "movaps %%xmm6, %%xmm8\n"
9097           "shufps $160, %%xmm8, %%xmm8\n"
9098           "shufps $245, %%xmm6, %%xmm6\n"
9099           "xorps %%xmm9, %%xmm9\n"
9100           "subps %%xmm6, %%xmm9\n"
9101           "addsubps %%xmm9, %%xmm8\n"
9102           "movaps %%xmm8, %%xmm6\n"
9103           "movaps %%xmm7, %%xmm8\n"
9104           "shufps $160, %%xmm8, %%xmm8\n"
9105           "shufps $245, %%xmm7, %%xmm7\n"
9106           "xorps %%xmm9, %%xmm9\n"
9107           "subps %%xmm7, %%xmm9\n"
9108           "addsubps %%xmm9, %%xmm8\n"
9109           "movaps %%xmm8, %%xmm7\n"
9110           "movaps %%xmm0, %%xmm8\n"
9111           "shufps $68, %%xmm8, %%xmm8\n"
9112           "xorps %%xmm9, %%xmm9\n"
9113           "movaps %%xmm0, %%xmm10\n"
9114           "shufps $14, %%xmm9, %%xmm10\n"
9115           "movaps %%xmm0, %%xmm11\n"
9116           "shufps $224, %%xmm11, %%xmm9\n"
9117           "addps %%xmm8, %%xmm10\n"
9118           "subps %%xmm9, %%xmm10\n"
9119           "movaps %%xmm10, %%xmm0\n"
9120           "movaps %%xmm1, %%xmm8\n"
9121           "shufps $68, %%xmm8, %%xmm8\n"
9122           "xorps %%xmm9, %%xmm9\n"
9123           "movaps %%xmm1, %%xmm10\n"
9124           "shufps $14, %%xmm9, %%xmm10\n"
9125           "movaps %%xmm1, %%xmm11\n"
9126           "shufps $224, %%xmm11, %%xmm9\n"
9127           "addps %%xmm8, %%xmm10\n"
9128           "subps %%xmm9, %%xmm10\n"
9129           "movaps %%xmm10, %%xmm1\n"
9130           "movaps %%xmm2, %%xmm8\n"
9131           "shufps $68, %%xmm8, %%xmm8\n"
9132           "xorps %%xmm9, %%xmm9\n"
9133           "movaps %%xmm2, %%xmm10\n"
9134           "shufps $14, %%xmm9, %%xmm10\n"
9135           "movaps %%xmm2, %%xmm11\n"
9136           "shufps $224, %%xmm11, %%xmm9\n"
9137           "addps %%xmm8, %%xmm10\n"
9138           "subps %%xmm9, %%xmm10\n"
9139           "movaps %%xmm10, %%xmm2\n"
9140           "movaps %%xmm3, %%xmm8\n"
9141           "shufps $68, %%xmm8, %%xmm8\n"
9142           "xorps %%xmm9, %%xmm9\n"
9143           "movaps %%xmm3, %%xmm10\n"
9144           "shufps $14, %%xmm9, %%xmm10\n"
9145           "movaps %%xmm3, %%xmm11\n"
9146           "shufps $224, %%xmm11, %%xmm9\n"
9147           "addps %%xmm8, %%xmm10\n"
9148           "subps %%xmm9, %%xmm10\n"
9149           "movaps %%xmm10, %%xmm3\n"
9150           "movaps %%xmm4, %%xmm8\n"
9151           "shufps $68, %%xmm8, %%xmm8\n"
9152           "xorps %%xmm9, %%xmm9\n"
9153           "movaps %%xmm4, %%xmm10\n"
9154           "shufps $14, %%xmm9, %%xmm10\n"
9155           "movaps %%xmm4, %%xmm11\n"
9156           "shufps $224, %%xmm11, %%xmm9\n"
9157           "addps %%xmm8, %%xmm10\n"
9158           "subps %%xmm9, %%xmm10\n"
9159           "movaps %%xmm10, %%xmm4\n"
9160           "movaps %%xmm5, %%xmm8\n"
9161           "shufps $68, %%xmm8, %%xmm8\n"
9162           "xorps %%xmm9, %%xmm9\n"
9163           "movaps %%xmm5, %%xmm10\n"
9164           "shufps $14, %%xmm9, %%xmm10\n"
9165           "movaps %%xmm5, %%xmm11\n"
9166           "shufps $224, %%xmm11, %%xmm9\n"
9167           "addps %%xmm8, %%xmm10\n"
9168           "subps %%xmm9, %%xmm10\n"
9169           "movaps %%xmm10, %%xmm5\n"
9170           "movaps %%xmm6, %%xmm8\n"
9171           "shufps $68, %%xmm8, %%xmm8\n"
9172           "xorps %%xmm9, %%xmm9\n"
9173           "movaps %%xmm6, %%xmm10\n"
9174           "shufps $14, %%xmm9, %%xmm10\n"
9175           "movaps %%xmm6, %%xmm11\n"
9176           "shufps $224, %%xmm11, %%xmm9\n"
9177           "addps %%xmm8, %%xmm10\n"
9178           "subps %%xmm9, %%xmm10\n"
9179           "movaps %%xmm10, %%xmm6\n"
9180           "movaps %%xmm7, %%xmm8\n"
9181           "shufps $68, %%xmm8, %%xmm8\n"
9182           "xorps %%xmm9, %%xmm9\n"
9183           "movaps %%xmm7, %%xmm10\n"
9184           "shufps $14, %%xmm9, %%xmm10\n"
9185           "movaps %%xmm7, %%xmm11\n"
9186           "shufps $224, %%xmm11, %%xmm9\n"
9187           "addps %%xmm8, %%xmm10\n"
9188           "subps %%xmm9, %%xmm10\n"
9189           "movaps %%xmm10, %%xmm7\n"
9190           "movaps %%xmm0, %%xmm8\n"
9191           "movaps %%xmm0, %%xmm9\n"
9192           "addps %%xmm1, %%xmm8\n"
9193           "subps %%xmm1, %%xmm9\n"
9194           "movaps %%xmm2, %%xmm10\n"
9195           "movaps %%xmm2, %%xmm11\n"
9196           "addps %%xmm3, %%xmm10\n"
9197           "subps %%xmm3, %%xmm11\n"
9198           "movaps %%xmm4, %%xmm12\n"
9199           "movaps %%xmm4, %%xmm13\n"
9200           "addps %%xmm5, %%xmm12\n"
9201           "subps %%xmm5, %%xmm13\n"
9202           "movaps %%xmm6, %%xmm14\n"
9203           "movaps %%xmm6, %%xmm15\n"
9204           "addps %%xmm7, %%xmm14\n"
9205           "subps %%xmm7, %%xmm15\n"
9206           "movaps %%xmm8, %%xmm0\n"
9207           "movaps %%xmm8, %%xmm2\n"
9208           "addps %%xmm10, %%xmm0\n"
9209           "subps %%xmm10, %%xmm2\n"
9210           "movaps %%xmm9, %%xmm1\n"
9211           "movaps %%xmm9, %%xmm3\n"
9212           "addps %%xmm11, %%xmm1\n"
9213           "subps %%xmm11, %%xmm3\n"
9214           "movaps %%xmm12, %%xmm4\n"
9215           "movaps %%xmm12, %%xmm6\n"
9216           "addps %%xmm14, %%xmm4\n"
9217           "subps %%xmm14, %%xmm6\n"
9218           "movaps %%xmm13, %%xmm5\n"
9219           "movaps %%xmm13, %%xmm7\n"
9220           "addps %%xmm15, %%xmm5\n"
9221           "subps %%xmm15, %%xmm7\n"
9222           "movaps %%xmm0, %%xmm8\n"
9223           "movaps %%xmm0, %%xmm12\n"
9224           "addps %%xmm4, %%xmm8\n"
9225           "subps %%xmm4, %%xmm12\n"
9226           "movaps %%xmm1, %%xmm9\n"
9227           "movaps %%xmm1, %%xmm13\n"
9228           "addps %%xmm5, %%xmm9\n"
9229           "subps %%xmm5, %%xmm13\n"
9230           "movaps %%xmm2, %%xmm10\n"
9231           "movaps %%xmm2, %%xmm14\n"
9232           "addps %%xmm6, %%xmm10\n"
9233           "subps %%xmm6, %%xmm14\n"
9234           "movaps %%xmm3, %%xmm11\n"
9235           "movaps %%xmm3, %%xmm15\n"
9236           "addps %%xmm7, %%xmm11\n"
9237           "subps %%xmm7, %%xmm15\n"
9238           "movups %%xmm8, (%0)\n"
9239           "movups %%xmm9, (%1)\n"
9240           "movups %%xmm10, (%2)\n"
9241           "movups %%xmm11, (%3)\n"
9242           "movups %%xmm12, (%4)\n"
9243           "movups %%xmm13, (%5)\n"
9244           "movups %%xmm14, (%6)\n"
9245           "movups %%xmm15, (%7)\n"
9246           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
9247         );
9248       }
9249     }
9250     for (int j = 0; j < 256; j += 256) {
9251       for (int k = 0; k < 32; k += 4) {
9252         __asm__ volatile (
9253           "movups (%0), %%xmm0\n"
9254           "movups (%1), %%xmm1\n"
9255           "movups (%2), %%xmm2\n"
9256           "movups (%3), %%xmm3\n"
9257           "movups (%4), %%xmm4\n"
9258           "movups (%5), %%xmm5\n"
9259           "movups (%6), %%xmm6\n"
9260           "movups (%7), %%xmm7\n"
9261           "movaps %%xmm0, %%xmm8\n"
9262           "movaps %%xmm0, %%xmm9\n"
9263           "addps %%xmm1, %%xmm8\n"
9264           "subps %%xmm1, %%xmm9\n"
9265           "movaps %%xmm2, %%xmm10\n"
9266           "movaps %%xmm2, %%xmm11\n"
9267           "addps %%xmm3, %%xmm10\n"
9268           "subps %%xmm3, %%xmm11\n"
9269           "movaps %%xmm4, %%xmm12\n"
9270           "movaps %%xmm4, %%xmm13\n"
9271           "addps %%xmm5, %%xmm12\n"
9272           "subps %%xmm5, %%xmm13\n"
9273           "movaps %%xmm6, %%xmm14\n"
9274           "movaps %%xmm6, %%xmm15\n"
9275           "addps %%xmm7, %%xmm14\n"
9276           "subps %%xmm7, %%xmm15\n"
9277           "movaps %%xmm8, %%xmm0\n"
9278           "movaps %%xmm8, %%xmm2\n"
9279           "addps %%xmm10, %%xmm0\n"
9280           "subps %%xmm10, %%xmm2\n"
9281           "movaps %%xmm9, %%xmm1\n"
9282           "movaps %%xmm9, %%xmm3\n"
9283           "addps %%xmm11, %%xmm1\n"
9284           "subps %%xmm11, %%xmm3\n"
9285           "movaps %%xmm12, %%xmm4\n"
9286           "movaps %%xmm12, %%xmm6\n"
9287           "addps %%xmm14, %%xmm4\n"
9288           "subps %%xmm14, %%xmm6\n"
9289           "movaps %%xmm13, %%xmm5\n"
9290           "movaps %%xmm13, %%xmm7\n"
9291           "addps %%xmm15, %%xmm5\n"
9292           "subps %%xmm15, %%xmm7\n"
9293           "movaps %%xmm0, %%xmm8\n"
9294           "movaps %%xmm0, %%xmm12\n"
9295           "addps %%xmm4, %%xmm8\n"
9296           "subps %%xmm4, %%xmm12\n"
9297           "movaps %%xmm1, %%xmm9\n"
9298           "movaps %%xmm1, %%xmm13\n"
9299           "addps %%xmm5, %%xmm9\n"
9300           "subps %%xmm5, %%xmm13\n"
9301           "movaps %%xmm2, %%xmm10\n"
9302           "movaps %%xmm2, %%xmm14\n"
9303           "addps %%xmm6, %%xmm10\n"
9304           "subps %%xmm6, %%xmm14\n"
9305           "movaps %%xmm3, %%xmm11\n"
9306           "movaps %%xmm3, %%xmm15\n"
9307           "addps %%xmm7, %%xmm11\n"
9308           "subps %%xmm7, %%xmm15\n"
9309           "movups %%xmm8, (%0)\n"
9310           "movups %%xmm9, (%1)\n"
9311           "movups %%xmm10, (%2)\n"
9312           "movups %%xmm11, (%3)\n"
9313           "movups %%xmm12, (%4)\n"
9314           "movups %%xmm13, (%5)\n"
9315           "movups %%xmm14, (%6)\n"
9316           "movups %%xmm15, (%7)\n"
9317           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
9318         );
9319       }
9320     }
9321     return;
9322   }
9323   if (depth == 11) {
9324     helper_float_25_recursive(buf + 0, 8);
9325     helper_float_25_recursive(buf + 256, 8);
9326     helper_float_25_recursive(buf + 512, 8);
9327     helper_float_25_recursive(buf + 768, 8);
9328     helper_float_25_recursive(buf + 1024, 8);
9329     helper_float_25_recursive(buf + 1280, 8);
9330     helper_float_25_recursive(buf + 1536, 8);
9331     helper_float_25_recursive(buf + 1792, 8);
9332     for (int j = 0; j < 2048; j += 2048) {
9333       for (int k = 0; k < 256; k += 4) {
9334         __asm__ volatile (
9335           "movups (%0), %%xmm0\n"
9336           "movups (%1), %%xmm1\n"
9337           "movups (%2), %%xmm2\n"
9338           "movups (%3), %%xmm3\n"
9339           "movups (%4), %%xmm4\n"
9340           "movups (%5), %%xmm5\n"
9341           "movups (%6), %%xmm6\n"
9342           "movups (%7), %%xmm7\n"
9343           "movaps %%xmm0, %%xmm8\n"
9344           "movaps %%xmm0, %%xmm9\n"
9345           "addps %%xmm1, %%xmm8\n"
9346           "subps %%xmm1, %%xmm9\n"
9347           "movaps %%xmm2, %%xmm10\n"
9348           "movaps %%xmm2, %%xmm11\n"
9349           "addps %%xmm3, %%xmm10\n"
9350           "subps %%xmm3, %%xmm11\n"
9351           "movaps %%xmm4, %%xmm12\n"
9352           "movaps %%xmm4, %%xmm13\n"
9353           "addps %%xmm5, %%xmm12\n"
9354           "subps %%xmm5, %%xmm13\n"
9355           "movaps %%xmm6, %%xmm14\n"
9356           "movaps %%xmm6, %%xmm15\n"
9357           "addps %%xmm7, %%xmm14\n"
9358           "subps %%xmm7, %%xmm15\n"
9359           "movaps %%xmm8, %%xmm0\n"
9360           "movaps %%xmm8, %%xmm2\n"
9361           "addps %%xmm10, %%xmm0\n"
9362           "subps %%xmm10, %%xmm2\n"
9363           "movaps %%xmm9, %%xmm1\n"
9364           "movaps %%xmm9, %%xmm3\n"
9365           "addps %%xmm11, %%xmm1\n"
9366           "subps %%xmm11, %%xmm3\n"
9367           "movaps %%xmm12, %%xmm4\n"
9368           "movaps %%xmm12, %%xmm6\n"
9369           "addps %%xmm14, %%xmm4\n"
9370           "subps %%xmm14, %%xmm6\n"
9371           "movaps %%xmm13, %%xmm5\n"
9372           "movaps %%xmm13, %%xmm7\n"
9373           "addps %%xmm15, %%xmm5\n"
9374           "subps %%xmm15, %%xmm7\n"
9375           "movaps %%xmm0, %%xmm8\n"
9376           "movaps %%xmm0, %%xmm12\n"
9377           "addps %%xmm4, %%xmm8\n"
9378           "subps %%xmm4, %%xmm12\n"
9379           "movaps %%xmm1, %%xmm9\n"
9380           "movaps %%xmm1, %%xmm13\n"
9381           "addps %%xmm5, %%xmm9\n"
9382           "subps %%xmm5, %%xmm13\n"
9383           "movaps %%xmm2, %%xmm10\n"
9384           "movaps %%xmm2, %%xmm14\n"
9385           "addps %%xmm6, %%xmm10\n"
9386           "subps %%xmm6, %%xmm14\n"
9387           "movaps %%xmm3, %%xmm11\n"
9388           "movaps %%xmm3, %%xmm15\n"
9389           "addps %%xmm7, %%xmm11\n"
9390           "subps %%xmm7, %%xmm15\n"
9391           "movups %%xmm8, (%0)\n"
9392           "movups %%xmm9, (%1)\n"
9393           "movups %%xmm10, (%2)\n"
9394           "movups %%xmm11, (%3)\n"
9395           "movups %%xmm12, (%4)\n"
9396           "movups %%xmm13, (%5)\n"
9397           "movups %%xmm14, (%6)\n"
9398           "movups %%xmm15, (%7)\n"
9399           :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
9400         );
9401       }
9402     }
9403     return;
9404   }
9405   if (depth == 14) {
9406     helper_float_25_recursive(buf + 0, 11);
9407     helper_float_25_recursive(buf + 2048, 11);
9408     helper_float_25_recursive(buf + 4096, 11);
9409     helper_float_25_recursive(buf + 6144, 11);
9410     helper_float_25_recursive(buf + 8192, 11);
9411     helper_float_25_recursive(buf + 10240, 11);
9412     helper_float_25_recursive(buf + 12288, 11);
9413     helper_float_25_recursive(buf + 14336, 11);
9414     for (int j = 0; j < 16384; j += 16384) {
9415       for (int k = 0; k < 2048; k += 4) {
9416         __asm__ volatile (
9417           "movups (%0), %%xmm0\n"
9418           "movups (%1), %%xmm1\n"
9419           "movups (%2), %%xmm2\n"
9420           "movups (%3), %%xmm3\n"
9421           "movups (%4), %%xmm4\n"
9422           "movups (%5), %%xmm5\n"
9423           "movups (%6), %%xmm6\n"
9424           "movups (%7), %%xmm7\n"
9425           "movaps %%xmm0, %%xmm8\n"
9426           "movaps %%xmm0, %%xmm9\n"
9427           "addps %%xmm1, %%xmm8\n"
9428           "subps %%xmm1, %%xmm9\n"
9429           "movaps %%xmm2, %%xmm10\n"
9430           "movaps %%xmm2, %%xmm11\n"
9431           "addps %%xmm3, %%xmm10\n"
9432           "subps %%xmm3, %%xmm11\n"
9433           "movaps %%xmm4, %%xmm12\n"
9434           "movaps %%xmm4, %%xmm13\n"
9435           "addps %%xmm5, %%xmm12\n"
9436           "subps %%xmm5, %%xmm13\n"
9437           "movaps %%xmm6, %%xmm14\n"
9438           "movaps %%xmm6, %%xmm15\n"
9439           "addps %%xmm7, %%xmm14\n"
9440           "subps %%xmm7, %%xmm15\n"
9441           "movaps %%xmm8, %%xmm0\n"
9442           "movaps %%xmm8, %%xmm2\n"
9443           "addps %%xmm10, %%xmm0\n"
9444           "subps %%xmm10, %%xmm2\n"
9445           "movaps %%xmm9, %%xmm1\n"
9446           "movaps %%xmm9, %%xmm3\n"
9447           "addps %%xmm11, %%xmm1\n"
9448           "subps %%xmm11, %%xmm3\n"
9449           "movaps %%xmm12, %%xmm4\n"
9450           "movaps %%xmm12, %%xmm6\n"
9451           "addps %%xmm14, %%xmm4\n"
9452           "subps %%xmm14, %%xmm6\n"
9453           "movaps %%xmm13, %%xmm5\n"
9454           "movaps %%xmm13, %%xmm7\n"
9455           "addps %%xmm15, %%xmm5\n"
9456           "subps %%xmm15, %%xmm7\n"
9457           "movaps %%xmm0, %%xmm8\n"
9458           "movaps %%xmm0, %%xmm12\n"
9459           "addps %%xmm4, %%xmm8\n"
9460           "subps %%xmm4, %%xmm12\n"
9461           "movaps %%xmm1, %%xmm9\n"
9462           "movaps %%xmm1, %%xmm13\n"
9463           "addps %%xmm5, %%xmm9\n"
9464           "subps %%xmm5, %%xmm13\n"
9465           "movaps %%xmm2, %%xmm10\n"
9466           "movaps %%xmm2, %%xmm14\n"
9467           "addps %%xmm6, %%xmm10\n"
9468           "subps %%xmm6, %%xmm14\n"
9469           "movaps %%xmm3, %%xmm11\n"
9470           "movaps %%xmm3, %%xmm15\n"
9471           "addps %%xmm7, %%xmm11\n"
9472           "subps %%xmm7, %%xmm15\n"
9473           "movups %%xmm8, (%0)\n"
9474           "movups %%xmm9, (%1)\n"
9475           "movups %%xmm10, (%2)\n"
9476           "movups %%xmm11, (%3)\n"
9477           "movups %%xmm12, (%4)\n"
9478           "movups %%xmm13, (%5)\n"
9479           "movups %%xmm14, (%6)\n"
9480           "movups %%xmm15, (%7)\n"
9481           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
9482         );
9483       }
9484     }
9485     return;
9486   }
9487   if (depth == 17) {
9488     helper_float_25_recursive(buf + 0, 14);
9489     helper_float_25_recursive(buf + 16384, 14);
9490     helper_float_25_recursive(buf + 32768, 14);
9491     helper_float_25_recursive(buf + 49152, 14);
9492     helper_float_25_recursive(buf + 65536, 14);
9493     helper_float_25_recursive(buf + 81920, 14);
9494     helper_float_25_recursive(buf + 98304, 14);
9495     helper_float_25_recursive(buf + 114688, 14);
9496     for (int j = 0; j < 131072; j += 131072) {
9497       for (int k = 0; k < 16384; k += 4) {
9498         __asm__ volatile (
9499           "movups (%0), %%xmm0\n"
9500           "movups (%1), %%xmm1\n"
9501           "movups (%2), %%xmm2\n"
9502           "movups (%3), %%xmm3\n"
9503           "movups (%4), %%xmm4\n"
9504           "movups (%5), %%xmm5\n"
9505           "movups (%6), %%xmm6\n"
9506           "movups (%7), %%xmm7\n"
9507           "movaps %%xmm0, %%xmm8\n"
9508           "movaps %%xmm0, %%xmm9\n"
9509           "addps %%xmm1, %%xmm8\n"
9510           "subps %%xmm1, %%xmm9\n"
9511           "movaps %%xmm2, %%xmm10\n"
9512           "movaps %%xmm2, %%xmm11\n"
9513           "addps %%xmm3, %%xmm10\n"
9514           "subps %%xmm3, %%xmm11\n"
9515           "movaps %%xmm4, %%xmm12\n"
9516           "movaps %%xmm4, %%xmm13\n"
9517           "addps %%xmm5, %%xmm12\n"
9518           "subps %%xmm5, %%xmm13\n"
9519           "movaps %%xmm6, %%xmm14\n"
9520           "movaps %%xmm6, %%xmm15\n"
9521           "addps %%xmm7, %%xmm14\n"
9522           "subps %%xmm7, %%xmm15\n"
9523           "movaps %%xmm8, %%xmm0\n"
9524           "movaps %%xmm8, %%xmm2\n"
9525           "addps %%xmm10, %%xmm0\n"
9526           "subps %%xmm10, %%xmm2\n"
9527           "movaps %%xmm9, %%xmm1\n"
9528           "movaps %%xmm9, %%xmm3\n"
9529           "addps %%xmm11, %%xmm1\n"
9530           "subps %%xmm11, %%xmm3\n"
9531           "movaps %%xmm12, %%xmm4\n"
9532           "movaps %%xmm12, %%xmm6\n"
9533           "addps %%xmm14, %%xmm4\n"
9534           "subps %%xmm14, %%xmm6\n"
9535           "movaps %%xmm13, %%xmm5\n"
9536           "movaps %%xmm13, %%xmm7\n"
9537           "addps %%xmm15, %%xmm5\n"
9538           "subps %%xmm15, %%xmm7\n"
9539           "movaps %%xmm0, %%xmm8\n"
9540           "movaps %%xmm0, %%xmm12\n"
9541           "addps %%xmm4, %%xmm8\n"
9542           "subps %%xmm4, %%xmm12\n"
9543           "movaps %%xmm1, %%xmm9\n"
9544           "movaps %%xmm1, %%xmm13\n"
9545           "addps %%xmm5, %%xmm9\n"
9546           "subps %%xmm5, %%xmm13\n"
9547           "movaps %%xmm2, %%xmm10\n"
9548           "movaps %%xmm2, %%xmm14\n"
9549           "addps %%xmm6, %%xmm10\n"
9550           "subps %%xmm6, %%xmm14\n"
9551           "movaps %%xmm3, %%xmm11\n"
9552           "movaps %%xmm3, %%xmm15\n"
9553           "addps %%xmm7, %%xmm11\n"
9554           "subps %%xmm7, %%xmm15\n"
9555           "movups %%xmm8, (%0)\n"
9556           "movups %%xmm9, (%1)\n"
9557           "movups %%xmm10, (%2)\n"
9558           "movups %%xmm11, (%3)\n"
9559           "movups %%xmm12, (%4)\n"
9560           "movups %%xmm13, (%5)\n"
9561           "movups %%xmm14, (%6)\n"
9562           "movups %%xmm15, (%7)\n"
9563           :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
9564         );
9565       }
9566     }
9567     return;
9568   }
9569   if (depth == 20) {
9570     helper_float_25_recursive(buf + 0, 17);
9571     helper_float_25_recursive(buf + 131072, 17);
9572     helper_float_25_recursive(buf + 262144, 17);
9573     helper_float_25_recursive(buf + 393216, 17);
9574     helper_float_25_recursive(buf + 524288, 17);
9575     helper_float_25_recursive(buf + 655360, 17);
9576     helper_float_25_recursive(buf + 786432, 17);
9577     helper_float_25_recursive(buf + 917504, 17);
9578     for (int j = 0; j < 1048576; j += 1048576) {
9579       for (int k = 0; k < 131072; k += 4) {
9580         __asm__ volatile (
9581           "movups (%0), %%xmm0\n"
9582           "movups (%1), %%xmm1\n"
9583           "movups (%2), %%xmm2\n"
9584           "movups (%3), %%xmm3\n"
9585           "movups (%4), %%xmm4\n"
9586           "movups (%5), %%xmm5\n"
9587           "movups (%6), %%xmm6\n"
9588           "movups (%7), %%xmm7\n"
9589           "movaps %%xmm0, %%xmm8\n"
9590           "movaps %%xmm0, %%xmm9\n"
9591           "addps %%xmm1, %%xmm8\n"
9592           "subps %%xmm1, %%xmm9\n"
9593           "movaps %%xmm2, %%xmm10\n"
9594           "movaps %%xmm2, %%xmm11\n"
9595           "addps %%xmm3, %%xmm10\n"
9596           "subps %%xmm3, %%xmm11\n"
9597           "movaps %%xmm4, %%xmm12\n"
9598           "movaps %%xmm4, %%xmm13\n"
9599           "addps %%xmm5, %%xmm12\n"
9600           "subps %%xmm5, %%xmm13\n"
9601           "movaps %%xmm6, %%xmm14\n"
9602           "movaps %%xmm6, %%xmm15\n"
9603           "addps %%xmm7, %%xmm14\n"
9604           "subps %%xmm7, %%xmm15\n"
9605           "movaps %%xmm8, %%xmm0\n"
9606           "movaps %%xmm8, %%xmm2\n"
9607           "addps %%xmm10, %%xmm0\n"
9608           "subps %%xmm10, %%xmm2\n"
9609           "movaps %%xmm9, %%xmm1\n"
9610           "movaps %%xmm9, %%xmm3\n"
9611           "addps %%xmm11, %%xmm1\n"
9612           "subps %%xmm11, %%xmm3\n"
9613           "movaps %%xmm12, %%xmm4\n"
9614           "movaps %%xmm12, %%xmm6\n"
9615           "addps %%xmm14, %%xmm4\n"
9616           "subps %%xmm14, %%xmm6\n"
9617           "movaps %%xmm13, %%xmm5\n"
9618           "movaps %%xmm13, %%xmm7\n"
9619           "addps %%xmm15, %%xmm5\n"
9620           "subps %%xmm15, %%xmm7\n"
9621           "movaps %%xmm0, %%xmm8\n"
9622           "movaps %%xmm0, %%xmm12\n"
9623           "addps %%xmm4, %%xmm8\n"
9624           "subps %%xmm4, %%xmm12\n"
9625           "movaps %%xmm1, %%xmm9\n"
9626           "movaps %%xmm1, %%xmm13\n"
9627           "addps %%xmm5, %%xmm9\n"
9628           "subps %%xmm5, %%xmm13\n"
9629           "movaps %%xmm2, %%xmm10\n"
9630           "movaps %%xmm2, %%xmm14\n"
9631           "addps %%xmm6, %%xmm10\n"
9632           "subps %%xmm6, %%xmm14\n"
9633           "movaps %%xmm3, %%xmm11\n"
9634           "movaps %%xmm3, %%xmm15\n"
9635           "addps %%xmm7, %%xmm11\n"
9636           "subps %%xmm7, %%xmm15\n"
9637           "movups %%xmm8, (%0)\n"
9638           "movups %%xmm9, (%1)\n"
9639           "movups %%xmm10, (%2)\n"
9640           "movups %%xmm11, (%3)\n"
9641           "movups %%xmm12, (%4)\n"
9642           "movups %%xmm13, (%5)\n"
9643           "movups %%xmm14, (%6)\n"
9644           "movups %%xmm15, (%7)\n"
9645           :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
9646         );
9647       }
9648     }
9649     return;
9650   }
9651   if (depth == 23) {
9652     helper_float_25_recursive(buf + 0, 20);
9653     helper_float_25_recursive(buf + 1048576, 20);
9654     helper_float_25_recursive(buf + 2097152, 20);
9655     helper_float_25_recursive(buf + 3145728, 20);
9656     helper_float_25_recursive(buf + 4194304, 20);
9657     helper_float_25_recursive(buf + 5242880, 20);
9658     helper_float_25_recursive(buf + 6291456, 20);
9659     helper_float_25_recursive(buf + 7340032, 20);
9660     for (int j = 0; j < 8388608; j += 8388608) {
9661       for (int k = 0; k < 1048576; k += 4) {
9662         __asm__ volatile (
9663           "movups (%0), %%xmm0\n"
9664           "movups (%1), %%xmm1\n"
9665           "movups (%2), %%xmm2\n"
9666           "movups (%3), %%xmm3\n"
9667           "movups (%4), %%xmm4\n"
9668           "movups (%5), %%xmm5\n"
9669           "movups (%6), %%xmm6\n"
9670           "movups (%7), %%xmm7\n"
9671           "movaps %%xmm0, %%xmm8\n"
9672           "movaps %%xmm0, %%xmm9\n"
9673           "addps %%xmm1, %%xmm8\n"
9674           "subps %%xmm1, %%xmm9\n"
9675           "movaps %%xmm2, %%xmm10\n"
9676           "movaps %%xmm2, %%xmm11\n"
9677           "addps %%xmm3, %%xmm10\n"
9678           "subps %%xmm3, %%xmm11\n"
9679           "movaps %%xmm4, %%xmm12\n"
9680           "movaps %%xmm4, %%xmm13\n"
9681           "addps %%xmm5, %%xmm12\n"
9682           "subps %%xmm5, %%xmm13\n"
9683           "movaps %%xmm6, %%xmm14\n"
9684           "movaps %%xmm6, %%xmm15\n"
9685           "addps %%xmm7, %%xmm14\n"
9686           "subps %%xmm7, %%xmm15\n"
9687           "movaps %%xmm8, %%xmm0\n"
9688           "movaps %%xmm8, %%xmm2\n"
9689           "addps %%xmm10, %%xmm0\n"
9690           "subps %%xmm10, %%xmm2\n"
9691           "movaps %%xmm9, %%xmm1\n"
9692           "movaps %%xmm9, %%xmm3\n"
9693           "addps %%xmm11, %%xmm1\n"
9694           "subps %%xmm11, %%xmm3\n"
9695           "movaps %%xmm12, %%xmm4\n"
9696           "movaps %%xmm12, %%xmm6\n"
9697           "addps %%xmm14, %%xmm4\n"
9698           "subps %%xmm14, %%xmm6\n"
9699           "movaps %%xmm13, %%xmm5\n"
9700           "movaps %%xmm13, %%xmm7\n"
9701           "addps %%xmm15, %%xmm5\n"
9702           "subps %%xmm15, %%xmm7\n"
9703           "movaps %%xmm0, %%xmm8\n"
9704           "movaps %%xmm0, %%xmm12\n"
9705           "addps %%xmm4, %%xmm8\n"
9706           "subps %%xmm4, %%xmm12\n"
9707           "movaps %%xmm1, %%xmm9\n"
9708           "movaps %%xmm1, %%xmm13\n"
9709           "addps %%xmm5, %%xmm9\n"
9710           "subps %%xmm5, %%xmm13\n"
9711           "movaps %%xmm2, %%xmm10\n"
9712           "movaps %%xmm2, %%xmm14\n"
9713           "addps %%xmm6, %%xmm10\n"
9714           "subps %%xmm6, %%xmm14\n"
9715           "movaps %%xmm3, %%xmm11\n"
9716           "movaps %%xmm3, %%xmm15\n"
9717           "addps %%xmm7, %%xmm11\n"
9718           "subps %%xmm7, %%xmm15\n"
9719           "movups %%xmm8, (%0)\n"
9720           "movups %%xmm9, (%1)\n"
9721           "movups %%xmm10, (%2)\n"
9722           "movups %%xmm11, (%3)\n"
9723           "movups %%xmm12, (%4)\n"
9724           "movups %%xmm13, (%5)\n"
9725           "movups %%xmm14, (%6)\n"
9726           "movups %%xmm15, (%7)\n"
9727           :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
9728         );
9729       }
9730     }
9731     return;
9732   }
9733   if (depth == 25) {
9734     helper_float_25_recursive(buf + 0, 23);
9735     helper_float_25_recursive(buf + 8388608, 23);
9736     helper_float_25_recursive(buf + 16777216, 23);
9737     helper_float_25_recursive(buf + 25165824, 23);
9738     for (int j = 0; j < 33554432; j += 33554432) {
9739       for (int k = 0; k < 8388608; k += 4) {
9740         __asm__ volatile (
9741           "movups (%0), %%xmm0\n"
9742           "movups (%1), %%xmm1\n"
9743           "movups (%2), %%xmm2\n"
9744           "movups (%3), %%xmm3\n"
9745           "movaps %%xmm0, %%xmm8\n"
9746           "movaps %%xmm0, %%xmm9\n"
9747           "addps %%xmm1, %%xmm8\n"
9748           "subps %%xmm1, %%xmm9\n"
9749           "movaps %%xmm2, %%xmm10\n"
9750           "movaps %%xmm2, %%xmm11\n"
9751           "addps %%xmm3, %%xmm10\n"
9752           "subps %%xmm3, %%xmm11\n"
9753           "movaps %%xmm8, %%xmm0\n"
9754           "movaps %%xmm8, %%xmm2\n"
9755           "addps %%xmm10, %%xmm0\n"
9756           "subps %%xmm10, %%xmm2\n"
9757           "movaps %%xmm9, %%xmm1\n"
9758           "movaps %%xmm9, %%xmm3\n"
9759           "addps %%xmm11, %%xmm1\n"
9760           "subps %%xmm11, %%xmm3\n"
9761           "movups %%xmm0, (%0)\n"
9762           "movups %%xmm1, (%1)\n"
9763           "movups %%xmm2, (%2)\n"
9764           "movups %%xmm3, (%3)\n"
9765           :: "r"(buf + j + k + 0), "r"(buf + j + k + 8388608), "r"(buf + j + k + 16777216), "r"(buf + j + k + 25165824) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
9766         );
9767       }
9768     }
9769     return;
9770   }
9771 }
void helper_float_25(float *buf);
/* In-place Fast Hadamard Transform over a 2^25-element float buffer.
 *
 * Thin public entry point: all work is done by the cache-blocked
 * recursive SSE kernel, invoked at the full depth for this size.
 * `buf` must hold 33554432 (= 2^25) floats; the transform is
 * performed in place.
 */
void helper_float_25(float *buf) {
  /* log2 of the transform length handled by this helper. */
  const int full_depth = 25;
  helper_float_25_recursive(buf, full_depth);
}
9776 void helper_float_26_recursive(float *buf, int depth);
helper_float_26_recursive(float * buf,int depth)9777 void helper_float_26_recursive(float *buf, int depth) {
9778   if (depth == 5) {
9779     for (int j = 0; j < 32; j += 32) {
9780       for (int k = 0; k < 4; k += 4) {
9781         __asm__ volatile (
9782           "movups (%0), %%xmm0\n"
9783           "movups (%1), %%xmm1\n"
9784           "movups (%2), %%xmm2\n"
9785           "movups (%3), %%xmm3\n"
9786           "movups (%4), %%xmm4\n"
9787           "movups (%5), %%xmm5\n"
9788           "movups (%6), %%xmm6\n"
9789           "movups (%7), %%xmm7\n"
9790           "movaps %%xmm0, %%xmm8\n"
9791           "shufps $160, %%xmm8, %%xmm8\n"
9792           "shufps $245, %%xmm0, %%xmm0\n"
9793           "xorps %%xmm9, %%xmm9\n"
9794           "subps %%xmm0, %%xmm9\n"
9795           "addsubps %%xmm9, %%xmm8\n"
9796           "movaps %%xmm8, %%xmm0\n"
9797           "movaps %%xmm1, %%xmm8\n"
9798           "shufps $160, %%xmm8, %%xmm8\n"
9799           "shufps $245, %%xmm1, %%xmm1\n"
9800           "xorps %%xmm9, %%xmm9\n"
9801           "subps %%xmm1, %%xmm9\n"
9802           "addsubps %%xmm9, %%xmm8\n"
9803           "movaps %%xmm8, %%xmm1\n"
9804           "movaps %%xmm2, %%xmm8\n"
9805           "shufps $160, %%xmm8, %%xmm8\n"
9806           "shufps $245, %%xmm2, %%xmm2\n"
9807           "xorps %%xmm9, %%xmm9\n"
9808           "subps %%xmm2, %%xmm9\n"
9809           "addsubps %%xmm9, %%xmm8\n"
9810           "movaps %%xmm8, %%xmm2\n"
9811           "movaps %%xmm3, %%xmm8\n"
9812           "shufps $160, %%xmm8, %%xmm8\n"
9813           "shufps $245, %%xmm3, %%xmm3\n"
9814           "xorps %%xmm9, %%xmm9\n"
9815           "subps %%xmm3, %%xmm9\n"
9816           "addsubps %%xmm9, %%xmm8\n"
9817           "movaps %%xmm8, %%xmm3\n"
9818           "movaps %%xmm4, %%xmm8\n"
9819           "shufps $160, %%xmm8, %%xmm8\n"
9820           "shufps $245, %%xmm4, %%xmm4\n"
9821           "xorps %%xmm9, %%xmm9\n"
9822           "subps %%xmm4, %%xmm9\n"
9823           "addsubps %%xmm9, %%xmm8\n"
9824           "movaps %%xmm8, %%xmm4\n"
9825           "movaps %%xmm5, %%xmm8\n"
9826           "shufps $160, %%xmm8, %%xmm8\n"
9827           "shufps $245, %%xmm5, %%xmm5\n"
9828           "xorps %%xmm9, %%xmm9\n"
9829           "subps %%xmm5, %%xmm9\n"
9830           "addsubps %%xmm9, %%xmm8\n"
9831           "movaps %%xmm8, %%xmm5\n"
9832           "movaps %%xmm6, %%xmm8\n"
9833           "shufps $160, %%xmm8, %%xmm8\n"
9834           "shufps $245, %%xmm6, %%xmm6\n"
9835           "xorps %%xmm9, %%xmm9\n"
9836           "subps %%xmm6, %%xmm9\n"
9837           "addsubps %%xmm9, %%xmm8\n"
9838           "movaps %%xmm8, %%xmm6\n"
9839           "movaps %%xmm7, %%xmm8\n"
9840           "shufps $160, %%xmm8, %%xmm8\n"
9841           "shufps $245, %%xmm7, %%xmm7\n"
9842           "xorps %%xmm9, %%xmm9\n"
9843           "subps %%xmm7, %%xmm9\n"
9844           "addsubps %%xmm9, %%xmm8\n"
9845           "movaps %%xmm8, %%xmm7\n"
9846           "movaps %%xmm0, %%xmm8\n"
9847           "shufps $68, %%xmm8, %%xmm8\n"
9848           "xorps %%xmm9, %%xmm9\n"
9849           "movaps %%xmm0, %%xmm10\n"
9850           "shufps $14, %%xmm9, %%xmm10\n"
9851           "movaps %%xmm0, %%xmm11\n"
9852           "shufps $224, %%xmm11, %%xmm9\n"
9853           "addps %%xmm8, %%xmm10\n"
9854           "subps %%xmm9, %%xmm10\n"
9855           "movaps %%xmm10, %%xmm0\n"
9856           "movaps %%xmm1, %%xmm8\n"
9857           "shufps $68, %%xmm8, %%xmm8\n"
9858           "xorps %%xmm9, %%xmm9\n"
9859           "movaps %%xmm1, %%xmm10\n"
9860           "shufps $14, %%xmm9, %%xmm10\n"
9861           "movaps %%xmm1, %%xmm11\n"
9862           "shufps $224, %%xmm11, %%xmm9\n"
9863           "addps %%xmm8, %%xmm10\n"
9864           "subps %%xmm9, %%xmm10\n"
9865           "movaps %%xmm10, %%xmm1\n"
9866           "movaps %%xmm2, %%xmm8\n"
9867           "shufps $68, %%xmm8, %%xmm8\n"
9868           "xorps %%xmm9, %%xmm9\n"
9869           "movaps %%xmm2, %%xmm10\n"
9870           "shufps $14, %%xmm9, %%xmm10\n"
9871           "movaps %%xmm2, %%xmm11\n"
9872           "shufps $224, %%xmm11, %%xmm9\n"
9873           "addps %%xmm8, %%xmm10\n"
9874           "subps %%xmm9, %%xmm10\n"
9875           "movaps %%xmm10, %%xmm2\n"
9876           "movaps %%xmm3, %%xmm8\n"
9877           "shufps $68, %%xmm8, %%xmm8\n"
9878           "xorps %%xmm9, %%xmm9\n"
9879           "movaps %%xmm3, %%xmm10\n"
9880           "shufps $14, %%xmm9, %%xmm10\n"
9881           "movaps %%xmm3, %%xmm11\n"
9882           "shufps $224, %%xmm11, %%xmm9\n"
9883           "addps %%xmm8, %%xmm10\n"
9884           "subps %%xmm9, %%xmm10\n"
9885           "movaps %%xmm10, %%xmm3\n"
9886           "movaps %%xmm4, %%xmm8\n"
9887           "shufps $68, %%xmm8, %%xmm8\n"
9888           "xorps %%xmm9, %%xmm9\n"
9889           "movaps %%xmm4, %%xmm10\n"
9890           "shufps $14, %%xmm9, %%xmm10\n"
9891           "movaps %%xmm4, %%xmm11\n"
9892           "shufps $224, %%xmm11, %%xmm9\n"
9893           "addps %%xmm8, %%xmm10\n"
9894           "subps %%xmm9, %%xmm10\n"
9895           "movaps %%xmm10, %%xmm4\n"
9896           "movaps %%xmm5, %%xmm8\n"
9897           "shufps $68, %%xmm8, %%xmm8\n"
9898           "xorps %%xmm9, %%xmm9\n"
9899           "movaps %%xmm5, %%xmm10\n"
9900           "shufps $14, %%xmm9, %%xmm10\n"
9901           "movaps %%xmm5, %%xmm11\n"
9902           "shufps $224, %%xmm11, %%xmm9\n"
9903           "addps %%xmm8, %%xmm10\n"
9904           "subps %%xmm9, %%xmm10\n"
9905           "movaps %%xmm10, %%xmm5\n"
9906           "movaps %%xmm6, %%xmm8\n"
9907           "shufps $68, %%xmm8, %%xmm8\n"
9908           "xorps %%xmm9, %%xmm9\n"
9909           "movaps %%xmm6, %%xmm10\n"
9910           "shufps $14, %%xmm9, %%xmm10\n"
9911           "movaps %%xmm6, %%xmm11\n"
9912           "shufps $224, %%xmm11, %%xmm9\n"
9913           "addps %%xmm8, %%xmm10\n"
9914           "subps %%xmm9, %%xmm10\n"
9915           "movaps %%xmm10, %%xmm6\n"
9916           "movaps %%xmm7, %%xmm8\n"
9917           "shufps $68, %%xmm8, %%xmm8\n"
9918           "xorps %%xmm9, %%xmm9\n"
9919           "movaps %%xmm7, %%xmm10\n"
9920           "shufps $14, %%xmm9, %%xmm10\n"
9921           "movaps %%xmm7, %%xmm11\n"
9922           "shufps $224, %%xmm11, %%xmm9\n"
9923           "addps %%xmm8, %%xmm10\n"
9924           "subps %%xmm9, %%xmm10\n"
9925           "movaps %%xmm10, %%xmm7\n"
9926           "movaps %%xmm0, %%xmm8\n"
9927           "movaps %%xmm0, %%xmm9\n"
9928           "addps %%xmm1, %%xmm8\n"
9929           "subps %%xmm1, %%xmm9\n"
9930           "movaps %%xmm2, %%xmm10\n"
9931           "movaps %%xmm2, %%xmm11\n"
9932           "addps %%xmm3, %%xmm10\n"
9933           "subps %%xmm3, %%xmm11\n"
9934           "movaps %%xmm4, %%xmm12\n"
9935           "movaps %%xmm4, %%xmm13\n"
9936           "addps %%xmm5, %%xmm12\n"
9937           "subps %%xmm5, %%xmm13\n"
9938           "movaps %%xmm6, %%xmm14\n"
9939           "movaps %%xmm6, %%xmm15\n"
9940           "addps %%xmm7, %%xmm14\n"
9941           "subps %%xmm7, %%xmm15\n"
9942           "movaps %%xmm8, %%xmm0\n"
9943           "movaps %%xmm8, %%xmm2\n"
9944           "addps %%xmm10, %%xmm0\n"
9945           "subps %%xmm10, %%xmm2\n"
9946           "movaps %%xmm9, %%xmm1\n"
9947           "movaps %%xmm9, %%xmm3\n"
9948           "addps %%xmm11, %%xmm1\n"
9949           "subps %%xmm11, %%xmm3\n"
9950           "movaps %%xmm12, %%xmm4\n"
9951           "movaps %%xmm12, %%xmm6\n"
9952           "addps %%xmm14, %%xmm4\n"
9953           "subps %%xmm14, %%xmm6\n"
9954           "movaps %%xmm13, %%xmm5\n"
9955           "movaps %%xmm13, %%xmm7\n"
9956           "addps %%xmm15, %%xmm5\n"
9957           "subps %%xmm15, %%xmm7\n"
9958           "movaps %%xmm0, %%xmm8\n"
9959           "movaps %%xmm0, %%xmm12\n"
9960           "addps %%xmm4, %%xmm8\n"
9961           "subps %%xmm4, %%xmm12\n"
9962           "movaps %%xmm1, %%xmm9\n"
9963           "movaps %%xmm1, %%xmm13\n"
9964           "addps %%xmm5, %%xmm9\n"
9965           "subps %%xmm5, %%xmm13\n"
9966           "movaps %%xmm2, %%xmm10\n"
9967           "movaps %%xmm2, %%xmm14\n"
9968           "addps %%xmm6, %%xmm10\n"
9969           "subps %%xmm6, %%xmm14\n"
9970           "movaps %%xmm3, %%xmm11\n"
9971           "movaps %%xmm3, %%xmm15\n"
9972           "addps %%xmm7, %%xmm11\n"
9973           "subps %%xmm7, %%xmm15\n"
9974           "movups %%xmm8, (%0)\n"
9975           "movups %%xmm9, (%1)\n"
9976           "movups %%xmm10, (%2)\n"
9977           "movups %%xmm11, (%3)\n"
9978           "movups %%xmm12, (%4)\n"
9979           "movups %%xmm13, (%5)\n"
9980           "movups %%xmm14, (%6)\n"
9981           "movups %%xmm15, (%7)\n"
9982           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
9983         );
9984       }
9985     }
9986     return;
9987   }
9988   if (depth == 8) {
9989     helper_float_26_recursive(buf + 0, 5);
9990     helper_float_26_recursive(buf + 32, 5);
9991     helper_float_26_recursive(buf + 64, 5);
9992     helper_float_26_recursive(buf + 96, 5);
9993     helper_float_26_recursive(buf + 128, 5);
9994     helper_float_26_recursive(buf + 160, 5);
9995     helper_float_26_recursive(buf + 192, 5);
9996     helper_float_26_recursive(buf + 224, 5);
9997     for (int j = 0; j < 256; j += 256) {
9998       for (int k = 0; k < 32; k += 4) {
9999         __asm__ volatile (
10000           "movups (%0), %%xmm0\n"
10001           "movups (%1), %%xmm1\n"
10002           "movups (%2), %%xmm2\n"
10003           "movups (%3), %%xmm3\n"
10004           "movups (%4), %%xmm4\n"
10005           "movups (%5), %%xmm5\n"
10006           "movups (%6), %%xmm6\n"
10007           "movups (%7), %%xmm7\n"
10008           "movaps %%xmm0, %%xmm8\n"
10009           "movaps %%xmm0, %%xmm9\n"
10010           "addps %%xmm1, %%xmm8\n"
10011           "subps %%xmm1, %%xmm9\n"
10012           "movaps %%xmm2, %%xmm10\n"
10013           "movaps %%xmm2, %%xmm11\n"
10014           "addps %%xmm3, %%xmm10\n"
10015           "subps %%xmm3, %%xmm11\n"
10016           "movaps %%xmm4, %%xmm12\n"
10017           "movaps %%xmm4, %%xmm13\n"
10018           "addps %%xmm5, %%xmm12\n"
10019           "subps %%xmm5, %%xmm13\n"
10020           "movaps %%xmm6, %%xmm14\n"
10021           "movaps %%xmm6, %%xmm15\n"
10022           "addps %%xmm7, %%xmm14\n"
10023           "subps %%xmm7, %%xmm15\n"
10024           "movaps %%xmm8, %%xmm0\n"
10025           "movaps %%xmm8, %%xmm2\n"
10026           "addps %%xmm10, %%xmm0\n"
10027           "subps %%xmm10, %%xmm2\n"
10028           "movaps %%xmm9, %%xmm1\n"
10029           "movaps %%xmm9, %%xmm3\n"
10030           "addps %%xmm11, %%xmm1\n"
10031           "subps %%xmm11, %%xmm3\n"
10032           "movaps %%xmm12, %%xmm4\n"
10033           "movaps %%xmm12, %%xmm6\n"
10034           "addps %%xmm14, %%xmm4\n"
10035           "subps %%xmm14, %%xmm6\n"
10036           "movaps %%xmm13, %%xmm5\n"
10037           "movaps %%xmm13, %%xmm7\n"
10038           "addps %%xmm15, %%xmm5\n"
10039           "subps %%xmm15, %%xmm7\n"
10040           "movaps %%xmm0, %%xmm8\n"
10041           "movaps %%xmm0, %%xmm12\n"
10042           "addps %%xmm4, %%xmm8\n"
10043           "subps %%xmm4, %%xmm12\n"
10044           "movaps %%xmm1, %%xmm9\n"
10045           "movaps %%xmm1, %%xmm13\n"
10046           "addps %%xmm5, %%xmm9\n"
10047           "subps %%xmm5, %%xmm13\n"
10048           "movaps %%xmm2, %%xmm10\n"
10049           "movaps %%xmm2, %%xmm14\n"
10050           "addps %%xmm6, %%xmm10\n"
10051           "subps %%xmm6, %%xmm14\n"
10052           "movaps %%xmm3, %%xmm11\n"
10053           "movaps %%xmm3, %%xmm15\n"
10054           "addps %%xmm7, %%xmm11\n"
10055           "subps %%xmm7, %%xmm15\n"
10056           "movups %%xmm8, (%0)\n"
10057           "movups %%xmm9, (%1)\n"
10058           "movups %%xmm10, (%2)\n"
10059           "movups %%xmm11, (%3)\n"
10060           "movups %%xmm12, (%4)\n"
10061           "movups %%xmm13, (%5)\n"
10062           "movups %%xmm14, (%6)\n"
10063           "movups %%xmm15, (%7)\n"
10064           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
10065         );
10066       }
10067     }
10068     return;
10069   }
10070   if (depth == 11) {
10071     helper_float_26_recursive(buf + 0, 8);
10072     helper_float_26_recursive(buf + 256, 8);
10073     helper_float_26_recursive(buf + 512, 8);
10074     helper_float_26_recursive(buf + 768, 8);
10075     helper_float_26_recursive(buf + 1024, 8);
10076     helper_float_26_recursive(buf + 1280, 8);
10077     helper_float_26_recursive(buf + 1536, 8);
10078     helper_float_26_recursive(buf + 1792, 8);
10079     for (int j = 0; j < 2048; j += 2048) {
10080       for (int k = 0; k < 256; k += 4) {
10081         __asm__ volatile (
10082           "movups (%0), %%xmm0\n"
10083           "movups (%1), %%xmm1\n"
10084           "movups (%2), %%xmm2\n"
10085           "movups (%3), %%xmm3\n"
10086           "movups (%4), %%xmm4\n"
10087           "movups (%5), %%xmm5\n"
10088           "movups (%6), %%xmm6\n"
10089           "movups (%7), %%xmm7\n"
10090           "movaps %%xmm0, %%xmm8\n"
10091           "movaps %%xmm0, %%xmm9\n"
10092           "addps %%xmm1, %%xmm8\n"
10093           "subps %%xmm1, %%xmm9\n"
10094           "movaps %%xmm2, %%xmm10\n"
10095           "movaps %%xmm2, %%xmm11\n"
10096           "addps %%xmm3, %%xmm10\n"
10097           "subps %%xmm3, %%xmm11\n"
10098           "movaps %%xmm4, %%xmm12\n"
10099           "movaps %%xmm4, %%xmm13\n"
10100           "addps %%xmm5, %%xmm12\n"
10101           "subps %%xmm5, %%xmm13\n"
10102           "movaps %%xmm6, %%xmm14\n"
10103           "movaps %%xmm6, %%xmm15\n"
10104           "addps %%xmm7, %%xmm14\n"
10105           "subps %%xmm7, %%xmm15\n"
10106           "movaps %%xmm8, %%xmm0\n"
10107           "movaps %%xmm8, %%xmm2\n"
10108           "addps %%xmm10, %%xmm0\n"
10109           "subps %%xmm10, %%xmm2\n"
10110           "movaps %%xmm9, %%xmm1\n"
10111           "movaps %%xmm9, %%xmm3\n"
10112           "addps %%xmm11, %%xmm1\n"
10113           "subps %%xmm11, %%xmm3\n"
10114           "movaps %%xmm12, %%xmm4\n"
10115           "movaps %%xmm12, %%xmm6\n"
10116           "addps %%xmm14, %%xmm4\n"
10117           "subps %%xmm14, %%xmm6\n"
10118           "movaps %%xmm13, %%xmm5\n"
10119           "movaps %%xmm13, %%xmm7\n"
10120           "addps %%xmm15, %%xmm5\n"
10121           "subps %%xmm15, %%xmm7\n"
10122           "movaps %%xmm0, %%xmm8\n"
10123           "movaps %%xmm0, %%xmm12\n"
10124           "addps %%xmm4, %%xmm8\n"
10125           "subps %%xmm4, %%xmm12\n"
10126           "movaps %%xmm1, %%xmm9\n"
10127           "movaps %%xmm1, %%xmm13\n"
10128           "addps %%xmm5, %%xmm9\n"
10129           "subps %%xmm5, %%xmm13\n"
10130           "movaps %%xmm2, %%xmm10\n"
10131           "movaps %%xmm2, %%xmm14\n"
10132           "addps %%xmm6, %%xmm10\n"
10133           "subps %%xmm6, %%xmm14\n"
10134           "movaps %%xmm3, %%xmm11\n"
10135           "movaps %%xmm3, %%xmm15\n"
10136           "addps %%xmm7, %%xmm11\n"
10137           "subps %%xmm7, %%xmm15\n"
10138           "movups %%xmm8, (%0)\n"
10139           "movups %%xmm9, (%1)\n"
10140           "movups %%xmm10, (%2)\n"
10141           "movups %%xmm11, (%3)\n"
10142           "movups %%xmm12, (%4)\n"
10143           "movups %%xmm13, (%5)\n"
10144           "movups %%xmm14, (%6)\n"
10145           "movups %%xmm15, (%7)\n"
10146           :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
10147         );
10148       }
10149     }
10150     return;
10151   }
10152   if (depth == 14) {
10153     helper_float_26_recursive(buf + 0, 11);
10154     helper_float_26_recursive(buf + 2048, 11);
10155     helper_float_26_recursive(buf + 4096, 11);
10156     helper_float_26_recursive(buf + 6144, 11);
10157     helper_float_26_recursive(buf + 8192, 11);
10158     helper_float_26_recursive(buf + 10240, 11);
10159     helper_float_26_recursive(buf + 12288, 11);
10160     helper_float_26_recursive(buf + 14336, 11);
10161     for (int j = 0; j < 16384; j += 16384) {
10162       for (int k = 0; k < 2048; k += 4) {
10163         __asm__ volatile (
10164           "movups (%0), %%xmm0\n"
10165           "movups (%1), %%xmm1\n"
10166           "movups (%2), %%xmm2\n"
10167           "movups (%3), %%xmm3\n"
10168           "movups (%4), %%xmm4\n"
10169           "movups (%5), %%xmm5\n"
10170           "movups (%6), %%xmm6\n"
10171           "movups (%7), %%xmm7\n"
10172           "movaps %%xmm0, %%xmm8\n"
10173           "movaps %%xmm0, %%xmm9\n"
10174           "addps %%xmm1, %%xmm8\n"
10175           "subps %%xmm1, %%xmm9\n"
10176           "movaps %%xmm2, %%xmm10\n"
10177           "movaps %%xmm2, %%xmm11\n"
10178           "addps %%xmm3, %%xmm10\n"
10179           "subps %%xmm3, %%xmm11\n"
10180           "movaps %%xmm4, %%xmm12\n"
10181           "movaps %%xmm4, %%xmm13\n"
10182           "addps %%xmm5, %%xmm12\n"
10183           "subps %%xmm5, %%xmm13\n"
10184           "movaps %%xmm6, %%xmm14\n"
10185           "movaps %%xmm6, %%xmm15\n"
10186           "addps %%xmm7, %%xmm14\n"
10187           "subps %%xmm7, %%xmm15\n"
10188           "movaps %%xmm8, %%xmm0\n"
10189           "movaps %%xmm8, %%xmm2\n"
10190           "addps %%xmm10, %%xmm0\n"
10191           "subps %%xmm10, %%xmm2\n"
10192           "movaps %%xmm9, %%xmm1\n"
10193           "movaps %%xmm9, %%xmm3\n"
10194           "addps %%xmm11, %%xmm1\n"
10195           "subps %%xmm11, %%xmm3\n"
10196           "movaps %%xmm12, %%xmm4\n"
10197           "movaps %%xmm12, %%xmm6\n"
10198           "addps %%xmm14, %%xmm4\n"
10199           "subps %%xmm14, %%xmm6\n"
10200           "movaps %%xmm13, %%xmm5\n"
10201           "movaps %%xmm13, %%xmm7\n"
10202           "addps %%xmm15, %%xmm5\n"
10203           "subps %%xmm15, %%xmm7\n"
10204           "movaps %%xmm0, %%xmm8\n"
10205           "movaps %%xmm0, %%xmm12\n"
10206           "addps %%xmm4, %%xmm8\n"
10207           "subps %%xmm4, %%xmm12\n"
10208           "movaps %%xmm1, %%xmm9\n"
10209           "movaps %%xmm1, %%xmm13\n"
10210           "addps %%xmm5, %%xmm9\n"
10211           "subps %%xmm5, %%xmm13\n"
10212           "movaps %%xmm2, %%xmm10\n"
10213           "movaps %%xmm2, %%xmm14\n"
10214           "addps %%xmm6, %%xmm10\n"
10215           "subps %%xmm6, %%xmm14\n"
10216           "movaps %%xmm3, %%xmm11\n"
10217           "movaps %%xmm3, %%xmm15\n"
10218           "addps %%xmm7, %%xmm11\n"
10219           "subps %%xmm7, %%xmm15\n"
10220           "movups %%xmm8, (%0)\n"
10221           "movups %%xmm9, (%1)\n"
10222           "movups %%xmm10, (%2)\n"
10223           "movups %%xmm11, (%3)\n"
10224           "movups %%xmm12, (%4)\n"
10225           "movups %%xmm13, (%5)\n"
10226           "movups %%xmm14, (%6)\n"
10227           "movups %%xmm15, (%7)\n"
10228           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
10229         );
10230       }
10231     }
10232     return;
10233   }
10234   if (depth == 17) {
10235     helper_float_26_recursive(buf + 0, 14);
10236     helper_float_26_recursive(buf + 16384, 14);
10237     helper_float_26_recursive(buf + 32768, 14);
10238     helper_float_26_recursive(buf + 49152, 14);
10239     helper_float_26_recursive(buf + 65536, 14);
10240     helper_float_26_recursive(buf + 81920, 14);
10241     helper_float_26_recursive(buf + 98304, 14);
10242     helper_float_26_recursive(buf + 114688, 14);
10243     for (int j = 0; j < 131072; j += 131072) {
10244       for (int k = 0; k < 16384; k += 4) {
10245         __asm__ volatile (
10246           "movups (%0), %%xmm0\n"
10247           "movups (%1), %%xmm1\n"
10248           "movups (%2), %%xmm2\n"
10249           "movups (%3), %%xmm3\n"
10250           "movups (%4), %%xmm4\n"
10251           "movups (%5), %%xmm5\n"
10252           "movups (%6), %%xmm6\n"
10253           "movups (%7), %%xmm7\n"
10254           "movaps %%xmm0, %%xmm8\n"
10255           "movaps %%xmm0, %%xmm9\n"
10256           "addps %%xmm1, %%xmm8\n"
10257           "subps %%xmm1, %%xmm9\n"
10258           "movaps %%xmm2, %%xmm10\n"
10259           "movaps %%xmm2, %%xmm11\n"
10260           "addps %%xmm3, %%xmm10\n"
10261           "subps %%xmm3, %%xmm11\n"
10262           "movaps %%xmm4, %%xmm12\n"
10263           "movaps %%xmm4, %%xmm13\n"
10264           "addps %%xmm5, %%xmm12\n"
10265           "subps %%xmm5, %%xmm13\n"
10266           "movaps %%xmm6, %%xmm14\n"
10267           "movaps %%xmm6, %%xmm15\n"
10268           "addps %%xmm7, %%xmm14\n"
10269           "subps %%xmm7, %%xmm15\n"
10270           "movaps %%xmm8, %%xmm0\n"
10271           "movaps %%xmm8, %%xmm2\n"
10272           "addps %%xmm10, %%xmm0\n"
10273           "subps %%xmm10, %%xmm2\n"
10274           "movaps %%xmm9, %%xmm1\n"
10275           "movaps %%xmm9, %%xmm3\n"
10276           "addps %%xmm11, %%xmm1\n"
10277           "subps %%xmm11, %%xmm3\n"
10278           "movaps %%xmm12, %%xmm4\n"
10279           "movaps %%xmm12, %%xmm6\n"
10280           "addps %%xmm14, %%xmm4\n"
10281           "subps %%xmm14, %%xmm6\n"
10282           "movaps %%xmm13, %%xmm5\n"
10283           "movaps %%xmm13, %%xmm7\n"
10284           "addps %%xmm15, %%xmm5\n"
10285           "subps %%xmm15, %%xmm7\n"
10286           "movaps %%xmm0, %%xmm8\n"
10287           "movaps %%xmm0, %%xmm12\n"
10288           "addps %%xmm4, %%xmm8\n"
10289           "subps %%xmm4, %%xmm12\n"
10290           "movaps %%xmm1, %%xmm9\n"
10291           "movaps %%xmm1, %%xmm13\n"
10292           "addps %%xmm5, %%xmm9\n"
10293           "subps %%xmm5, %%xmm13\n"
10294           "movaps %%xmm2, %%xmm10\n"
10295           "movaps %%xmm2, %%xmm14\n"
10296           "addps %%xmm6, %%xmm10\n"
10297           "subps %%xmm6, %%xmm14\n"
10298           "movaps %%xmm3, %%xmm11\n"
10299           "movaps %%xmm3, %%xmm15\n"
10300           "addps %%xmm7, %%xmm11\n"
10301           "subps %%xmm7, %%xmm15\n"
10302           "movups %%xmm8, (%0)\n"
10303           "movups %%xmm9, (%1)\n"
10304           "movups %%xmm10, (%2)\n"
10305           "movups %%xmm11, (%3)\n"
10306           "movups %%xmm12, (%4)\n"
10307           "movups %%xmm13, (%5)\n"
10308           "movups %%xmm14, (%6)\n"
10309           "movups %%xmm15, (%7)\n"
10310           :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
10311         );
10312       }
10313     }
10314     return;
10315   }
10316   if (depth == 20) {
10317     helper_float_26_recursive(buf + 0, 17);
10318     helper_float_26_recursive(buf + 131072, 17);
10319     helper_float_26_recursive(buf + 262144, 17);
10320     helper_float_26_recursive(buf + 393216, 17);
10321     helper_float_26_recursive(buf + 524288, 17);
10322     helper_float_26_recursive(buf + 655360, 17);
10323     helper_float_26_recursive(buf + 786432, 17);
10324     helper_float_26_recursive(buf + 917504, 17);
10325     for (int j = 0; j < 1048576; j += 1048576) {
10326       for (int k = 0; k < 131072; k += 4) {
10327         __asm__ volatile (
10328           "movups (%0), %%xmm0\n"
10329           "movups (%1), %%xmm1\n"
10330           "movups (%2), %%xmm2\n"
10331           "movups (%3), %%xmm3\n"
10332           "movups (%4), %%xmm4\n"
10333           "movups (%5), %%xmm5\n"
10334           "movups (%6), %%xmm6\n"
10335           "movups (%7), %%xmm7\n"
10336           "movaps %%xmm0, %%xmm8\n"
10337           "movaps %%xmm0, %%xmm9\n"
10338           "addps %%xmm1, %%xmm8\n"
10339           "subps %%xmm1, %%xmm9\n"
10340           "movaps %%xmm2, %%xmm10\n"
10341           "movaps %%xmm2, %%xmm11\n"
10342           "addps %%xmm3, %%xmm10\n"
10343           "subps %%xmm3, %%xmm11\n"
10344           "movaps %%xmm4, %%xmm12\n"
10345           "movaps %%xmm4, %%xmm13\n"
10346           "addps %%xmm5, %%xmm12\n"
10347           "subps %%xmm5, %%xmm13\n"
10348           "movaps %%xmm6, %%xmm14\n"
10349           "movaps %%xmm6, %%xmm15\n"
10350           "addps %%xmm7, %%xmm14\n"
10351           "subps %%xmm7, %%xmm15\n"
10352           "movaps %%xmm8, %%xmm0\n"
10353           "movaps %%xmm8, %%xmm2\n"
10354           "addps %%xmm10, %%xmm0\n"
10355           "subps %%xmm10, %%xmm2\n"
10356           "movaps %%xmm9, %%xmm1\n"
10357           "movaps %%xmm9, %%xmm3\n"
10358           "addps %%xmm11, %%xmm1\n"
10359           "subps %%xmm11, %%xmm3\n"
10360           "movaps %%xmm12, %%xmm4\n"
10361           "movaps %%xmm12, %%xmm6\n"
10362           "addps %%xmm14, %%xmm4\n"
10363           "subps %%xmm14, %%xmm6\n"
10364           "movaps %%xmm13, %%xmm5\n"
10365           "movaps %%xmm13, %%xmm7\n"
10366           "addps %%xmm15, %%xmm5\n"
10367           "subps %%xmm15, %%xmm7\n"
10368           "movaps %%xmm0, %%xmm8\n"
10369           "movaps %%xmm0, %%xmm12\n"
10370           "addps %%xmm4, %%xmm8\n"
10371           "subps %%xmm4, %%xmm12\n"
10372           "movaps %%xmm1, %%xmm9\n"
10373           "movaps %%xmm1, %%xmm13\n"
10374           "addps %%xmm5, %%xmm9\n"
10375           "subps %%xmm5, %%xmm13\n"
10376           "movaps %%xmm2, %%xmm10\n"
10377           "movaps %%xmm2, %%xmm14\n"
10378           "addps %%xmm6, %%xmm10\n"
10379           "subps %%xmm6, %%xmm14\n"
10380           "movaps %%xmm3, %%xmm11\n"
10381           "movaps %%xmm3, %%xmm15\n"
10382           "addps %%xmm7, %%xmm11\n"
10383           "subps %%xmm7, %%xmm15\n"
10384           "movups %%xmm8, (%0)\n"
10385           "movups %%xmm9, (%1)\n"
10386           "movups %%xmm10, (%2)\n"
10387           "movups %%xmm11, (%3)\n"
10388           "movups %%xmm12, (%4)\n"
10389           "movups %%xmm13, (%5)\n"
10390           "movups %%xmm14, (%6)\n"
10391           "movups %%xmm15, (%7)\n"
10392           :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
10393         );
10394       }
10395     }
10396     return;
10397   }
10398   if (depth == 23) {
10399     helper_float_26_recursive(buf + 0, 20);
10400     helper_float_26_recursive(buf + 1048576, 20);
10401     helper_float_26_recursive(buf + 2097152, 20);
10402     helper_float_26_recursive(buf + 3145728, 20);
10403     helper_float_26_recursive(buf + 4194304, 20);
10404     helper_float_26_recursive(buf + 5242880, 20);
10405     helper_float_26_recursive(buf + 6291456, 20);
10406     helper_float_26_recursive(buf + 7340032, 20);
10407     for (int j = 0; j < 8388608; j += 8388608) {
10408       for (int k = 0; k < 1048576; k += 4) {
10409         __asm__ volatile (
10410           "movups (%0), %%xmm0\n"
10411           "movups (%1), %%xmm1\n"
10412           "movups (%2), %%xmm2\n"
10413           "movups (%3), %%xmm3\n"
10414           "movups (%4), %%xmm4\n"
10415           "movups (%5), %%xmm5\n"
10416           "movups (%6), %%xmm6\n"
10417           "movups (%7), %%xmm7\n"
10418           "movaps %%xmm0, %%xmm8\n"
10419           "movaps %%xmm0, %%xmm9\n"
10420           "addps %%xmm1, %%xmm8\n"
10421           "subps %%xmm1, %%xmm9\n"
10422           "movaps %%xmm2, %%xmm10\n"
10423           "movaps %%xmm2, %%xmm11\n"
10424           "addps %%xmm3, %%xmm10\n"
10425           "subps %%xmm3, %%xmm11\n"
10426           "movaps %%xmm4, %%xmm12\n"
10427           "movaps %%xmm4, %%xmm13\n"
10428           "addps %%xmm5, %%xmm12\n"
10429           "subps %%xmm5, %%xmm13\n"
10430           "movaps %%xmm6, %%xmm14\n"
10431           "movaps %%xmm6, %%xmm15\n"
10432           "addps %%xmm7, %%xmm14\n"
10433           "subps %%xmm7, %%xmm15\n"
10434           "movaps %%xmm8, %%xmm0\n"
10435           "movaps %%xmm8, %%xmm2\n"
10436           "addps %%xmm10, %%xmm0\n"
10437           "subps %%xmm10, %%xmm2\n"
10438           "movaps %%xmm9, %%xmm1\n"
10439           "movaps %%xmm9, %%xmm3\n"
10440           "addps %%xmm11, %%xmm1\n"
10441           "subps %%xmm11, %%xmm3\n"
10442           "movaps %%xmm12, %%xmm4\n"
10443           "movaps %%xmm12, %%xmm6\n"
10444           "addps %%xmm14, %%xmm4\n"
10445           "subps %%xmm14, %%xmm6\n"
10446           "movaps %%xmm13, %%xmm5\n"
10447           "movaps %%xmm13, %%xmm7\n"
10448           "addps %%xmm15, %%xmm5\n"
10449           "subps %%xmm15, %%xmm7\n"
10450           "movaps %%xmm0, %%xmm8\n"
10451           "movaps %%xmm0, %%xmm12\n"
10452           "addps %%xmm4, %%xmm8\n"
10453           "subps %%xmm4, %%xmm12\n"
10454           "movaps %%xmm1, %%xmm9\n"
10455           "movaps %%xmm1, %%xmm13\n"
10456           "addps %%xmm5, %%xmm9\n"
10457           "subps %%xmm5, %%xmm13\n"
10458           "movaps %%xmm2, %%xmm10\n"
10459           "movaps %%xmm2, %%xmm14\n"
10460           "addps %%xmm6, %%xmm10\n"
10461           "subps %%xmm6, %%xmm14\n"
10462           "movaps %%xmm3, %%xmm11\n"
10463           "movaps %%xmm3, %%xmm15\n"
10464           "addps %%xmm7, %%xmm11\n"
10465           "subps %%xmm7, %%xmm15\n"
10466           "movups %%xmm8, (%0)\n"
10467           "movups %%xmm9, (%1)\n"
10468           "movups %%xmm10, (%2)\n"
10469           "movups %%xmm11, (%3)\n"
10470           "movups %%xmm12, (%4)\n"
10471           "movups %%xmm13, (%5)\n"
10472           "movups %%xmm14, (%6)\n"
10473           "movups %%xmm15, (%7)\n"
10474           :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
10475         );
10476       }
10477     }
10478     return;
10479   }
10480   if (depth == 26) {
10481     helper_float_26_recursive(buf + 0, 23);
10482     helper_float_26_recursive(buf + 8388608, 23);
10483     helper_float_26_recursive(buf + 16777216, 23);
10484     helper_float_26_recursive(buf + 25165824, 23);
10485     helper_float_26_recursive(buf + 33554432, 23);
10486     helper_float_26_recursive(buf + 41943040, 23);
10487     helper_float_26_recursive(buf + 50331648, 23);
10488     helper_float_26_recursive(buf + 58720256, 23);
10489     for (int j = 0; j < 67108864; j += 67108864) {
10490       for (int k = 0; k < 8388608; k += 4) {
10491         __asm__ volatile (
10492           "movups (%0), %%xmm0\n"
10493           "movups (%1), %%xmm1\n"
10494           "movups (%2), %%xmm2\n"
10495           "movups (%3), %%xmm3\n"
10496           "movups (%4), %%xmm4\n"
10497           "movups (%5), %%xmm5\n"
10498           "movups (%6), %%xmm6\n"
10499           "movups (%7), %%xmm7\n"
10500           "movaps %%xmm0, %%xmm8\n"
10501           "movaps %%xmm0, %%xmm9\n"
10502           "addps %%xmm1, %%xmm8\n"
10503           "subps %%xmm1, %%xmm9\n"
10504           "movaps %%xmm2, %%xmm10\n"
10505           "movaps %%xmm2, %%xmm11\n"
10506           "addps %%xmm3, %%xmm10\n"
10507           "subps %%xmm3, %%xmm11\n"
10508           "movaps %%xmm4, %%xmm12\n"
10509           "movaps %%xmm4, %%xmm13\n"
10510           "addps %%xmm5, %%xmm12\n"
10511           "subps %%xmm5, %%xmm13\n"
10512           "movaps %%xmm6, %%xmm14\n"
10513           "movaps %%xmm6, %%xmm15\n"
10514           "addps %%xmm7, %%xmm14\n"
10515           "subps %%xmm7, %%xmm15\n"
10516           "movaps %%xmm8, %%xmm0\n"
10517           "movaps %%xmm8, %%xmm2\n"
10518           "addps %%xmm10, %%xmm0\n"
10519           "subps %%xmm10, %%xmm2\n"
10520           "movaps %%xmm9, %%xmm1\n"
10521           "movaps %%xmm9, %%xmm3\n"
10522           "addps %%xmm11, %%xmm1\n"
10523           "subps %%xmm11, %%xmm3\n"
10524           "movaps %%xmm12, %%xmm4\n"
10525           "movaps %%xmm12, %%xmm6\n"
10526           "addps %%xmm14, %%xmm4\n"
10527           "subps %%xmm14, %%xmm6\n"
10528           "movaps %%xmm13, %%xmm5\n"
10529           "movaps %%xmm13, %%xmm7\n"
10530           "addps %%xmm15, %%xmm5\n"
10531           "subps %%xmm15, %%xmm7\n"
10532           "movaps %%xmm0, %%xmm8\n"
10533           "movaps %%xmm0, %%xmm12\n"
10534           "addps %%xmm4, %%xmm8\n"
10535           "subps %%xmm4, %%xmm12\n"
10536           "movaps %%xmm1, %%xmm9\n"
10537           "movaps %%xmm1, %%xmm13\n"
10538           "addps %%xmm5, %%xmm9\n"
10539           "subps %%xmm5, %%xmm13\n"
10540           "movaps %%xmm2, %%xmm10\n"
10541           "movaps %%xmm2, %%xmm14\n"
10542           "addps %%xmm6, %%xmm10\n"
10543           "subps %%xmm6, %%xmm14\n"
10544           "movaps %%xmm3, %%xmm11\n"
10545           "movaps %%xmm3, %%xmm15\n"
10546           "addps %%xmm7, %%xmm11\n"
10547           "subps %%xmm7, %%xmm15\n"
10548           "movups %%xmm8, (%0)\n"
10549           "movups %%xmm9, (%1)\n"
10550           "movups %%xmm10, (%2)\n"
10551           "movups %%xmm11, (%3)\n"
10552           "movups %%xmm12, (%4)\n"
10553           "movups %%xmm13, (%5)\n"
10554           "movups %%xmm14, (%6)\n"
10555           "movups %%xmm15, (%7)\n"
10556           :: "r"(buf + j + k + 0), "r"(buf + j + k + 8388608), "r"(buf + j + k + 16777216), "r"(buf + j + k + 25165824), "r"(buf + j + k + 33554432), "r"(buf + j + k + 41943040), "r"(buf + j + k + 50331648), "r"(buf + j + k + 58720256) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
10557         );
10558       }
10559     }
10560     return;
10561   }
10562 }
void helper_float_26(float *buf);
/*
 * Entry point for the size-2^26 (67,108,864-element) in-place transform.
 * Delegates to helper_float_26_recursive with the full depth of 26; the
 * recursion bottoms out in the SSE butterfly kernels defined above and
 * overwrites buf with the (unnormalized) transformed values.
 *
 * buf: pointer to 2^26 floats; caller owns the storage. The loads/stores in
 * the asm kernels use movups, so no particular alignment is required,
 * though aligned buffers are presumably faster — NOTE(review): confirm
 * against the FFHT API contract.
 */
void helper_float_26(float *buf) {
  helper_float_26_recursive(buf, 26);
}
10567 void helper_float_27_recursive(float *buf, int depth);
helper_float_27_recursive(float * buf,int depth)10568 void helper_float_27_recursive(float *buf, int depth) {
10569   if (depth == 12) {
10570     for (int j = 0; j < 4096; j += 32) {
10571       for (int k = 0; k < 4; k += 4) {
10572         __asm__ volatile (
10573           "movups (%0), %%xmm0\n"
10574           "movups (%1), %%xmm1\n"
10575           "movups (%2), %%xmm2\n"
10576           "movups (%3), %%xmm3\n"
10577           "movups (%4), %%xmm4\n"
10578           "movups (%5), %%xmm5\n"
10579           "movups (%6), %%xmm6\n"
10580           "movups (%7), %%xmm7\n"
10581           "movaps %%xmm0, %%xmm8\n"
10582           "shufps $160, %%xmm8, %%xmm8\n"
10583           "shufps $245, %%xmm0, %%xmm0\n"
10584           "xorps %%xmm9, %%xmm9\n"
10585           "subps %%xmm0, %%xmm9\n"
10586           "addsubps %%xmm9, %%xmm8\n"
10587           "movaps %%xmm8, %%xmm0\n"
10588           "movaps %%xmm1, %%xmm8\n"
10589           "shufps $160, %%xmm8, %%xmm8\n"
10590           "shufps $245, %%xmm1, %%xmm1\n"
10591           "xorps %%xmm9, %%xmm9\n"
10592           "subps %%xmm1, %%xmm9\n"
10593           "addsubps %%xmm9, %%xmm8\n"
10594           "movaps %%xmm8, %%xmm1\n"
10595           "movaps %%xmm2, %%xmm8\n"
10596           "shufps $160, %%xmm8, %%xmm8\n"
10597           "shufps $245, %%xmm2, %%xmm2\n"
10598           "xorps %%xmm9, %%xmm9\n"
10599           "subps %%xmm2, %%xmm9\n"
10600           "addsubps %%xmm9, %%xmm8\n"
10601           "movaps %%xmm8, %%xmm2\n"
10602           "movaps %%xmm3, %%xmm8\n"
10603           "shufps $160, %%xmm8, %%xmm8\n"
10604           "shufps $245, %%xmm3, %%xmm3\n"
10605           "xorps %%xmm9, %%xmm9\n"
10606           "subps %%xmm3, %%xmm9\n"
10607           "addsubps %%xmm9, %%xmm8\n"
10608           "movaps %%xmm8, %%xmm3\n"
10609           "movaps %%xmm4, %%xmm8\n"
10610           "shufps $160, %%xmm8, %%xmm8\n"
10611           "shufps $245, %%xmm4, %%xmm4\n"
10612           "xorps %%xmm9, %%xmm9\n"
10613           "subps %%xmm4, %%xmm9\n"
10614           "addsubps %%xmm9, %%xmm8\n"
10615           "movaps %%xmm8, %%xmm4\n"
10616           "movaps %%xmm5, %%xmm8\n"
10617           "shufps $160, %%xmm8, %%xmm8\n"
10618           "shufps $245, %%xmm5, %%xmm5\n"
10619           "xorps %%xmm9, %%xmm9\n"
10620           "subps %%xmm5, %%xmm9\n"
10621           "addsubps %%xmm9, %%xmm8\n"
10622           "movaps %%xmm8, %%xmm5\n"
10623           "movaps %%xmm6, %%xmm8\n"
10624           "shufps $160, %%xmm8, %%xmm8\n"
10625           "shufps $245, %%xmm6, %%xmm6\n"
10626           "xorps %%xmm9, %%xmm9\n"
10627           "subps %%xmm6, %%xmm9\n"
10628           "addsubps %%xmm9, %%xmm8\n"
10629           "movaps %%xmm8, %%xmm6\n"
10630           "movaps %%xmm7, %%xmm8\n"
10631           "shufps $160, %%xmm8, %%xmm8\n"
10632           "shufps $245, %%xmm7, %%xmm7\n"
10633           "xorps %%xmm9, %%xmm9\n"
10634           "subps %%xmm7, %%xmm9\n"
10635           "addsubps %%xmm9, %%xmm8\n"
10636           "movaps %%xmm8, %%xmm7\n"
10637           "movaps %%xmm0, %%xmm8\n"
10638           "shufps $68, %%xmm8, %%xmm8\n"
10639           "xorps %%xmm9, %%xmm9\n"
10640           "movaps %%xmm0, %%xmm10\n"
10641           "shufps $14, %%xmm9, %%xmm10\n"
10642           "movaps %%xmm0, %%xmm11\n"
10643           "shufps $224, %%xmm11, %%xmm9\n"
10644           "addps %%xmm8, %%xmm10\n"
10645           "subps %%xmm9, %%xmm10\n"
10646           "movaps %%xmm10, %%xmm0\n"
10647           "movaps %%xmm1, %%xmm8\n"
10648           "shufps $68, %%xmm8, %%xmm8\n"
10649           "xorps %%xmm9, %%xmm9\n"
10650           "movaps %%xmm1, %%xmm10\n"
10651           "shufps $14, %%xmm9, %%xmm10\n"
10652           "movaps %%xmm1, %%xmm11\n"
10653           "shufps $224, %%xmm11, %%xmm9\n"
10654           "addps %%xmm8, %%xmm10\n"
10655           "subps %%xmm9, %%xmm10\n"
10656           "movaps %%xmm10, %%xmm1\n"
10657           "movaps %%xmm2, %%xmm8\n"
10658           "shufps $68, %%xmm8, %%xmm8\n"
10659           "xorps %%xmm9, %%xmm9\n"
10660           "movaps %%xmm2, %%xmm10\n"
10661           "shufps $14, %%xmm9, %%xmm10\n"
10662           "movaps %%xmm2, %%xmm11\n"
10663           "shufps $224, %%xmm11, %%xmm9\n"
10664           "addps %%xmm8, %%xmm10\n"
10665           "subps %%xmm9, %%xmm10\n"
10666           "movaps %%xmm10, %%xmm2\n"
10667           "movaps %%xmm3, %%xmm8\n"
10668           "shufps $68, %%xmm8, %%xmm8\n"
10669           "xorps %%xmm9, %%xmm9\n"
10670           "movaps %%xmm3, %%xmm10\n"
10671           "shufps $14, %%xmm9, %%xmm10\n"
10672           "movaps %%xmm3, %%xmm11\n"
10673           "shufps $224, %%xmm11, %%xmm9\n"
10674           "addps %%xmm8, %%xmm10\n"
10675           "subps %%xmm9, %%xmm10\n"
10676           "movaps %%xmm10, %%xmm3\n"
10677           "movaps %%xmm4, %%xmm8\n"
10678           "shufps $68, %%xmm8, %%xmm8\n"
10679           "xorps %%xmm9, %%xmm9\n"
10680           "movaps %%xmm4, %%xmm10\n"
10681           "shufps $14, %%xmm9, %%xmm10\n"
10682           "movaps %%xmm4, %%xmm11\n"
10683           "shufps $224, %%xmm11, %%xmm9\n"
10684           "addps %%xmm8, %%xmm10\n"
10685           "subps %%xmm9, %%xmm10\n"
10686           "movaps %%xmm10, %%xmm4\n"
10687           "movaps %%xmm5, %%xmm8\n"
10688           "shufps $68, %%xmm8, %%xmm8\n"
10689           "xorps %%xmm9, %%xmm9\n"
10690           "movaps %%xmm5, %%xmm10\n"
10691           "shufps $14, %%xmm9, %%xmm10\n"
10692           "movaps %%xmm5, %%xmm11\n"
10693           "shufps $224, %%xmm11, %%xmm9\n"
10694           "addps %%xmm8, %%xmm10\n"
10695           "subps %%xmm9, %%xmm10\n"
10696           "movaps %%xmm10, %%xmm5\n"
10697           "movaps %%xmm6, %%xmm8\n"
10698           "shufps $68, %%xmm8, %%xmm8\n"
10699           "xorps %%xmm9, %%xmm9\n"
10700           "movaps %%xmm6, %%xmm10\n"
10701           "shufps $14, %%xmm9, %%xmm10\n"
10702           "movaps %%xmm6, %%xmm11\n"
10703           "shufps $224, %%xmm11, %%xmm9\n"
10704           "addps %%xmm8, %%xmm10\n"
10705           "subps %%xmm9, %%xmm10\n"
10706           "movaps %%xmm10, %%xmm6\n"
10707           "movaps %%xmm7, %%xmm8\n"
10708           "shufps $68, %%xmm8, %%xmm8\n"
10709           "xorps %%xmm9, %%xmm9\n"
10710           "movaps %%xmm7, %%xmm10\n"
10711           "shufps $14, %%xmm9, %%xmm10\n"
10712           "movaps %%xmm7, %%xmm11\n"
10713           "shufps $224, %%xmm11, %%xmm9\n"
10714           "addps %%xmm8, %%xmm10\n"
10715           "subps %%xmm9, %%xmm10\n"
10716           "movaps %%xmm10, %%xmm7\n"
10717           "movaps %%xmm0, %%xmm8\n"
10718           "movaps %%xmm0, %%xmm9\n"
10719           "addps %%xmm1, %%xmm8\n"
10720           "subps %%xmm1, %%xmm9\n"
10721           "movaps %%xmm2, %%xmm10\n"
10722           "movaps %%xmm2, %%xmm11\n"
10723           "addps %%xmm3, %%xmm10\n"
10724           "subps %%xmm3, %%xmm11\n"
10725           "movaps %%xmm4, %%xmm12\n"
10726           "movaps %%xmm4, %%xmm13\n"
10727           "addps %%xmm5, %%xmm12\n"
10728           "subps %%xmm5, %%xmm13\n"
10729           "movaps %%xmm6, %%xmm14\n"
10730           "movaps %%xmm6, %%xmm15\n"
10731           "addps %%xmm7, %%xmm14\n"
10732           "subps %%xmm7, %%xmm15\n"
10733           "movaps %%xmm8, %%xmm0\n"
10734           "movaps %%xmm8, %%xmm2\n"
10735           "addps %%xmm10, %%xmm0\n"
10736           "subps %%xmm10, %%xmm2\n"
10737           "movaps %%xmm9, %%xmm1\n"
10738           "movaps %%xmm9, %%xmm3\n"
10739           "addps %%xmm11, %%xmm1\n"
10740           "subps %%xmm11, %%xmm3\n"
10741           "movaps %%xmm12, %%xmm4\n"
10742           "movaps %%xmm12, %%xmm6\n"
10743           "addps %%xmm14, %%xmm4\n"
10744           "subps %%xmm14, %%xmm6\n"
10745           "movaps %%xmm13, %%xmm5\n"
10746           "movaps %%xmm13, %%xmm7\n"
10747           "addps %%xmm15, %%xmm5\n"
10748           "subps %%xmm15, %%xmm7\n"
10749           "movaps %%xmm0, %%xmm8\n"
10750           "movaps %%xmm0, %%xmm12\n"
10751           "addps %%xmm4, %%xmm8\n"
10752           "subps %%xmm4, %%xmm12\n"
10753           "movaps %%xmm1, %%xmm9\n"
10754           "movaps %%xmm1, %%xmm13\n"
10755           "addps %%xmm5, %%xmm9\n"
10756           "subps %%xmm5, %%xmm13\n"
10757           "movaps %%xmm2, %%xmm10\n"
10758           "movaps %%xmm2, %%xmm14\n"
10759           "addps %%xmm6, %%xmm10\n"
10760           "subps %%xmm6, %%xmm14\n"
10761           "movaps %%xmm3, %%xmm11\n"
10762           "movaps %%xmm3, %%xmm15\n"
10763           "addps %%xmm7, %%xmm11\n"
10764           "subps %%xmm7, %%xmm15\n"
10765           "movups %%xmm8, (%0)\n"
10766           "movups %%xmm9, (%1)\n"
10767           "movups %%xmm10, (%2)\n"
10768           "movups %%xmm11, (%3)\n"
10769           "movups %%xmm12, (%4)\n"
10770           "movups %%xmm13, (%5)\n"
10771           "movups %%xmm14, (%6)\n"
10772           "movups %%xmm15, (%7)\n"
10773           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
10774         );
10775       }
10776     }
10777     for (int j = 0; j < 4096; j += 256) {
10778       for (int k = 0; k < 32; k += 4) {
10779         __asm__ volatile (
10780           "movups (%0), %%xmm0\n"
10781           "movups (%1), %%xmm1\n"
10782           "movups (%2), %%xmm2\n"
10783           "movups (%3), %%xmm3\n"
10784           "movups (%4), %%xmm4\n"
10785           "movups (%5), %%xmm5\n"
10786           "movups (%6), %%xmm6\n"
10787           "movups (%7), %%xmm7\n"
10788           "movaps %%xmm0, %%xmm8\n"
10789           "movaps %%xmm0, %%xmm9\n"
10790           "addps %%xmm1, %%xmm8\n"
10791           "subps %%xmm1, %%xmm9\n"
10792           "movaps %%xmm2, %%xmm10\n"
10793           "movaps %%xmm2, %%xmm11\n"
10794           "addps %%xmm3, %%xmm10\n"
10795           "subps %%xmm3, %%xmm11\n"
10796           "movaps %%xmm4, %%xmm12\n"
10797           "movaps %%xmm4, %%xmm13\n"
10798           "addps %%xmm5, %%xmm12\n"
10799           "subps %%xmm5, %%xmm13\n"
10800           "movaps %%xmm6, %%xmm14\n"
10801           "movaps %%xmm6, %%xmm15\n"
10802           "addps %%xmm7, %%xmm14\n"
10803           "subps %%xmm7, %%xmm15\n"
10804           "movaps %%xmm8, %%xmm0\n"
10805           "movaps %%xmm8, %%xmm2\n"
10806           "addps %%xmm10, %%xmm0\n"
10807           "subps %%xmm10, %%xmm2\n"
10808           "movaps %%xmm9, %%xmm1\n"
10809           "movaps %%xmm9, %%xmm3\n"
10810           "addps %%xmm11, %%xmm1\n"
10811           "subps %%xmm11, %%xmm3\n"
10812           "movaps %%xmm12, %%xmm4\n"
10813           "movaps %%xmm12, %%xmm6\n"
10814           "addps %%xmm14, %%xmm4\n"
10815           "subps %%xmm14, %%xmm6\n"
10816           "movaps %%xmm13, %%xmm5\n"
10817           "movaps %%xmm13, %%xmm7\n"
10818           "addps %%xmm15, %%xmm5\n"
10819           "subps %%xmm15, %%xmm7\n"
10820           "movaps %%xmm0, %%xmm8\n"
10821           "movaps %%xmm0, %%xmm12\n"
10822           "addps %%xmm4, %%xmm8\n"
10823           "subps %%xmm4, %%xmm12\n"
10824           "movaps %%xmm1, %%xmm9\n"
10825           "movaps %%xmm1, %%xmm13\n"
10826           "addps %%xmm5, %%xmm9\n"
10827           "subps %%xmm5, %%xmm13\n"
10828           "movaps %%xmm2, %%xmm10\n"
10829           "movaps %%xmm2, %%xmm14\n"
10830           "addps %%xmm6, %%xmm10\n"
10831           "subps %%xmm6, %%xmm14\n"
10832           "movaps %%xmm3, %%xmm11\n"
10833           "movaps %%xmm3, %%xmm15\n"
10834           "addps %%xmm7, %%xmm11\n"
10835           "subps %%xmm7, %%xmm15\n"
10836           "movups %%xmm8, (%0)\n"
10837           "movups %%xmm9, (%1)\n"
10838           "movups %%xmm10, (%2)\n"
10839           "movups %%xmm11, (%3)\n"
10840           "movups %%xmm12, (%4)\n"
10841           "movups %%xmm13, (%5)\n"
10842           "movups %%xmm14, (%6)\n"
10843           "movups %%xmm15, (%7)\n"
10844           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
10845         );
10846       }
10847     }
10848     for (int j = 0; j < 4096; j += 2048) {
10849       for (int k = 0; k < 256; k += 4) {
10850         __asm__ volatile (
10851           "movups (%0), %%xmm0\n"
10852           "movups (%1), %%xmm1\n"
10853           "movups (%2), %%xmm2\n"
10854           "movups (%3), %%xmm3\n"
10855           "movups (%4), %%xmm4\n"
10856           "movups (%5), %%xmm5\n"
10857           "movups (%6), %%xmm6\n"
10858           "movups (%7), %%xmm7\n"
10859           "movaps %%xmm0, %%xmm8\n"
10860           "movaps %%xmm0, %%xmm9\n"
10861           "addps %%xmm1, %%xmm8\n"
10862           "subps %%xmm1, %%xmm9\n"
10863           "movaps %%xmm2, %%xmm10\n"
10864           "movaps %%xmm2, %%xmm11\n"
10865           "addps %%xmm3, %%xmm10\n"
10866           "subps %%xmm3, %%xmm11\n"
10867           "movaps %%xmm4, %%xmm12\n"
10868           "movaps %%xmm4, %%xmm13\n"
10869           "addps %%xmm5, %%xmm12\n"
10870           "subps %%xmm5, %%xmm13\n"
10871           "movaps %%xmm6, %%xmm14\n"
10872           "movaps %%xmm6, %%xmm15\n"
10873           "addps %%xmm7, %%xmm14\n"
10874           "subps %%xmm7, %%xmm15\n"
10875           "movaps %%xmm8, %%xmm0\n"
10876           "movaps %%xmm8, %%xmm2\n"
10877           "addps %%xmm10, %%xmm0\n"
10878           "subps %%xmm10, %%xmm2\n"
10879           "movaps %%xmm9, %%xmm1\n"
10880           "movaps %%xmm9, %%xmm3\n"
10881           "addps %%xmm11, %%xmm1\n"
10882           "subps %%xmm11, %%xmm3\n"
10883           "movaps %%xmm12, %%xmm4\n"
10884           "movaps %%xmm12, %%xmm6\n"
10885           "addps %%xmm14, %%xmm4\n"
10886           "subps %%xmm14, %%xmm6\n"
10887           "movaps %%xmm13, %%xmm5\n"
10888           "movaps %%xmm13, %%xmm7\n"
10889           "addps %%xmm15, %%xmm5\n"
10890           "subps %%xmm15, %%xmm7\n"
10891           "movaps %%xmm0, %%xmm8\n"
10892           "movaps %%xmm0, %%xmm12\n"
10893           "addps %%xmm4, %%xmm8\n"
10894           "subps %%xmm4, %%xmm12\n"
10895           "movaps %%xmm1, %%xmm9\n"
10896           "movaps %%xmm1, %%xmm13\n"
10897           "addps %%xmm5, %%xmm9\n"
10898           "subps %%xmm5, %%xmm13\n"
10899           "movaps %%xmm2, %%xmm10\n"
10900           "movaps %%xmm2, %%xmm14\n"
10901           "addps %%xmm6, %%xmm10\n"
10902           "subps %%xmm6, %%xmm14\n"
10903           "movaps %%xmm3, %%xmm11\n"
10904           "movaps %%xmm3, %%xmm15\n"
10905           "addps %%xmm7, %%xmm11\n"
10906           "subps %%xmm7, %%xmm15\n"
10907           "movups %%xmm8, (%0)\n"
10908           "movups %%xmm9, (%1)\n"
10909           "movups %%xmm10, (%2)\n"
10910           "movups %%xmm11, (%3)\n"
10911           "movups %%xmm12, (%4)\n"
10912           "movups %%xmm13, (%5)\n"
10913           "movups %%xmm14, (%6)\n"
10914           "movups %%xmm15, (%7)\n"
10915           :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
10916         );
10917       }
10918     }
10919     for (int j = 0; j < 4096; j += 4096) {
10920       for (int k = 0; k < 2048; k += 4) {
10921         __asm__ volatile (
10922           "movups (%0), %%xmm0\n"
10923           "movups (%1), %%xmm1\n"
10924           "movaps %%xmm0, %%xmm8\n"
10925           "movaps %%xmm0, %%xmm9\n"
10926           "addps %%xmm1, %%xmm8\n"
10927           "subps %%xmm1, %%xmm9\n"
10928           "movups %%xmm8, (%0)\n"
10929           "movups %%xmm9, (%1)\n"
10930           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
10931         );
10932       }
10933     }
10934     return;
10935   }
10936   if (depth == 15) {
10937     helper_float_27_recursive(buf + 0, 12);
10938     helper_float_27_recursive(buf + 4096, 12);
10939     helper_float_27_recursive(buf + 8192, 12);
10940     helper_float_27_recursive(buf + 12288, 12);
10941     helper_float_27_recursive(buf + 16384, 12);
10942     helper_float_27_recursive(buf + 20480, 12);
10943     helper_float_27_recursive(buf + 24576, 12);
10944     helper_float_27_recursive(buf + 28672, 12);
10945     for (int j = 0; j < 32768; j += 32768) {
10946       for (int k = 0; k < 4096; k += 4) {
10947         __asm__ volatile (
10948           "movups (%0), %%xmm0\n"
10949           "movups (%1), %%xmm1\n"
10950           "movups (%2), %%xmm2\n"
10951           "movups (%3), %%xmm3\n"
10952           "movups (%4), %%xmm4\n"
10953           "movups (%5), %%xmm5\n"
10954           "movups (%6), %%xmm6\n"
10955           "movups (%7), %%xmm7\n"
10956           "movaps %%xmm0, %%xmm8\n"
10957           "movaps %%xmm0, %%xmm9\n"
10958           "addps %%xmm1, %%xmm8\n"
10959           "subps %%xmm1, %%xmm9\n"
10960           "movaps %%xmm2, %%xmm10\n"
10961           "movaps %%xmm2, %%xmm11\n"
10962           "addps %%xmm3, %%xmm10\n"
10963           "subps %%xmm3, %%xmm11\n"
10964           "movaps %%xmm4, %%xmm12\n"
10965           "movaps %%xmm4, %%xmm13\n"
10966           "addps %%xmm5, %%xmm12\n"
10967           "subps %%xmm5, %%xmm13\n"
10968           "movaps %%xmm6, %%xmm14\n"
10969           "movaps %%xmm6, %%xmm15\n"
10970           "addps %%xmm7, %%xmm14\n"
10971           "subps %%xmm7, %%xmm15\n"
10972           "movaps %%xmm8, %%xmm0\n"
10973           "movaps %%xmm8, %%xmm2\n"
10974           "addps %%xmm10, %%xmm0\n"
10975           "subps %%xmm10, %%xmm2\n"
10976           "movaps %%xmm9, %%xmm1\n"
10977           "movaps %%xmm9, %%xmm3\n"
10978           "addps %%xmm11, %%xmm1\n"
10979           "subps %%xmm11, %%xmm3\n"
10980           "movaps %%xmm12, %%xmm4\n"
10981           "movaps %%xmm12, %%xmm6\n"
10982           "addps %%xmm14, %%xmm4\n"
10983           "subps %%xmm14, %%xmm6\n"
10984           "movaps %%xmm13, %%xmm5\n"
10985           "movaps %%xmm13, %%xmm7\n"
10986           "addps %%xmm15, %%xmm5\n"
10987           "subps %%xmm15, %%xmm7\n"
10988           "movaps %%xmm0, %%xmm8\n"
10989           "movaps %%xmm0, %%xmm12\n"
10990           "addps %%xmm4, %%xmm8\n"
10991           "subps %%xmm4, %%xmm12\n"
10992           "movaps %%xmm1, %%xmm9\n"
10993           "movaps %%xmm1, %%xmm13\n"
10994           "addps %%xmm5, %%xmm9\n"
10995           "subps %%xmm5, %%xmm13\n"
10996           "movaps %%xmm2, %%xmm10\n"
10997           "movaps %%xmm2, %%xmm14\n"
10998           "addps %%xmm6, %%xmm10\n"
10999           "subps %%xmm6, %%xmm14\n"
11000           "movaps %%xmm3, %%xmm11\n"
11001           "movaps %%xmm3, %%xmm15\n"
11002           "addps %%xmm7, %%xmm11\n"
11003           "subps %%xmm7, %%xmm15\n"
11004           "movups %%xmm8, (%0)\n"
11005           "movups %%xmm9, (%1)\n"
11006           "movups %%xmm10, (%2)\n"
11007           "movups %%xmm11, (%3)\n"
11008           "movups %%xmm12, (%4)\n"
11009           "movups %%xmm13, (%5)\n"
11010           "movups %%xmm14, (%6)\n"
11011           "movups %%xmm15, (%7)\n"
11012           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
11013         );
11014       }
11015     }
11016     return;
11017   }
11018   if (depth == 18) {
11019     helper_float_27_recursive(buf + 0, 15);
11020     helper_float_27_recursive(buf + 32768, 15);
11021     helper_float_27_recursive(buf + 65536, 15);
11022     helper_float_27_recursive(buf + 98304, 15);
11023     helper_float_27_recursive(buf + 131072, 15);
11024     helper_float_27_recursive(buf + 163840, 15);
11025     helper_float_27_recursive(buf + 196608, 15);
11026     helper_float_27_recursive(buf + 229376, 15);
11027     for (int j = 0; j < 262144; j += 262144) {
11028       for (int k = 0; k < 32768; k += 4) {
11029         __asm__ volatile (
11030           "movups (%0), %%xmm0\n"
11031           "movups (%1), %%xmm1\n"
11032           "movups (%2), %%xmm2\n"
11033           "movups (%3), %%xmm3\n"
11034           "movups (%4), %%xmm4\n"
11035           "movups (%5), %%xmm5\n"
11036           "movups (%6), %%xmm6\n"
11037           "movups (%7), %%xmm7\n"
11038           "movaps %%xmm0, %%xmm8\n"
11039           "movaps %%xmm0, %%xmm9\n"
11040           "addps %%xmm1, %%xmm8\n"
11041           "subps %%xmm1, %%xmm9\n"
11042           "movaps %%xmm2, %%xmm10\n"
11043           "movaps %%xmm2, %%xmm11\n"
11044           "addps %%xmm3, %%xmm10\n"
11045           "subps %%xmm3, %%xmm11\n"
11046           "movaps %%xmm4, %%xmm12\n"
11047           "movaps %%xmm4, %%xmm13\n"
11048           "addps %%xmm5, %%xmm12\n"
11049           "subps %%xmm5, %%xmm13\n"
11050           "movaps %%xmm6, %%xmm14\n"
11051           "movaps %%xmm6, %%xmm15\n"
11052           "addps %%xmm7, %%xmm14\n"
11053           "subps %%xmm7, %%xmm15\n"
11054           "movaps %%xmm8, %%xmm0\n"
11055           "movaps %%xmm8, %%xmm2\n"
11056           "addps %%xmm10, %%xmm0\n"
11057           "subps %%xmm10, %%xmm2\n"
11058           "movaps %%xmm9, %%xmm1\n"
11059           "movaps %%xmm9, %%xmm3\n"
11060           "addps %%xmm11, %%xmm1\n"
11061           "subps %%xmm11, %%xmm3\n"
11062           "movaps %%xmm12, %%xmm4\n"
11063           "movaps %%xmm12, %%xmm6\n"
11064           "addps %%xmm14, %%xmm4\n"
11065           "subps %%xmm14, %%xmm6\n"
11066           "movaps %%xmm13, %%xmm5\n"
11067           "movaps %%xmm13, %%xmm7\n"
11068           "addps %%xmm15, %%xmm5\n"
11069           "subps %%xmm15, %%xmm7\n"
11070           "movaps %%xmm0, %%xmm8\n"
11071           "movaps %%xmm0, %%xmm12\n"
11072           "addps %%xmm4, %%xmm8\n"
11073           "subps %%xmm4, %%xmm12\n"
11074           "movaps %%xmm1, %%xmm9\n"
11075           "movaps %%xmm1, %%xmm13\n"
11076           "addps %%xmm5, %%xmm9\n"
11077           "subps %%xmm5, %%xmm13\n"
11078           "movaps %%xmm2, %%xmm10\n"
11079           "movaps %%xmm2, %%xmm14\n"
11080           "addps %%xmm6, %%xmm10\n"
11081           "subps %%xmm6, %%xmm14\n"
11082           "movaps %%xmm3, %%xmm11\n"
11083           "movaps %%xmm3, %%xmm15\n"
11084           "addps %%xmm7, %%xmm11\n"
11085           "subps %%xmm7, %%xmm15\n"
11086           "movups %%xmm8, (%0)\n"
11087           "movups %%xmm9, (%1)\n"
11088           "movups %%xmm10, (%2)\n"
11089           "movups %%xmm11, (%3)\n"
11090           "movups %%xmm12, (%4)\n"
11091           "movups %%xmm13, (%5)\n"
11092           "movups %%xmm14, (%6)\n"
11093           "movups %%xmm15, (%7)\n"
11094           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
11095         );
11096       }
11097     }
11098     return;
11099   }
11100   if (depth == 21) {
11101     helper_float_27_recursive(buf + 0, 18);
11102     helper_float_27_recursive(buf + 262144, 18);
11103     helper_float_27_recursive(buf + 524288, 18);
11104     helper_float_27_recursive(buf + 786432, 18);
11105     helper_float_27_recursive(buf + 1048576, 18);
11106     helper_float_27_recursive(buf + 1310720, 18);
11107     helper_float_27_recursive(buf + 1572864, 18);
11108     helper_float_27_recursive(buf + 1835008, 18);
11109     for (int j = 0; j < 2097152; j += 2097152) {
11110       for (int k = 0; k < 262144; k += 4) {
11111         __asm__ volatile (
11112           "movups (%0), %%xmm0\n"
11113           "movups (%1), %%xmm1\n"
11114           "movups (%2), %%xmm2\n"
11115           "movups (%3), %%xmm3\n"
11116           "movups (%4), %%xmm4\n"
11117           "movups (%5), %%xmm5\n"
11118           "movups (%6), %%xmm6\n"
11119           "movups (%7), %%xmm7\n"
11120           "movaps %%xmm0, %%xmm8\n"
11121           "movaps %%xmm0, %%xmm9\n"
11122           "addps %%xmm1, %%xmm8\n"
11123           "subps %%xmm1, %%xmm9\n"
11124           "movaps %%xmm2, %%xmm10\n"
11125           "movaps %%xmm2, %%xmm11\n"
11126           "addps %%xmm3, %%xmm10\n"
11127           "subps %%xmm3, %%xmm11\n"
11128           "movaps %%xmm4, %%xmm12\n"
11129           "movaps %%xmm4, %%xmm13\n"
11130           "addps %%xmm5, %%xmm12\n"
11131           "subps %%xmm5, %%xmm13\n"
11132           "movaps %%xmm6, %%xmm14\n"
11133           "movaps %%xmm6, %%xmm15\n"
11134           "addps %%xmm7, %%xmm14\n"
11135           "subps %%xmm7, %%xmm15\n"
11136           "movaps %%xmm8, %%xmm0\n"
11137           "movaps %%xmm8, %%xmm2\n"
11138           "addps %%xmm10, %%xmm0\n"
11139           "subps %%xmm10, %%xmm2\n"
11140           "movaps %%xmm9, %%xmm1\n"
11141           "movaps %%xmm9, %%xmm3\n"
11142           "addps %%xmm11, %%xmm1\n"
11143           "subps %%xmm11, %%xmm3\n"
11144           "movaps %%xmm12, %%xmm4\n"
11145           "movaps %%xmm12, %%xmm6\n"
11146           "addps %%xmm14, %%xmm4\n"
11147           "subps %%xmm14, %%xmm6\n"
11148           "movaps %%xmm13, %%xmm5\n"
11149           "movaps %%xmm13, %%xmm7\n"
11150           "addps %%xmm15, %%xmm5\n"
11151           "subps %%xmm15, %%xmm7\n"
11152           "movaps %%xmm0, %%xmm8\n"
11153           "movaps %%xmm0, %%xmm12\n"
11154           "addps %%xmm4, %%xmm8\n"
11155           "subps %%xmm4, %%xmm12\n"
11156           "movaps %%xmm1, %%xmm9\n"
11157           "movaps %%xmm1, %%xmm13\n"
11158           "addps %%xmm5, %%xmm9\n"
11159           "subps %%xmm5, %%xmm13\n"
11160           "movaps %%xmm2, %%xmm10\n"
11161           "movaps %%xmm2, %%xmm14\n"
11162           "addps %%xmm6, %%xmm10\n"
11163           "subps %%xmm6, %%xmm14\n"
11164           "movaps %%xmm3, %%xmm11\n"
11165           "movaps %%xmm3, %%xmm15\n"
11166           "addps %%xmm7, %%xmm11\n"
11167           "subps %%xmm7, %%xmm15\n"
11168           "movups %%xmm8, (%0)\n"
11169           "movups %%xmm9, (%1)\n"
11170           "movups %%xmm10, (%2)\n"
11171           "movups %%xmm11, (%3)\n"
11172           "movups %%xmm12, (%4)\n"
11173           "movups %%xmm13, (%5)\n"
11174           "movups %%xmm14, (%6)\n"
11175           "movups %%xmm15, (%7)\n"
11176           :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
11177         );
11178       }
11179     }
11180     return;
11181   }
11182   if (depth == 24) {
11183     helper_float_27_recursive(buf + 0, 21);
11184     helper_float_27_recursive(buf + 2097152, 21);
11185     helper_float_27_recursive(buf + 4194304, 21);
11186     helper_float_27_recursive(buf + 6291456, 21);
11187     helper_float_27_recursive(buf + 8388608, 21);
11188     helper_float_27_recursive(buf + 10485760, 21);
11189     helper_float_27_recursive(buf + 12582912, 21);
11190     helper_float_27_recursive(buf + 14680064, 21);
11191     for (int j = 0; j < 16777216; j += 16777216) {
11192       for (int k = 0; k < 2097152; k += 4) {
11193         __asm__ volatile (
11194           "movups (%0), %%xmm0\n"
11195           "movups (%1), %%xmm1\n"
11196           "movups (%2), %%xmm2\n"
11197           "movups (%3), %%xmm3\n"
11198           "movups (%4), %%xmm4\n"
11199           "movups (%5), %%xmm5\n"
11200           "movups (%6), %%xmm6\n"
11201           "movups (%7), %%xmm7\n"
11202           "movaps %%xmm0, %%xmm8\n"
11203           "movaps %%xmm0, %%xmm9\n"
11204           "addps %%xmm1, %%xmm8\n"
11205           "subps %%xmm1, %%xmm9\n"
11206           "movaps %%xmm2, %%xmm10\n"
11207           "movaps %%xmm2, %%xmm11\n"
11208           "addps %%xmm3, %%xmm10\n"
11209           "subps %%xmm3, %%xmm11\n"
11210           "movaps %%xmm4, %%xmm12\n"
11211           "movaps %%xmm4, %%xmm13\n"
11212           "addps %%xmm5, %%xmm12\n"
11213           "subps %%xmm5, %%xmm13\n"
11214           "movaps %%xmm6, %%xmm14\n"
11215           "movaps %%xmm6, %%xmm15\n"
11216           "addps %%xmm7, %%xmm14\n"
11217           "subps %%xmm7, %%xmm15\n"
11218           "movaps %%xmm8, %%xmm0\n"
11219           "movaps %%xmm8, %%xmm2\n"
11220           "addps %%xmm10, %%xmm0\n"
11221           "subps %%xmm10, %%xmm2\n"
11222           "movaps %%xmm9, %%xmm1\n"
11223           "movaps %%xmm9, %%xmm3\n"
11224           "addps %%xmm11, %%xmm1\n"
11225           "subps %%xmm11, %%xmm3\n"
11226           "movaps %%xmm12, %%xmm4\n"
11227           "movaps %%xmm12, %%xmm6\n"
11228           "addps %%xmm14, %%xmm4\n"
11229           "subps %%xmm14, %%xmm6\n"
11230           "movaps %%xmm13, %%xmm5\n"
11231           "movaps %%xmm13, %%xmm7\n"
11232           "addps %%xmm15, %%xmm5\n"
11233           "subps %%xmm15, %%xmm7\n"
11234           "movaps %%xmm0, %%xmm8\n"
11235           "movaps %%xmm0, %%xmm12\n"
11236           "addps %%xmm4, %%xmm8\n"
11237           "subps %%xmm4, %%xmm12\n"
11238           "movaps %%xmm1, %%xmm9\n"
11239           "movaps %%xmm1, %%xmm13\n"
11240           "addps %%xmm5, %%xmm9\n"
11241           "subps %%xmm5, %%xmm13\n"
11242           "movaps %%xmm2, %%xmm10\n"
11243           "movaps %%xmm2, %%xmm14\n"
11244           "addps %%xmm6, %%xmm10\n"
11245           "subps %%xmm6, %%xmm14\n"
11246           "movaps %%xmm3, %%xmm11\n"
11247           "movaps %%xmm3, %%xmm15\n"
11248           "addps %%xmm7, %%xmm11\n"
11249           "subps %%xmm7, %%xmm15\n"
11250           "movups %%xmm8, (%0)\n"
11251           "movups %%xmm9, (%1)\n"
11252           "movups %%xmm10, (%2)\n"
11253           "movups %%xmm11, (%3)\n"
11254           "movups %%xmm12, (%4)\n"
11255           "movups %%xmm13, (%5)\n"
11256           "movups %%xmm14, (%6)\n"
11257           "movups %%xmm15, (%7)\n"
11258           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
11259         );
11260       }
11261     }
11262     return;
11263   }
11264   if (depth == 27) {
11265     helper_float_27_recursive(buf + 0, 24);
11266     helper_float_27_recursive(buf + 16777216, 24);
11267     helper_float_27_recursive(buf + 33554432, 24);
11268     helper_float_27_recursive(buf + 50331648, 24);
11269     helper_float_27_recursive(buf + 67108864, 24);
11270     helper_float_27_recursive(buf + 83886080, 24);
11271     helper_float_27_recursive(buf + 100663296, 24);
11272     helper_float_27_recursive(buf + 117440512, 24);
11273     for (int j = 0; j < 134217728; j += 134217728) {
11274       for (int k = 0; k < 16777216; k += 4) {
11275         __asm__ volatile (
11276           "movups (%0), %%xmm0\n"
11277           "movups (%1), %%xmm1\n"
11278           "movups (%2), %%xmm2\n"
11279           "movups (%3), %%xmm3\n"
11280           "movups (%4), %%xmm4\n"
11281           "movups (%5), %%xmm5\n"
11282           "movups (%6), %%xmm6\n"
11283           "movups (%7), %%xmm7\n"
11284           "movaps %%xmm0, %%xmm8\n"
11285           "movaps %%xmm0, %%xmm9\n"
11286           "addps %%xmm1, %%xmm8\n"
11287           "subps %%xmm1, %%xmm9\n"
11288           "movaps %%xmm2, %%xmm10\n"
11289           "movaps %%xmm2, %%xmm11\n"
11290           "addps %%xmm3, %%xmm10\n"
11291           "subps %%xmm3, %%xmm11\n"
11292           "movaps %%xmm4, %%xmm12\n"
11293           "movaps %%xmm4, %%xmm13\n"
11294           "addps %%xmm5, %%xmm12\n"
11295           "subps %%xmm5, %%xmm13\n"
11296           "movaps %%xmm6, %%xmm14\n"
11297           "movaps %%xmm6, %%xmm15\n"
11298           "addps %%xmm7, %%xmm14\n"
11299           "subps %%xmm7, %%xmm15\n"
11300           "movaps %%xmm8, %%xmm0\n"
11301           "movaps %%xmm8, %%xmm2\n"
11302           "addps %%xmm10, %%xmm0\n"
11303           "subps %%xmm10, %%xmm2\n"
11304           "movaps %%xmm9, %%xmm1\n"
11305           "movaps %%xmm9, %%xmm3\n"
11306           "addps %%xmm11, %%xmm1\n"
11307           "subps %%xmm11, %%xmm3\n"
11308           "movaps %%xmm12, %%xmm4\n"
11309           "movaps %%xmm12, %%xmm6\n"
11310           "addps %%xmm14, %%xmm4\n"
11311           "subps %%xmm14, %%xmm6\n"
11312           "movaps %%xmm13, %%xmm5\n"
11313           "movaps %%xmm13, %%xmm7\n"
11314           "addps %%xmm15, %%xmm5\n"
11315           "subps %%xmm15, %%xmm7\n"
11316           "movaps %%xmm0, %%xmm8\n"
11317           "movaps %%xmm0, %%xmm12\n"
11318           "addps %%xmm4, %%xmm8\n"
11319           "subps %%xmm4, %%xmm12\n"
11320           "movaps %%xmm1, %%xmm9\n"
11321           "movaps %%xmm1, %%xmm13\n"
11322           "addps %%xmm5, %%xmm9\n"
11323           "subps %%xmm5, %%xmm13\n"
11324           "movaps %%xmm2, %%xmm10\n"
11325           "movaps %%xmm2, %%xmm14\n"
11326           "addps %%xmm6, %%xmm10\n"
11327           "subps %%xmm6, %%xmm14\n"
11328           "movaps %%xmm3, %%xmm11\n"
11329           "movaps %%xmm3, %%xmm15\n"
11330           "addps %%xmm7, %%xmm11\n"
11331           "subps %%xmm7, %%xmm15\n"
11332           "movups %%xmm8, (%0)\n"
11333           "movups %%xmm9, (%1)\n"
11334           "movups %%xmm10, (%2)\n"
11335           "movups %%xmm11, (%3)\n"
11336           "movups %%xmm12, (%4)\n"
11337           "movups %%xmm13, (%5)\n"
11338           "movups %%xmm14, (%6)\n"
11339           "movups %%xmm15, (%7)\n"
11340           :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
11341         );
11342       }
11343     }
11344     return;
11345   }
11346 }
void helper_float_27(float *buf);
/* Public entry point: in-place fast Hadamard transform over 2^27 floats.
 * Thin wrapper that hands the buffer to the recursive SSE kernel at the
 * maximum depth this helper supports. buf must hold 134217728 floats. */
void helper_float_27(float *buf) {
  const int full_depth = 27; /* log2 of the transform length */
  helper_float_27_recursive(buf, full_depth);
}
11351 void helper_float_28_recursive(float *buf, int depth);
helper_float_28_recursive(float * buf,int depth)11352 void helper_float_28_recursive(float *buf, int depth) {
11353   if (depth == 16) {
11354     for (int j = 0; j < 65536; j += 32) {
11355       for (int k = 0; k < 4; k += 4) {
11356         __asm__ volatile (
11357           "movups (%0), %%xmm0\n"
11358           "movups (%1), %%xmm1\n"
11359           "movups (%2), %%xmm2\n"
11360           "movups (%3), %%xmm3\n"
11361           "movups (%4), %%xmm4\n"
11362           "movups (%5), %%xmm5\n"
11363           "movups (%6), %%xmm6\n"
11364           "movups (%7), %%xmm7\n"
11365           "movaps %%xmm0, %%xmm8\n"
11366           "shufps $160, %%xmm8, %%xmm8\n"
11367           "shufps $245, %%xmm0, %%xmm0\n"
11368           "xorps %%xmm9, %%xmm9\n"
11369           "subps %%xmm0, %%xmm9\n"
11370           "addsubps %%xmm9, %%xmm8\n"
11371           "movaps %%xmm8, %%xmm0\n"
11372           "movaps %%xmm1, %%xmm8\n"
11373           "shufps $160, %%xmm8, %%xmm8\n"
11374           "shufps $245, %%xmm1, %%xmm1\n"
11375           "xorps %%xmm9, %%xmm9\n"
11376           "subps %%xmm1, %%xmm9\n"
11377           "addsubps %%xmm9, %%xmm8\n"
11378           "movaps %%xmm8, %%xmm1\n"
11379           "movaps %%xmm2, %%xmm8\n"
11380           "shufps $160, %%xmm8, %%xmm8\n"
11381           "shufps $245, %%xmm2, %%xmm2\n"
11382           "xorps %%xmm9, %%xmm9\n"
11383           "subps %%xmm2, %%xmm9\n"
11384           "addsubps %%xmm9, %%xmm8\n"
11385           "movaps %%xmm8, %%xmm2\n"
11386           "movaps %%xmm3, %%xmm8\n"
11387           "shufps $160, %%xmm8, %%xmm8\n"
11388           "shufps $245, %%xmm3, %%xmm3\n"
11389           "xorps %%xmm9, %%xmm9\n"
11390           "subps %%xmm3, %%xmm9\n"
11391           "addsubps %%xmm9, %%xmm8\n"
11392           "movaps %%xmm8, %%xmm3\n"
11393           "movaps %%xmm4, %%xmm8\n"
11394           "shufps $160, %%xmm8, %%xmm8\n"
11395           "shufps $245, %%xmm4, %%xmm4\n"
11396           "xorps %%xmm9, %%xmm9\n"
11397           "subps %%xmm4, %%xmm9\n"
11398           "addsubps %%xmm9, %%xmm8\n"
11399           "movaps %%xmm8, %%xmm4\n"
11400           "movaps %%xmm5, %%xmm8\n"
11401           "shufps $160, %%xmm8, %%xmm8\n"
11402           "shufps $245, %%xmm5, %%xmm5\n"
11403           "xorps %%xmm9, %%xmm9\n"
11404           "subps %%xmm5, %%xmm9\n"
11405           "addsubps %%xmm9, %%xmm8\n"
11406           "movaps %%xmm8, %%xmm5\n"
11407           "movaps %%xmm6, %%xmm8\n"
11408           "shufps $160, %%xmm8, %%xmm8\n"
11409           "shufps $245, %%xmm6, %%xmm6\n"
11410           "xorps %%xmm9, %%xmm9\n"
11411           "subps %%xmm6, %%xmm9\n"
11412           "addsubps %%xmm9, %%xmm8\n"
11413           "movaps %%xmm8, %%xmm6\n"
11414           "movaps %%xmm7, %%xmm8\n"
11415           "shufps $160, %%xmm8, %%xmm8\n"
11416           "shufps $245, %%xmm7, %%xmm7\n"
11417           "xorps %%xmm9, %%xmm9\n"
11418           "subps %%xmm7, %%xmm9\n"
11419           "addsubps %%xmm9, %%xmm8\n"
11420           "movaps %%xmm8, %%xmm7\n"
11421           "movaps %%xmm0, %%xmm8\n"
11422           "shufps $68, %%xmm8, %%xmm8\n"
11423           "xorps %%xmm9, %%xmm9\n"
11424           "movaps %%xmm0, %%xmm10\n"
11425           "shufps $14, %%xmm9, %%xmm10\n"
11426           "movaps %%xmm0, %%xmm11\n"
11427           "shufps $224, %%xmm11, %%xmm9\n"
11428           "addps %%xmm8, %%xmm10\n"
11429           "subps %%xmm9, %%xmm10\n"
11430           "movaps %%xmm10, %%xmm0\n"
11431           "movaps %%xmm1, %%xmm8\n"
11432           "shufps $68, %%xmm8, %%xmm8\n"
11433           "xorps %%xmm9, %%xmm9\n"
11434           "movaps %%xmm1, %%xmm10\n"
11435           "shufps $14, %%xmm9, %%xmm10\n"
11436           "movaps %%xmm1, %%xmm11\n"
11437           "shufps $224, %%xmm11, %%xmm9\n"
11438           "addps %%xmm8, %%xmm10\n"
11439           "subps %%xmm9, %%xmm10\n"
11440           "movaps %%xmm10, %%xmm1\n"
11441           "movaps %%xmm2, %%xmm8\n"
11442           "shufps $68, %%xmm8, %%xmm8\n"
11443           "xorps %%xmm9, %%xmm9\n"
11444           "movaps %%xmm2, %%xmm10\n"
11445           "shufps $14, %%xmm9, %%xmm10\n"
11446           "movaps %%xmm2, %%xmm11\n"
11447           "shufps $224, %%xmm11, %%xmm9\n"
11448           "addps %%xmm8, %%xmm10\n"
11449           "subps %%xmm9, %%xmm10\n"
11450           "movaps %%xmm10, %%xmm2\n"
11451           "movaps %%xmm3, %%xmm8\n"
11452           "shufps $68, %%xmm8, %%xmm8\n"
11453           "xorps %%xmm9, %%xmm9\n"
11454           "movaps %%xmm3, %%xmm10\n"
11455           "shufps $14, %%xmm9, %%xmm10\n"
11456           "movaps %%xmm3, %%xmm11\n"
11457           "shufps $224, %%xmm11, %%xmm9\n"
11458           "addps %%xmm8, %%xmm10\n"
11459           "subps %%xmm9, %%xmm10\n"
11460           "movaps %%xmm10, %%xmm3\n"
11461           "movaps %%xmm4, %%xmm8\n"
11462           "shufps $68, %%xmm8, %%xmm8\n"
11463           "xorps %%xmm9, %%xmm9\n"
11464           "movaps %%xmm4, %%xmm10\n"
11465           "shufps $14, %%xmm9, %%xmm10\n"
11466           "movaps %%xmm4, %%xmm11\n"
11467           "shufps $224, %%xmm11, %%xmm9\n"
11468           "addps %%xmm8, %%xmm10\n"
11469           "subps %%xmm9, %%xmm10\n"
11470           "movaps %%xmm10, %%xmm4\n"
11471           "movaps %%xmm5, %%xmm8\n"
11472           "shufps $68, %%xmm8, %%xmm8\n"
11473           "xorps %%xmm9, %%xmm9\n"
11474           "movaps %%xmm5, %%xmm10\n"
11475           "shufps $14, %%xmm9, %%xmm10\n"
11476           "movaps %%xmm5, %%xmm11\n"
11477           "shufps $224, %%xmm11, %%xmm9\n"
11478           "addps %%xmm8, %%xmm10\n"
11479           "subps %%xmm9, %%xmm10\n"
11480           "movaps %%xmm10, %%xmm5\n"
11481           "movaps %%xmm6, %%xmm8\n"
11482           "shufps $68, %%xmm8, %%xmm8\n"
11483           "xorps %%xmm9, %%xmm9\n"
11484           "movaps %%xmm6, %%xmm10\n"
11485           "shufps $14, %%xmm9, %%xmm10\n"
11486           "movaps %%xmm6, %%xmm11\n"
11487           "shufps $224, %%xmm11, %%xmm9\n"
11488           "addps %%xmm8, %%xmm10\n"
11489           "subps %%xmm9, %%xmm10\n"
11490           "movaps %%xmm10, %%xmm6\n"
11491           "movaps %%xmm7, %%xmm8\n"
11492           "shufps $68, %%xmm8, %%xmm8\n"
11493           "xorps %%xmm9, %%xmm9\n"
11494           "movaps %%xmm7, %%xmm10\n"
11495           "shufps $14, %%xmm9, %%xmm10\n"
11496           "movaps %%xmm7, %%xmm11\n"
11497           "shufps $224, %%xmm11, %%xmm9\n"
11498           "addps %%xmm8, %%xmm10\n"
11499           "subps %%xmm9, %%xmm10\n"
11500           "movaps %%xmm10, %%xmm7\n"
11501           "movaps %%xmm0, %%xmm8\n"
11502           "movaps %%xmm0, %%xmm9\n"
11503           "addps %%xmm1, %%xmm8\n"
11504           "subps %%xmm1, %%xmm9\n"
11505           "movaps %%xmm2, %%xmm10\n"
11506           "movaps %%xmm2, %%xmm11\n"
11507           "addps %%xmm3, %%xmm10\n"
11508           "subps %%xmm3, %%xmm11\n"
11509           "movaps %%xmm4, %%xmm12\n"
11510           "movaps %%xmm4, %%xmm13\n"
11511           "addps %%xmm5, %%xmm12\n"
11512           "subps %%xmm5, %%xmm13\n"
11513           "movaps %%xmm6, %%xmm14\n"
11514           "movaps %%xmm6, %%xmm15\n"
11515           "addps %%xmm7, %%xmm14\n"
11516           "subps %%xmm7, %%xmm15\n"
11517           "movaps %%xmm8, %%xmm0\n"
11518           "movaps %%xmm8, %%xmm2\n"
11519           "addps %%xmm10, %%xmm0\n"
11520           "subps %%xmm10, %%xmm2\n"
11521           "movaps %%xmm9, %%xmm1\n"
11522           "movaps %%xmm9, %%xmm3\n"
11523           "addps %%xmm11, %%xmm1\n"
11524           "subps %%xmm11, %%xmm3\n"
11525           "movaps %%xmm12, %%xmm4\n"
11526           "movaps %%xmm12, %%xmm6\n"
11527           "addps %%xmm14, %%xmm4\n"
11528           "subps %%xmm14, %%xmm6\n"
11529           "movaps %%xmm13, %%xmm5\n"
11530           "movaps %%xmm13, %%xmm7\n"
11531           "addps %%xmm15, %%xmm5\n"
11532           "subps %%xmm15, %%xmm7\n"
11533           "movaps %%xmm0, %%xmm8\n"
11534           "movaps %%xmm0, %%xmm12\n"
11535           "addps %%xmm4, %%xmm8\n"
11536           "subps %%xmm4, %%xmm12\n"
11537           "movaps %%xmm1, %%xmm9\n"
11538           "movaps %%xmm1, %%xmm13\n"
11539           "addps %%xmm5, %%xmm9\n"
11540           "subps %%xmm5, %%xmm13\n"
11541           "movaps %%xmm2, %%xmm10\n"
11542           "movaps %%xmm2, %%xmm14\n"
11543           "addps %%xmm6, %%xmm10\n"
11544           "subps %%xmm6, %%xmm14\n"
11545           "movaps %%xmm3, %%xmm11\n"
11546           "movaps %%xmm3, %%xmm15\n"
11547           "addps %%xmm7, %%xmm11\n"
11548           "subps %%xmm7, %%xmm15\n"
11549           "movups %%xmm8, (%0)\n"
11550           "movups %%xmm9, (%1)\n"
11551           "movups %%xmm10, (%2)\n"
11552           "movups %%xmm11, (%3)\n"
11553           "movups %%xmm12, (%4)\n"
11554           "movups %%xmm13, (%5)\n"
11555           "movups %%xmm14, (%6)\n"
11556           "movups %%xmm15, (%7)\n"
11557           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
11558         );
11559       }
11560     }
11561     for (int j = 0; j < 65536; j += 256) {
11562       for (int k = 0; k < 32; k += 4) {
11563         __asm__ volatile (
11564           "movups (%0), %%xmm0\n"
11565           "movups (%1), %%xmm1\n"
11566           "movups (%2), %%xmm2\n"
11567           "movups (%3), %%xmm3\n"
11568           "movups (%4), %%xmm4\n"
11569           "movups (%5), %%xmm5\n"
11570           "movups (%6), %%xmm6\n"
11571           "movups (%7), %%xmm7\n"
11572           "movaps %%xmm0, %%xmm8\n"
11573           "movaps %%xmm0, %%xmm9\n"
11574           "addps %%xmm1, %%xmm8\n"
11575           "subps %%xmm1, %%xmm9\n"
11576           "movaps %%xmm2, %%xmm10\n"
11577           "movaps %%xmm2, %%xmm11\n"
11578           "addps %%xmm3, %%xmm10\n"
11579           "subps %%xmm3, %%xmm11\n"
11580           "movaps %%xmm4, %%xmm12\n"
11581           "movaps %%xmm4, %%xmm13\n"
11582           "addps %%xmm5, %%xmm12\n"
11583           "subps %%xmm5, %%xmm13\n"
11584           "movaps %%xmm6, %%xmm14\n"
11585           "movaps %%xmm6, %%xmm15\n"
11586           "addps %%xmm7, %%xmm14\n"
11587           "subps %%xmm7, %%xmm15\n"
11588           "movaps %%xmm8, %%xmm0\n"
11589           "movaps %%xmm8, %%xmm2\n"
11590           "addps %%xmm10, %%xmm0\n"
11591           "subps %%xmm10, %%xmm2\n"
11592           "movaps %%xmm9, %%xmm1\n"
11593           "movaps %%xmm9, %%xmm3\n"
11594           "addps %%xmm11, %%xmm1\n"
11595           "subps %%xmm11, %%xmm3\n"
11596           "movaps %%xmm12, %%xmm4\n"
11597           "movaps %%xmm12, %%xmm6\n"
11598           "addps %%xmm14, %%xmm4\n"
11599           "subps %%xmm14, %%xmm6\n"
11600           "movaps %%xmm13, %%xmm5\n"
11601           "movaps %%xmm13, %%xmm7\n"
11602           "addps %%xmm15, %%xmm5\n"
11603           "subps %%xmm15, %%xmm7\n"
11604           "movaps %%xmm0, %%xmm8\n"
11605           "movaps %%xmm0, %%xmm12\n"
11606           "addps %%xmm4, %%xmm8\n"
11607           "subps %%xmm4, %%xmm12\n"
11608           "movaps %%xmm1, %%xmm9\n"
11609           "movaps %%xmm1, %%xmm13\n"
11610           "addps %%xmm5, %%xmm9\n"
11611           "subps %%xmm5, %%xmm13\n"
11612           "movaps %%xmm2, %%xmm10\n"
11613           "movaps %%xmm2, %%xmm14\n"
11614           "addps %%xmm6, %%xmm10\n"
11615           "subps %%xmm6, %%xmm14\n"
11616           "movaps %%xmm3, %%xmm11\n"
11617           "movaps %%xmm3, %%xmm15\n"
11618           "addps %%xmm7, %%xmm11\n"
11619           "subps %%xmm7, %%xmm15\n"
11620           "movups %%xmm8, (%0)\n"
11621           "movups %%xmm9, (%1)\n"
11622           "movups %%xmm10, (%2)\n"
11623           "movups %%xmm11, (%3)\n"
11624           "movups %%xmm12, (%4)\n"
11625           "movups %%xmm13, (%5)\n"
11626           "movups %%xmm14, (%6)\n"
11627           "movups %%xmm15, (%7)\n"
11628           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
11629         );
11630       }
11631     }
11632     for (int j = 0; j < 65536; j += 2048) {
11633       for (int k = 0; k < 256; k += 4) {
11634         __asm__ volatile (
11635           "movups (%0), %%xmm0\n"
11636           "movups (%1), %%xmm1\n"
11637           "movups (%2), %%xmm2\n"
11638           "movups (%3), %%xmm3\n"
11639           "movups (%4), %%xmm4\n"
11640           "movups (%5), %%xmm5\n"
11641           "movups (%6), %%xmm6\n"
11642           "movups (%7), %%xmm7\n"
11643           "movaps %%xmm0, %%xmm8\n"
11644           "movaps %%xmm0, %%xmm9\n"
11645           "addps %%xmm1, %%xmm8\n"
11646           "subps %%xmm1, %%xmm9\n"
11647           "movaps %%xmm2, %%xmm10\n"
11648           "movaps %%xmm2, %%xmm11\n"
11649           "addps %%xmm3, %%xmm10\n"
11650           "subps %%xmm3, %%xmm11\n"
11651           "movaps %%xmm4, %%xmm12\n"
11652           "movaps %%xmm4, %%xmm13\n"
11653           "addps %%xmm5, %%xmm12\n"
11654           "subps %%xmm5, %%xmm13\n"
11655           "movaps %%xmm6, %%xmm14\n"
11656           "movaps %%xmm6, %%xmm15\n"
11657           "addps %%xmm7, %%xmm14\n"
11658           "subps %%xmm7, %%xmm15\n"
11659           "movaps %%xmm8, %%xmm0\n"
11660           "movaps %%xmm8, %%xmm2\n"
11661           "addps %%xmm10, %%xmm0\n"
11662           "subps %%xmm10, %%xmm2\n"
11663           "movaps %%xmm9, %%xmm1\n"
11664           "movaps %%xmm9, %%xmm3\n"
11665           "addps %%xmm11, %%xmm1\n"
11666           "subps %%xmm11, %%xmm3\n"
11667           "movaps %%xmm12, %%xmm4\n"
11668           "movaps %%xmm12, %%xmm6\n"
11669           "addps %%xmm14, %%xmm4\n"
11670           "subps %%xmm14, %%xmm6\n"
11671           "movaps %%xmm13, %%xmm5\n"
11672           "movaps %%xmm13, %%xmm7\n"
11673           "addps %%xmm15, %%xmm5\n"
11674           "subps %%xmm15, %%xmm7\n"
11675           "movaps %%xmm0, %%xmm8\n"
11676           "movaps %%xmm0, %%xmm12\n"
11677           "addps %%xmm4, %%xmm8\n"
11678           "subps %%xmm4, %%xmm12\n"
11679           "movaps %%xmm1, %%xmm9\n"
11680           "movaps %%xmm1, %%xmm13\n"
11681           "addps %%xmm5, %%xmm9\n"
11682           "subps %%xmm5, %%xmm13\n"
11683           "movaps %%xmm2, %%xmm10\n"
11684           "movaps %%xmm2, %%xmm14\n"
11685           "addps %%xmm6, %%xmm10\n"
11686           "subps %%xmm6, %%xmm14\n"
11687           "movaps %%xmm3, %%xmm11\n"
11688           "movaps %%xmm3, %%xmm15\n"
11689           "addps %%xmm7, %%xmm11\n"
11690           "subps %%xmm7, %%xmm15\n"
11691           "movups %%xmm8, (%0)\n"
11692           "movups %%xmm9, (%1)\n"
11693           "movups %%xmm10, (%2)\n"
11694           "movups %%xmm11, (%3)\n"
11695           "movups %%xmm12, (%4)\n"
11696           "movups %%xmm13, (%5)\n"
11697           "movups %%xmm14, (%6)\n"
11698           "movups %%xmm15, (%7)\n"
11699           :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
11700         );
11701       }
11702     }
11703     for (int j = 0; j < 65536; j += 16384) {
11704       for (int k = 0; k < 2048; k += 4) {
11705         __asm__ volatile (
11706           "movups (%0), %%xmm0\n"
11707           "movups (%1), %%xmm1\n"
11708           "movups (%2), %%xmm2\n"
11709           "movups (%3), %%xmm3\n"
11710           "movups (%4), %%xmm4\n"
11711           "movups (%5), %%xmm5\n"
11712           "movups (%6), %%xmm6\n"
11713           "movups (%7), %%xmm7\n"
11714           "movaps %%xmm0, %%xmm8\n"
11715           "movaps %%xmm0, %%xmm9\n"
11716           "addps %%xmm1, %%xmm8\n"
11717           "subps %%xmm1, %%xmm9\n"
11718           "movaps %%xmm2, %%xmm10\n"
11719           "movaps %%xmm2, %%xmm11\n"
11720           "addps %%xmm3, %%xmm10\n"
11721           "subps %%xmm3, %%xmm11\n"
11722           "movaps %%xmm4, %%xmm12\n"
11723           "movaps %%xmm4, %%xmm13\n"
11724           "addps %%xmm5, %%xmm12\n"
11725           "subps %%xmm5, %%xmm13\n"
11726           "movaps %%xmm6, %%xmm14\n"
11727           "movaps %%xmm6, %%xmm15\n"
11728           "addps %%xmm7, %%xmm14\n"
11729           "subps %%xmm7, %%xmm15\n"
11730           "movaps %%xmm8, %%xmm0\n"
11731           "movaps %%xmm8, %%xmm2\n"
11732           "addps %%xmm10, %%xmm0\n"
11733           "subps %%xmm10, %%xmm2\n"
11734           "movaps %%xmm9, %%xmm1\n"
11735           "movaps %%xmm9, %%xmm3\n"
11736           "addps %%xmm11, %%xmm1\n"
11737           "subps %%xmm11, %%xmm3\n"
11738           "movaps %%xmm12, %%xmm4\n"
11739           "movaps %%xmm12, %%xmm6\n"
11740           "addps %%xmm14, %%xmm4\n"
11741           "subps %%xmm14, %%xmm6\n"
11742           "movaps %%xmm13, %%xmm5\n"
11743           "movaps %%xmm13, %%xmm7\n"
11744           "addps %%xmm15, %%xmm5\n"
11745           "subps %%xmm15, %%xmm7\n"
11746           "movaps %%xmm0, %%xmm8\n"
11747           "movaps %%xmm0, %%xmm12\n"
11748           "addps %%xmm4, %%xmm8\n"
11749           "subps %%xmm4, %%xmm12\n"
11750           "movaps %%xmm1, %%xmm9\n"
11751           "movaps %%xmm1, %%xmm13\n"
11752           "addps %%xmm5, %%xmm9\n"
11753           "subps %%xmm5, %%xmm13\n"
11754           "movaps %%xmm2, %%xmm10\n"
11755           "movaps %%xmm2, %%xmm14\n"
11756           "addps %%xmm6, %%xmm10\n"
11757           "subps %%xmm6, %%xmm14\n"
11758           "movaps %%xmm3, %%xmm11\n"
11759           "movaps %%xmm3, %%xmm15\n"
11760           "addps %%xmm7, %%xmm11\n"
11761           "subps %%xmm7, %%xmm15\n"
11762           "movups %%xmm8, (%0)\n"
11763           "movups %%xmm9, (%1)\n"
11764           "movups %%xmm10, (%2)\n"
11765           "movups %%xmm11, (%3)\n"
11766           "movups %%xmm12, (%4)\n"
11767           "movups %%xmm13, (%5)\n"
11768           "movups %%xmm14, (%6)\n"
11769           "movups %%xmm15, (%7)\n"
11770           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
11771         );
11772       }
11773     }
11774     for (int j = 0; j < 65536; j += 65536) {
11775       for (int k = 0; k < 16384; k += 4) {
11776         __asm__ volatile (
11777           "movups (%0), %%xmm0\n"
11778           "movups (%1), %%xmm1\n"
11779           "movups (%2), %%xmm2\n"
11780           "movups (%3), %%xmm3\n"
11781           "movaps %%xmm0, %%xmm8\n"
11782           "movaps %%xmm0, %%xmm9\n"
11783           "addps %%xmm1, %%xmm8\n"
11784           "subps %%xmm1, %%xmm9\n"
11785           "movaps %%xmm2, %%xmm10\n"
11786           "movaps %%xmm2, %%xmm11\n"
11787           "addps %%xmm3, %%xmm10\n"
11788           "subps %%xmm3, %%xmm11\n"
11789           "movaps %%xmm8, %%xmm0\n"
11790           "movaps %%xmm8, %%xmm2\n"
11791           "addps %%xmm10, %%xmm0\n"
11792           "subps %%xmm10, %%xmm2\n"
11793           "movaps %%xmm9, %%xmm1\n"
11794           "movaps %%xmm9, %%xmm3\n"
11795           "addps %%xmm11, %%xmm1\n"
11796           "subps %%xmm11, %%xmm3\n"
11797           "movups %%xmm0, (%0)\n"
11798           "movups %%xmm1, (%1)\n"
11799           "movups %%xmm2, (%2)\n"
11800           "movups %%xmm3, (%3)\n"
11801           :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
11802         );
11803       }
11804     }
11805     return;
11806   }
11807   if (depth == 19) {
11808     helper_float_28_recursive(buf + 0, 16);
11809     helper_float_28_recursive(buf + 65536, 16);
11810     helper_float_28_recursive(buf + 131072, 16);
11811     helper_float_28_recursive(buf + 196608, 16);
11812     helper_float_28_recursive(buf + 262144, 16);
11813     helper_float_28_recursive(buf + 327680, 16);
11814     helper_float_28_recursive(buf + 393216, 16);
11815     helper_float_28_recursive(buf + 458752, 16);
11816     for (int j = 0; j < 524288; j += 524288) {
11817       for (int k = 0; k < 65536; k += 4) {
11818         __asm__ volatile (
11819           "movups (%0), %%xmm0\n"
11820           "movups (%1), %%xmm1\n"
11821           "movups (%2), %%xmm2\n"
11822           "movups (%3), %%xmm3\n"
11823           "movups (%4), %%xmm4\n"
11824           "movups (%5), %%xmm5\n"
11825           "movups (%6), %%xmm6\n"
11826           "movups (%7), %%xmm7\n"
11827           "movaps %%xmm0, %%xmm8\n"
11828           "movaps %%xmm0, %%xmm9\n"
11829           "addps %%xmm1, %%xmm8\n"
11830           "subps %%xmm1, %%xmm9\n"
11831           "movaps %%xmm2, %%xmm10\n"
11832           "movaps %%xmm2, %%xmm11\n"
11833           "addps %%xmm3, %%xmm10\n"
11834           "subps %%xmm3, %%xmm11\n"
11835           "movaps %%xmm4, %%xmm12\n"
11836           "movaps %%xmm4, %%xmm13\n"
11837           "addps %%xmm5, %%xmm12\n"
11838           "subps %%xmm5, %%xmm13\n"
11839           "movaps %%xmm6, %%xmm14\n"
11840           "movaps %%xmm6, %%xmm15\n"
11841           "addps %%xmm7, %%xmm14\n"
11842           "subps %%xmm7, %%xmm15\n"
11843           "movaps %%xmm8, %%xmm0\n"
11844           "movaps %%xmm8, %%xmm2\n"
11845           "addps %%xmm10, %%xmm0\n"
11846           "subps %%xmm10, %%xmm2\n"
11847           "movaps %%xmm9, %%xmm1\n"
11848           "movaps %%xmm9, %%xmm3\n"
11849           "addps %%xmm11, %%xmm1\n"
11850           "subps %%xmm11, %%xmm3\n"
11851           "movaps %%xmm12, %%xmm4\n"
11852           "movaps %%xmm12, %%xmm6\n"
11853           "addps %%xmm14, %%xmm4\n"
11854           "subps %%xmm14, %%xmm6\n"
11855           "movaps %%xmm13, %%xmm5\n"
11856           "movaps %%xmm13, %%xmm7\n"
11857           "addps %%xmm15, %%xmm5\n"
11858           "subps %%xmm15, %%xmm7\n"
11859           "movaps %%xmm0, %%xmm8\n"
11860           "movaps %%xmm0, %%xmm12\n"
11861           "addps %%xmm4, %%xmm8\n"
11862           "subps %%xmm4, %%xmm12\n"
11863           "movaps %%xmm1, %%xmm9\n"
11864           "movaps %%xmm1, %%xmm13\n"
11865           "addps %%xmm5, %%xmm9\n"
11866           "subps %%xmm5, %%xmm13\n"
11867           "movaps %%xmm2, %%xmm10\n"
11868           "movaps %%xmm2, %%xmm14\n"
11869           "addps %%xmm6, %%xmm10\n"
11870           "subps %%xmm6, %%xmm14\n"
11871           "movaps %%xmm3, %%xmm11\n"
11872           "movaps %%xmm3, %%xmm15\n"
11873           "addps %%xmm7, %%xmm11\n"
11874           "subps %%xmm7, %%xmm15\n"
11875           "movups %%xmm8, (%0)\n"
11876           "movups %%xmm9, (%1)\n"
11877           "movups %%xmm10, (%2)\n"
11878           "movups %%xmm11, (%3)\n"
11879           "movups %%xmm12, (%4)\n"
11880           "movups %%xmm13, (%5)\n"
11881           "movups %%xmm14, (%6)\n"
11882           "movups %%xmm15, (%7)\n"
11883           :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
11884         );
11885       }
11886     }
11887     return;
11888   }
11889   if (depth == 22) {
11890     helper_float_28_recursive(buf + 0, 19);
11891     helper_float_28_recursive(buf + 524288, 19);
11892     helper_float_28_recursive(buf + 1048576, 19);
11893     helper_float_28_recursive(buf + 1572864, 19);
11894     helper_float_28_recursive(buf + 2097152, 19);
11895     helper_float_28_recursive(buf + 2621440, 19);
11896     helper_float_28_recursive(buf + 3145728, 19);
11897     helper_float_28_recursive(buf + 3670016, 19);
11898     for (int j = 0; j < 4194304; j += 4194304) {
11899       for (int k = 0; k < 524288; k += 4) {
11900         __asm__ volatile (
11901           "movups (%0), %%xmm0\n"
11902           "movups (%1), %%xmm1\n"
11903           "movups (%2), %%xmm2\n"
11904           "movups (%3), %%xmm3\n"
11905           "movups (%4), %%xmm4\n"
11906           "movups (%5), %%xmm5\n"
11907           "movups (%6), %%xmm6\n"
11908           "movups (%7), %%xmm7\n"
11909           "movaps %%xmm0, %%xmm8\n"
11910           "movaps %%xmm0, %%xmm9\n"
11911           "addps %%xmm1, %%xmm8\n"
11912           "subps %%xmm1, %%xmm9\n"
11913           "movaps %%xmm2, %%xmm10\n"
11914           "movaps %%xmm2, %%xmm11\n"
11915           "addps %%xmm3, %%xmm10\n"
11916           "subps %%xmm3, %%xmm11\n"
11917           "movaps %%xmm4, %%xmm12\n"
11918           "movaps %%xmm4, %%xmm13\n"
11919           "addps %%xmm5, %%xmm12\n"
11920           "subps %%xmm5, %%xmm13\n"
11921           "movaps %%xmm6, %%xmm14\n"
11922           "movaps %%xmm6, %%xmm15\n"
11923           "addps %%xmm7, %%xmm14\n"
11924           "subps %%xmm7, %%xmm15\n"
11925           "movaps %%xmm8, %%xmm0\n"
11926           "movaps %%xmm8, %%xmm2\n"
11927           "addps %%xmm10, %%xmm0\n"
11928           "subps %%xmm10, %%xmm2\n"
11929           "movaps %%xmm9, %%xmm1\n"
11930           "movaps %%xmm9, %%xmm3\n"
11931           "addps %%xmm11, %%xmm1\n"
11932           "subps %%xmm11, %%xmm3\n"
11933           "movaps %%xmm12, %%xmm4\n"
11934           "movaps %%xmm12, %%xmm6\n"
11935           "addps %%xmm14, %%xmm4\n"
11936           "subps %%xmm14, %%xmm6\n"
11937           "movaps %%xmm13, %%xmm5\n"
11938           "movaps %%xmm13, %%xmm7\n"
11939           "addps %%xmm15, %%xmm5\n"
11940           "subps %%xmm15, %%xmm7\n"
11941           "movaps %%xmm0, %%xmm8\n"
11942           "movaps %%xmm0, %%xmm12\n"
11943           "addps %%xmm4, %%xmm8\n"
11944           "subps %%xmm4, %%xmm12\n"
11945           "movaps %%xmm1, %%xmm9\n"
11946           "movaps %%xmm1, %%xmm13\n"
11947           "addps %%xmm5, %%xmm9\n"
11948           "subps %%xmm5, %%xmm13\n"
11949           "movaps %%xmm2, %%xmm10\n"
11950           "movaps %%xmm2, %%xmm14\n"
11951           "addps %%xmm6, %%xmm10\n"
11952           "subps %%xmm6, %%xmm14\n"
11953           "movaps %%xmm3, %%xmm11\n"
11954           "movaps %%xmm3, %%xmm15\n"
11955           "addps %%xmm7, %%xmm11\n"
11956           "subps %%xmm7, %%xmm15\n"
11957           "movups %%xmm8, (%0)\n"
11958           "movups %%xmm9, (%1)\n"
11959           "movups %%xmm10, (%2)\n"
11960           "movups %%xmm11, (%3)\n"
11961           "movups %%xmm12, (%4)\n"
11962           "movups %%xmm13, (%5)\n"
11963           "movups %%xmm14, (%6)\n"
11964           "movups %%xmm15, (%7)\n"
11965           :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864), "r"(buf + j + k + 2097152), "r"(buf + j + k + 2621440), "r"(buf + j + k + 3145728), "r"(buf + j + k + 3670016) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
11966         );
11967       }
11968     }
11969     return;
11970   }
11971   if (depth == 25) {
11972     helper_float_28_recursive(buf + 0, 22);
11973     helper_float_28_recursive(buf + 4194304, 22);
11974     helper_float_28_recursive(buf + 8388608, 22);
11975     helper_float_28_recursive(buf + 12582912, 22);
11976     helper_float_28_recursive(buf + 16777216, 22);
11977     helper_float_28_recursive(buf + 20971520, 22);
11978     helper_float_28_recursive(buf + 25165824, 22);
11979     helper_float_28_recursive(buf + 29360128, 22);
11980     for (int j = 0; j < 33554432; j += 33554432) {
11981       for (int k = 0; k < 4194304; k += 4) {
11982         __asm__ volatile (
11983           "movups (%0), %%xmm0\n"
11984           "movups (%1), %%xmm1\n"
11985           "movups (%2), %%xmm2\n"
11986           "movups (%3), %%xmm3\n"
11987           "movups (%4), %%xmm4\n"
11988           "movups (%5), %%xmm5\n"
11989           "movups (%6), %%xmm6\n"
11990           "movups (%7), %%xmm7\n"
11991           "movaps %%xmm0, %%xmm8\n"
11992           "movaps %%xmm0, %%xmm9\n"
11993           "addps %%xmm1, %%xmm8\n"
11994           "subps %%xmm1, %%xmm9\n"
11995           "movaps %%xmm2, %%xmm10\n"
11996           "movaps %%xmm2, %%xmm11\n"
11997           "addps %%xmm3, %%xmm10\n"
11998           "subps %%xmm3, %%xmm11\n"
11999           "movaps %%xmm4, %%xmm12\n"
12000           "movaps %%xmm4, %%xmm13\n"
12001           "addps %%xmm5, %%xmm12\n"
12002           "subps %%xmm5, %%xmm13\n"
12003           "movaps %%xmm6, %%xmm14\n"
12004           "movaps %%xmm6, %%xmm15\n"
12005           "addps %%xmm7, %%xmm14\n"
12006           "subps %%xmm7, %%xmm15\n"
12007           "movaps %%xmm8, %%xmm0\n"
12008           "movaps %%xmm8, %%xmm2\n"
12009           "addps %%xmm10, %%xmm0\n"
12010           "subps %%xmm10, %%xmm2\n"
12011           "movaps %%xmm9, %%xmm1\n"
12012           "movaps %%xmm9, %%xmm3\n"
12013           "addps %%xmm11, %%xmm1\n"
12014           "subps %%xmm11, %%xmm3\n"
12015           "movaps %%xmm12, %%xmm4\n"
12016           "movaps %%xmm12, %%xmm6\n"
12017           "addps %%xmm14, %%xmm4\n"
12018           "subps %%xmm14, %%xmm6\n"
12019           "movaps %%xmm13, %%xmm5\n"
12020           "movaps %%xmm13, %%xmm7\n"
12021           "addps %%xmm15, %%xmm5\n"
12022           "subps %%xmm15, %%xmm7\n"
12023           "movaps %%xmm0, %%xmm8\n"
12024           "movaps %%xmm0, %%xmm12\n"
12025           "addps %%xmm4, %%xmm8\n"
12026           "subps %%xmm4, %%xmm12\n"
12027           "movaps %%xmm1, %%xmm9\n"
12028           "movaps %%xmm1, %%xmm13\n"
12029           "addps %%xmm5, %%xmm9\n"
12030           "subps %%xmm5, %%xmm13\n"
12031           "movaps %%xmm2, %%xmm10\n"
12032           "movaps %%xmm2, %%xmm14\n"
12033           "addps %%xmm6, %%xmm10\n"
12034           "subps %%xmm6, %%xmm14\n"
12035           "movaps %%xmm3, %%xmm11\n"
12036           "movaps %%xmm3, %%xmm15\n"
12037           "addps %%xmm7, %%xmm11\n"
12038           "subps %%xmm7, %%xmm15\n"
12039           "movups %%xmm8, (%0)\n"
12040           "movups %%xmm9, (%1)\n"
12041           "movups %%xmm10, (%2)\n"
12042           "movups %%xmm11, (%3)\n"
12043           "movups %%xmm12, (%4)\n"
12044           "movups %%xmm13, (%5)\n"
12045           "movups %%xmm14, (%6)\n"
12046           "movups %%xmm15, (%7)\n"
12047           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4194304), "r"(buf + j + k + 8388608), "r"(buf + j + k + 12582912), "r"(buf + j + k + 16777216), "r"(buf + j + k + 20971520), "r"(buf + j + k + 25165824), "r"(buf + j + k + 29360128) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
12048         );
12049       }
12050     }
12051     return;
12052   }
12053   if (depth == 28) {
12054     helper_float_28_recursive(buf + 0, 25);
12055     helper_float_28_recursive(buf + 33554432, 25);
12056     helper_float_28_recursive(buf + 67108864, 25);
12057     helper_float_28_recursive(buf + 100663296, 25);
12058     helper_float_28_recursive(buf + 134217728, 25);
12059     helper_float_28_recursive(buf + 167772160, 25);
12060     helper_float_28_recursive(buf + 201326592, 25);
12061     helper_float_28_recursive(buf + 234881024, 25);
12062     for (int j = 0; j < 268435456; j += 268435456) {
12063       for (int k = 0; k < 33554432; k += 4) {
12064         __asm__ volatile (
12065           "movups (%0), %%xmm0\n"
12066           "movups (%1), %%xmm1\n"
12067           "movups (%2), %%xmm2\n"
12068           "movups (%3), %%xmm3\n"
12069           "movups (%4), %%xmm4\n"
12070           "movups (%5), %%xmm5\n"
12071           "movups (%6), %%xmm6\n"
12072           "movups (%7), %%xmm7\n"
12073           "movaps %%xmm0, %%xmm8\n"
12074           "movaps %%xmm0, %%xmm9\n"
12075           "addps %%xmm1, %%xmm8\n"
12076           "subps %%xmm1, %%xmm9\n"
12077           "movaps %%xmm2, %%xmm10\n"
12078           "movaps %%xmm2, %%xmm11\n"
12079           "addps %%xmm3, %%xmm10\n"
12080           "subps %%xmm3, %%xmm11\n"
12081           "movaps %%xmm4, %%xmm12\n"
12082           "movaps %%xmm4, %%xmm13\n"
12083           "addps %%xmm5, %%xmm12\n"
12084           "subps %%xmm5, %%xmm13\n"
12085           "movaps %%xmm6, %%xmm14\n"
12086           "movaps %%xmm6, %%xmm15\n"
12087           "addps %%xmm7, %%xmm14\n"
12088           "subps %%xmm7, %%xmm15\n"
12089           "movaps %%xmm8, %%xmm0\n"
12090           "movaps %%xmm8, %%xmm2\n"
12091           "addps %%xmm10, %%xmm0\n"
12092           "subps %%xmm10, %%xmm2\n"
12093           "movaps %%xmm9, %%xmm1\n"
12094           "movaps %%xmm9, %%xmm3\n"
12095           "addps %%xmm11, %%xmm1\n"
12096           "subps %%xmm11, %%xmm3\n"
12097           "movaps %%xmm12, %%xmm4\n"
12098           "movaps %%xmm12, %%xmm6\n"
12099           "addps %%xmm14, %%xmm4\n"
12100           "subps %%xmm14, %%xmm6\n"
12101           "movaps %%xmm13, %%xmm5\n"
12102           "movaps %%xmm13, %%xmm7\n"
12103           "addps %%xmm15, %%xmm5\n"
12104           "subps %%xmm15, %%xmm7\n"
12105           "movaps %%xmm0, %%xmm8\n"
12106           "movaps %%xmm0, %%xmm12\n"
12107           "addps %%xmm4, %%xmm8\n"
12108           "subps %%xmm4, %%xmm12\n"
12109           "movaps %%xmm1, %%xmm9\n"
12110           "movaps %%xmm1, %%xmm13\n"
12111           "addps %%xmm5, %%xmm9\n"
12112           "subps %%xmm5, %%xmm13\n"
12113           "movaps %%xmm2, %%xmm10\n"
12114           "movaps %%xmm2, %%xmm14\n"
12115           "addps %%xmm6, %%xmm10\n"
12116           "subps %%xmm6, %%xmm14\n"
12117           "movaps %%xmm3, %%xmm11\n"
12118           "movaps %%xmm3, %%xmm15\n"
12119           "addps %%xmm7, %%xmm11\n"
12120           "subps %%xmm7, %%xmm15\n"
12121           "movups %%xmm8, (%0)\n"
12122           "movups %%xmm9, (%1)\n"
12123           "movups %%xmm10, (%2)\n"
12124           "movups %%xmm11, (%3)\n"
12125           "movups %%xmm12, (%4)\n"
12126           "movups %%xmm13, (%5)\n"
12127           "movups %%xmm14, (%6)\n"
12128           "movups %%xmm15, (%7)\n"
12129           :: "r"(buf + j + k + 0), "r"(buf + j + k + 33554432), "r"(buf + j + k + 67108864), "r"(buf + j + k + 100663296), "r"(buf + j + k + 134217728), "r"(buf + j + k + 167772160), "r"(buf + j + k + 201326592), "r"(buf + j + k + 234881024) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
12130         );
12131       }
12132     }
12133     return;
12134   }
12135 }
void helper_float_28(float *buf);
/*
 * In-place fast Hadamard transform over 2^28 contiguous floats in buf.
 * Thin public entry point: hands the full buffer to the recursive worker,
 * starting at the top-level depth (log2 of the transform length).
 */
void helper_float_28(float *buf) {
  const int log2_len = 28;
  helper_float_28_recursive(buf, log2_len);
}
12140 void helper_float_29_recursive(float *buf, int depth);
helper_float_29_recursive(float * buf,int depth)12141 void helper_float_29_recursive(float *buf, int depth) {
12142   if (depth == 12) {
12143     for (int j = 0; j < 4096; j += 32) {
12144       for (int k = 0; k < 4; k += 4) {
12145         __asm__ volatile (
12146           "movups (%0), %%xmm0\n"
12147           "movups (%1), %%xmm1\n"
12148           "movups (%2), %%xmm2\n"
12149           "movups (%3), %%xmm3\n"
12150           "movups (%4), %%xmm4\n"
12151           "movups (%5), %%xmm5\n"
12152           "movups (%6), %%xmm6\n"
12153           "movups (%7), %%xmm7\n"
12154           "movaps %%xmm0, %%xmm8\n"
12155           "shufps $160, %%xmm8, %%xmm8\n"
12156           "shufps $245, %%xmm0, %%xmm0\n"
12157           "xorps %%xmm9, %%xmm9\n"
12158           "subps %%xmm0, %%xmm9\n"
12159           "addsubps %%xmm9, %%xmm8\n"
12160           "movaps %%xmm8, %%xmm0\n"
12161           "movaps %%xmm1, %%xmm8\n"
12162           "shufps $160, %%xmm8, %%xmm8\n"
12163           "shufps $245, %%xmm1, %%xmm1\n"
12164           "xorps %%xmm9, %%xmm9\n"
12165           "subps %%xmm1, %%xmm9\n"
12166           "addsubps %%xmm9, %%xmm8\n"
12167           "movaps %%xmm8, %%xmm1\n"
12168           "movaps %%xmm2, %%xmm8\n"
12169           "shufps $160, %%xmm8, %%xmm8\n"
12170           "shufps $245, %%xmm2, %%xmm2\n"
12171           "xorps %%xmm9, %%xmm9\n"
12172           "subps %%xmm2, %%xmm9\n"
12173           "addsubps %%xmm9, %%xmm8\n"
12174           "movaps %%xmm8, %%xmm2\n"
12175           "movaps %%xmm3, %%xmm8\n"
12176           "shufps $160, %%xmm8, %%xmm8\n"
12177           "shufps $245, %%xmm3, %%xmm3\n"
12178           "xorps %%xmm9, %%xmm9\n"
12179           "subps %%xmm3, %%xmm9\n"
12180           "addsubps %%xmm9, %%xmm8\n"
12181           "movaps %%xmm8, %%xmm3\n"
12182           "movaps %%xmm4, %%xmm8\n"
12183           "shufps $160, %%xmm8, %%xmm8\n"
12184           "shufps $245, %%xmm4, %%xmm4\n"
12185           "xorps %%xmm9, %%xmm9\n"
12186           "subps %%xmm4, %%xmm9\n"
12187           "addsubps %%xmm9, %%xmm8\n"
12188           "movaps %%xmm8, %%xmm4\n"
12189           "movaps %%xmm5, %%xmm8\n"
12190           "shufps $160, %%xmm8, %%xmm8\n"
12191           "shufps $245, %%xmm5, %%xmm5\n"
12192           "xorps %%xmm9, %%xmm9\n"
12193           "subps %%xmm5, %%xmm9\n"
12194           "addsubps %%xmm9, %%xmm8\n"
12195           "movaps %%xmm8, %%xmm5\n"
12196           "movaps %%xmm6, %%xmm8\n"
12197           "shufps $160, %%xmm8, %%xmm8\n"
12198           "shufps $245, %%xmm6, %%xmm6\n"
12199           "xorps %%xmm9, %%xmm9\n"
12200           "subps %%xmm6, %%xmm9\n"
12201           "addsubps %%xmm9, %%xmm8\n"
12202           "movaps %%xmm8, %%xmm6\n"
12203           "movaps %%xmm7, %%xmm8\n"
12204           "shufps $160, %%xmm8, %%xmm8\n"
12205           "shufps $245, %%xmm7, %%xmm7\n"
12206           "xorps %%xmm9, %%xmm9\n"
12207           "subps %%xmm7, %%xmm9\n"
12208           "addsubps %%xmm9, %%xmm8\n"
12209           "movaps %%xmm8, %%xmm7\n"
12210           "movaps %%xmm0, %%xmm8\n"
12211           "shufps $68, %%xmm8, %%xmm8\n"
12212           "xorps %%xmm9, %%xmm9\n"
12213           "movaps %%xmm0, %%xmm10\n"
12214           "shufps $14, %%xmm9, %%xmm10\n"
12215           "movaps %%xmm0, %%xmm11\n"
12216           "shufps $224, %%xmm11, %%xmm9\n"
12217           "addps %%xmm8, %%xmm10\n"
12218           "subps %%xmm9, %%xmm10\n"
12219           "movaps %%xmm10, %%xmm0\n"
12220           "movaps %%xmm1, %%xmm8\n"
12221           "shufps $68, %%xmm8, %%xmm8\n"
12222           "xorps %%xmm9, %%xmm9\n"
12223           "movaps %%xmm1, %%xmm10\n"
12224           "shufps $14, %%xmm9, %%xmm10\n"
12225           "movaps %%xmm1, %%xmm11\n"
12226           "shufps $224, %%xmm11, %%xmm9\n"
12227           "addps %%xmm8, %%xmm10\n"
12228           "subps %%xmm9, %%xmm10\n"
12229           "movaps %%xmm10, %%xmm1\n"
12230           "movaps %%xmm2, %%xmm8\n"
12231           "shufps $68, %%xmm8, %%xmm8\n"
12232           "xorps %%xmm9, %%xmm9\n"
12233           "movaps %%xmm2, %%xmm10\n"
12234           "shufps $14, %%xmm9, %%xmm10\n"
12235           "movaps %%xmm2, %%xmm11\n"
12236           "shufps $224, %%xmm11, %%xmm9\n"
12237           "addps %%xmm8, %%xmm10\n"
12238           "subps %%xmm9, %%xmm10\n"
12239           "movaps %%xmm10, %%xmm2\n"
12240           "movaps %%xmm3, %%xmm8\n"
12241           "shufps $68, %%xmm8, %%xmm8\n"
12242           "xorps %%xmm9, %%xmm9\n"
12243           "movaps %%xmm3, %%xmm10\n"
12244           "shufps $14, %%xmm9, %%xmm10\n"
12245           "movaps %%xmm3, %%xmm11\n"
12246           "shufps $224, %%xmm11, %%xmm9\n"
12247           "addps %%xmm8, %%xmm10\n"
12248           "subps %%xmm9, %%xmm10\n"
12249           "movaps %%xmm10, %%xmm3\n"
12250           "movaps %%xmm4, %%xmm8\n"
12251           "shufps $68, %%xmm8, %%xmm8\n"
12252           "xorps %%xmm9, %%xmm9\n"
12253           "movaps %%xmm4, %%xmm10\n"
12254           "shufps $14, %%xmm9, %%xmm10\n"
12255           "movaps %%xmm4, %%xmm11\n"
12256           "shufps $224, %%xmm11, %%xmm9\n"
12257           "addps %%xmm8, %%xmm10\n"
12258           "subps %%xmm9, %%xmm10\n"
12259           "movaps %%xmm10, %%xmm4\n"
12260           "movaps %%xmm5, %%xmm8\n"
12261           "shufps $68, %%xmm8, %%xmm8\n"
12262           "xorps %%xmm9, %%xmm9\n"
12263           "movaps %%xmm5, %%xmm10\n"
12264           "shufps $14, %%xmm9, %%xmm10\n"
12265           "movaps %%xmm5, %%xmm11\n"
12266           "shufps $224, %%xmm11, %%xmm9\n"
12267           "addps %%xmm8, %%xmm10\n"
12268           "subps %%xmm9, %%xmm10\n"
12269           "movaps %%xmm10, %%xmm5\n"
12270           "movaps %%xmm6, %%xmm8\n"
12271           "shufps $68, %%xmm8, %%xmm8\n"
12272           "xorps %%xmm9, %%xmm9\n"
12273           "movaps %%xmm6, %%xmm10\n"
12274           "shufps $14, %%xmm9, %%xmm10\n"
12275           "movaps %%xmm6, %%xmm11\n"
12276           "shufps $224, %%xmm11, %%xmm9\n"
12277           "addps %%xmm8, %%xmm10\n"
12278           "subps %%xmm9, %%xmm10\n"
12279           "movaps %%xmm10, %%xmm6\n"
12280           "movaps %%xmm7, %%xmm8\n"
12281           "shufps $68, %%xmm8, %%xmm8\n"
12282           "xorps %%xmm9, %%xmm9\n"
12283           "movaps %%xmm7, %%xmm10\n"
12284           "shufps $14, %%xmm9, %%xmm10\n"
12285           "movaps %%xmm7, %%xmm11\n"
12286           "shufps $224, %%xmm11, %%xmm9\n"
12287           "addps %%xmm8, %%xmm10\n"
12288           "subps %%xmm9, %%xmm10\n"
12289           "movaps %%xmm10, %%xmm7\n"
12290           "movaps %%xmm0, %%xmm8\n"
12291           "movaps %%xmm0, %%xmm9\n"
12292           "addps %%xmm1, %%xmm8\n"
12293           "subps %%xmm1, %%xmm9\n"
12294           "movaps %%xmm2, %%xmm10\n"
12295           "movaps %%xmm2, %%xmm11\n"
12296           "addps %%xmm3, %%xmm10\n"
12297           "subps %%xmm3, %%xmm11\n"
12298           "movaps %%xmm4, %%xmm12\n"
12299           "movaps %%xmm4, %%xmm13\n"
12300           "addps %%xmm5, %%xmm12\n"
12301           "subps %%xmm5, %%xmm13\n"
12302           "movaps %%xmm6, %%xmm14\n"
12303           "movaps %%xmm6, %%xmm15\n"
12304           "addps %%xmm7, %%xmm14\n"
12305           "subps %%xmm7, %%xmm15\n"
12306           "movaps %%xmm8, %%xmm0\n"
12307           "movaps %%xmm8, %%xmm2\n"
12308           "addps %%xmm10, %%xmm0\n"
12309           "subps %%xmm10, %%xmm2\n"
12310           "movaps %%xmm9, %%xmm1\n"
12311           "movaps %%xmm9, %%xmm3\n"
12312           "addps %%xmm11, %%xmm1\n"
12313           "subps %%xmm11, %%xmm3\n"
12314           "movaps %%xmm12, %%xmm4\n"
12315           "movaps %%xmm12, %%xmm6\n"
12316           "addps %%xmm14, %%xmm4\n"
12317           "subps %%xmm14, %%xmm6\n"
12318           "movaps %%xmm13, %%xmm5\n"
12319           "movaps %%xmm13, %%xmm7\n"
12320           "addps %%xmm15, %%xmm5\n"
12321           "subps %%xmm15, %%xmm7\n"
12322           "movaps %%xmm0, %%xmm8\n"
12323           "movaps %%xmm0, %%xmm12\n"
12324           "addps %%xmm4, %%xmm8\n"
12325           "subps %%xmm4, %%xmm12\n"
12326           "movaps %%xmm1, %%xmm9\n"
12327           "movaps %%xmm1, %%xmm13\n"
12328           "addps %%xmm5, %%xmm9\n"
12329           "subps %%xmm5, %%xmm13\n"
12330           "movaps %%xmm2, %%xmm10\n"
12331           "movaps %%xmm2, %%xmm14\n"
12332           "addps %%xmm6, %%xmm10\n"
12333           "subps %%xmm6, %%xmm14\n"
12334           "movaps %%xmm3, %%xmm11\n"
12335           "movaps %%xmm3, %%xmm15\n"
12336           "addps %%xmm7, %%xmm11\n"
12337           "subps %%xmm7, %%xmm15\n"
12338           "movups %%xmm8, (%0)\n"
12339           "movups %%xmm9, (%1)\n"
12340           "movups %%xmm10, (%2)\n"
12341           "movups %%xmm11, (%3)\n"
12342           "movups %%xmm12, (%4)\n"
12343           "movups %%xmm13, (%5)\n"
12344           "movups %%xmm14, (%6)\n"
12345           "movups %%xmm15, (%7)\n"
12346           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
12347         );
12348       }
12349     }
12350     for (int j = 0; j < 4096; j += 256) {
12351       for (int k = 0; k < 32; k += 4) {
12352         __asm__ volatile (
12353           "movups (%0), %%xmm0\n"
12354           "movups (%1), %%xmm1\n"
12355           "movups (%2), %%xmm2\n"
12356           "movups (%3), %%xmm3\n"
12357           "movups (%4), %%xmm4\n"
12358           "movups (%5), %%xmm5\n"
12359           "movups (%6), %%xmm6\n"
12360           "movups (%7), %%xmm7\n"
12361           "movaps %%xmm0, %%xmm8\n"
12362           "movaps %%xmm0, %%xmm9\n"
12363           "addps %%xmm1, %%xmm8\n"
12364           "subps %%xmm1, %%xmm9\n"
12365           "movaps %%xmm2, %%xmm10\n"
12366           "movaps %%xmm2, %%xmm11\n"
12367           "addps %%xmm3, %%xmm10\n"
12368           "subps %%xmm3, %%xmm11\n"
12369           "movaps %%xmm4, %%xmm12\n"
12370           "movaps %%xmm4, %%xmm13\n"
12371           "addps %%xmm5, %%xmm12\n"
12372           "subps %%xmm5, %%xmm13\n"
12373           "movaps %%xmm6, %%xmm14\n"
12374           "movaps %%xmm6, %%xmm15\n"
12375           "addps %%xmm7, %%xmm14\n"
12376           "subps %%xmm7, %%xmm15\n"
12377           "movaps %%xmm8, %%xmm0\n"
12378           "movaps %%xmm8, %%xmm2\n"
12379           "addps %%xmm10, %%xmm0\n"
12380           "subps %%xmm10, %%xmm2\n"
12381           "movaps %%xmm9, %%xmm1\n"
12382           "movaps %%xmm9, %%xmm3\n"
12383           "addps %%xmm11, %%xmm1\n"
12384           "subps %%xmm11, %%xmm3\n"
12385           "movaps %%xmm12, %%xmm4\n"
12386           "movaps %%xmm12, %%xmm6\n"
12387           "addps %%xmm14, %%xmm4\n"
12388           "subps %%xmm14, %%xmm6\n"
12389           "movaps %%xmm13, %%xmm5\n"
12390           "movaps %%xmm13, %%xmm7\n"
12391           "addps %%xmm15, %%xmm5\n"
12392           "subps %%xmm15, %%xmm7\n"
12393           "movaps %%xmm0, %%xmm8\n"
12394           "movaps %%xmm0, %%xmm12\n"
12395           "addps %%xmm4, %%xmm8\n"
12396           "subps %%xmm4, %%xmm12\n"
12397           "movaps %%xmm1, %%xmm9\n"
12398           "movaps %%xmm1, %%xmm13\n"
12399           "addps %%xmm5, %%xmm9\n"
12400           "subps %%xmm5, %%xmm13\n"
12401           "movaps %%xmm2, %%xmm10\n"
12402           "movaps %%xmm2, %%xmm14\n"
12403           "addps %%xmm6, %%xmm10\n"
12404           "subps %%xmm6, %%xmm14\n"
12405           "movaps %%xmm3, %%xmm11\n"
12406           "movaps %%xmm3, %%xmm15\n"
12407           "addps %%xmm7, %%xmm11\n"
12408           "subps %%xmm7, %%xmm15\n"
12409           "movups %%xmm8, (%0)\n"
12410           "movups %%xmm9, (%1)\n"
12411           "movups %%xmm10, (%2)\n"
12412           "movups %%xmm11, (%3)\n"
12413           "movups %%xmm12, (%4)\n"
12414           "movups %%xmm13, (%5)\n"
12415           "movups %%xmm14, (%6)\n"
12416           "movups %%xmm15, (%7)\n"
12417           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
12418         );
12419       }
12420     }
12421     for (int j = 0; j < 4096; j += 2048) {
12422       for (int k = 0; k < 256; k += 4) {
12423         __asm__ volatile (
12424           "movups (%0), %%xmm0\n"
12425           "movups (%1), %%xmm1\n"
12426           "movups (%2), %%xmm2\n"
12427           "movups (%3), %%xmm3\n"
12428           "movups (%4), %%xmm4\n"
12429           "movups (%5), %%xmm5\n"
12430           "movups (%6), %%xmm6\n"
12431           "movups (%7), %%xmm7\n"
12432           "movaps %%xmm0, %%xmm8\n"
12433           "movaps %%xmm0, %%xmm9\n"
12434           "addps %%xmm1, %%xmm8\n"
12435           "subps %%xmm1, %%xmm9\n"
12436           "movaps %%xmm2, %%xmm10\n"
12437           "movaps %%xmm2, %%xmm11\n"
12438           "addps %%xmm3, %%xmm10\n"
12439           "subps %%xmm3, %%xmm11\n"
12440           "movaps %%xmm4, %%xmm12\n"
12441           "movaps %%xmm4, %%xmm13\n"
12442           "addps %%xmm5, %%xmm12\n"
12443           "subps %%xmm5, %%xmm13\n"
12444           "movaps %%xmm6, %%xmm14\n"
12445           "movaps %%xmm6, %%xmm15\n"
12446           "addps %%xmm7, %%xmm14\n"
12447           "subps %%xmm7, %%xmm15\n"
12448           "movaps %%xmm8, %%xmm0\n"
12449           "movaps %%xmm8, %%xmm2\n"
12450           "addps %%xmm10, %%xmm0\n"
12451           "subps %%xmm10, %%xmm2\n"
12452           "movaps %%xmm9, %%xmm1\n"
12453           "movaps %%xmm9, %%xmm3\n"
12454           "addps %%xmm11, %%xmm1\n"
12455           "subps %%xmm11, %%xmm3\n"
12456           "movaps %%xmm12, %%xmm4\n"
12457           "movaps %%xmm12, %%xmm6\n"
12458           "addps %%xmm14, %%xmm4\n"
12459           "subps %%xmm14, %%xmm6\n"
12460           "movaps %%xmm13, %%xmm5\n"
12461           "movaps %%xmm13, %%xmm7\n"
12462           "addps %%xmm15, %%xmm5\n"
12463           "subps %%xmm15, %%xmm7\n"
12464           "movaps %%xmm0, %%xmm8\n"
12465           "movaps %%xmm0, %%xmm12\n"
12466           "addps %%xmm4, %%xmm8\n"
12467           "subps %%xmm4, %%xmm12\n"
12468           "movaps %%xmm1, %%xmm9\n"
12469           "movaps %%xmm1, %%xmm13\n"
12470           "addps %%xmm5, %%xmm9\n"
12471           "subps %%xmm5, %%xmm13\n"
12472           "movaps %%xmm2, %%xmm10\n"
12473           "movaps %%xmm2, %%xmm14\n"
12474           "addps %%xmm6, %%xmm10\n"
12475           "subps %%xmm6, %%xmm14\n"
12476           "movaps %%xmm3, %%xmm11\n"
12477           "movaps %%xmm3, %%xmm15\n"
12478           "addps %%xmm7, %%xmm11\n"
12479           "subps %%xmm7, %%xmm15\n"
12480           "movups %%xmm8, (%0)\n"
12481           "movups %%xmm9, (%1)\n"
12482           "movups %%xmm10, (%2)\n"
12483           "movups %%xmm11, (%3)\n"
12484           "movups %%xmm12, (%4)\n"
12485           "movups %%xmm13, (%5)\n"
12486           "movups %%xmm14, (%6)\n"
12487           "movups %%xmm15, (%7)\n"
12488           :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
12489         );
12490       }
12491     }
12492     for (int j = 0; j < 4096; j += 4096) {
12493       for (int k = 0; k < 2048; k += 4) {
12494         __asm__ volatile (
12495           "movups (%0), %%xmm0\n"
12496           "movups (%1), %%xmm1\n"
12497           "movaps %%xmm0, %%xmm8\n"
12498           "movaps %%xmm0, %%xmm9\n"
12499           "addps %%xmm1, %%xmm8\n"
12500           "subps %%xmm1, %%xmm9\n"
12501           "movups %%xmm8, (%0)\n"
12502           "movups %%xmm9, (%1)\n"
12503           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
12504         );
12505       }
12506     }
12507     return;
12508   }
12509   if (depth == 15) {
12510     helper_float_29_recursive(buf + 0, 12);
12511     helper_float_29_recursive(buf + 4096, 12);
12512     helper_float_29_recursive(buf + 8192, 12);
12513     helper_float_29_recursive(buf + 12288, 12);
12514     helper_float_29_recursive(buf + 16384, 12);
12515     helper_float_29_recursive(buf + 20480, 12);
12516     helper_float_29_recursive(buf + 24576, 12);
12517     helper_float_29_recursive(buf + 28672, 12);
12518     for (int j = 0; j < 32768; j += 32768) {
12519       for (int k = 0; k < 4096; k += 4) {
12520         __asm__ volatile (
12521           "movups (%0), %%xmm0\n"
12522           "movups (%1), %%xmm1\n"
12523           "movups (%2), %%xmm2\n"
12524           "movups (%3), %%xmm3\n"
12525           "movups (%4), %%xmm4\n"
12526           "movups (%5), %%xmm5\n"
12527           "movups (%6), %%xmm6\n"
12528           "movups (%7), %%xmm7\n"
12529           "movaps %%xmm0, %%xmm8\n"
12530           "movaps %%xmm0, %%xmm9\n"
12531           "addps %%xmm1, %%xmm8\n"
12532           "subps %%xmm1, %%xmm9\n"
12533           "movaps %%xmm2, %%xmm10\n"
12534           "movaps %%xmm2, %%xmm11\n"
12535           "addps %%xmm3, %%xmm10\n"
12536           "subps %%xmm3, %%xmm11\n"
12537           "movaps %%xmm4, %%xmm12\n"
12538           "movaps %%xmm4, %%xmm13\n"
12539           "addps %%xmm5, %%xmm12\n"
12540           "subps %%xmm5, %%xmm13\n"
12541           "movaps %%xmm6, %%xmm14\n"
12542           "movaps %%xmm6, %%xmm15\n"
12543           "addps %%xmm7, %%xmm14\n"
12544           "subps %%xmm7, %%xmm15\n"
12545           "movaps %%xmm8, %%xmm0\n"
12546           "movaps %%xmm8, %%xmm2\n"
12547           "addps %%xmm10, %%xmm0\n"
12548           "subps %%xmm10, %%xmm2\n"
12549           "movaps %%xmm9, %%xmm1\n"
12550           "movaps %%xmm9, %%xmm3\n"
12551           "addps %%xmm11, %%xmm1\n"
12552           "subps %%xmm11, %%xmm3\n"
12553           "movaps %%xmm12, %%xmm4\n"
12554           "movaps %%xmm12, %%xmm6\n"
12555           "addps %%xmm14, %%xmm4\n"
12556           "subps %%xmm14, %%xmm6\n"
12557           "movaps %%xmm13, %%xmm5\n"
12558           "movaps %%xmm13, %%xmm7\n"
12559           "addps %%xmm15, %%xmm5\n"
12560           "subps %%xmm15, %%xmm7\n"
12561           "movaps %%xmm0, %%xmm8\n"
12562           "movaps %%xmm0, %%xmm12\n"
12563           "addps %%xmm4, %%xmm8\n"
12564           "subps %%xmm4, %%xmm12\n"
12565           "movaps %%xmm1, %%xmm9\n"
12566           "movaps %%xmm1, %%xmm13\n"
12567           "addps %%xmm5, %%xmm9\n"
12568           "subps %%xmm5, %%xmm13\n"
12569           "movaps %%xmm2, %%xmm10\n"
12570           "movaps %%xmm2, %%xmm14\n"
12571           "addps %%xmm6, %%xmm10\n"
12572           "subps %%xmm6, %%xmm14\n"
12573           "movaps %%xmm3, %%xmm11\n"
12574           "movaps %%xmm3, %%xmm15\n"
12575           "addps %%xmm7, %%xmm11\n"
12576           "subps %%xmm7, %%xmm15\n"
12577           "movups %%xmm8, (%0)\n"
12578           "movups %%xmm9, (%1)\n"
12579           "movups %%xmm10, (%2)\n"
12580           "movups %%xmm11, (%3)\n"
12581           "movups %%xmm12, (%4)\n"
12582           "movups %%xmm13, (%5)\n"
12583           "movups %%xmm14, (%6)\n"
12584           "movups %%xmm15, (%7)\n"
12585           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
12586         );
12587       }
12588     }
12589     return;
12590   }
12591   if (depth == 18) {
12592     helper_float_29_recursive(buf + 0, 15);
12593     helper_float_29_recursive(buf + 32768, 15);
12594     helper_float_29_recursive(buf + 65536, 15);
12595     helper_float_29_recursive(buf + 98304, 15);
12596     helper_float_29_recursive(buf + 131072, 15);
12597     helper_float_29_recursive(buf + 163840, 15);
12598     helper_float_29_recursive(buf + 196608, 15);
12599     helper_float_29_recursive(buf + 229376, 15);
12600     for (int j = 0; j < 262144; j += 262144) {
12601       for (int k = 0; k < 32768; k += 4) {
12602         __asm__ volatile (
12603           "movups (%0), %%xmm0\n"
12604           "movups (%1), %%xmm1\n"
12605           "movups (%2), %%xmm2\n"
12606           "movups (%3), %%xmm3\n"
12607           "movups (%4), %%xmm4\n"
12608           "movups (%5), %%xmm5\n"
12609           "movups (%6), %%xmm6\n"
12610           "movups (%7), %%xmm7\n"
12611           "movaps %%xmm0, %%xmm8\n"
12612           "movaps %%xmm0, %%xmm9\n"
12613           "addps %%xmm1, %%xmm8\n"
12614           "subps %%xmm1, %%xmm9\n"
12615           "movaps %%xmm2, %%xmm10\n"
12616           "movaps %%xmm2, %%xmm11\n"
12617           "addps %%xmm3, %%xmm10\n"
12618           "subps %%xmm3, %%xmm11\n"
12619           "movaps %%xmm4, %%xmm12\n"
12620           "movaps %%xmm4, %%xmm13\n"
12621           "addps %%xmm5, %%xmm12\n"
12622           "subps %%xmm5, %%xmm13\n"
12623           "movaps %%xmm6, %%xmm14\n"
12624           "movaps %%xmm6, %%xmm15\n"
12625           "addps %%xmm7, %%xmm14\n"
12626           "subps %%xmm7, %%xmm15\n"
12627           "movaps %%xmm8, %%xmm0\n"
12628           "movaps %%xmm8, %%xmm2\n"
12629           "addps %%xmm10, %%xmm0\n"
12630           "subps %%xmm10, %%xmm2\n"
12631           "movaps %%xmm9, %%xmm1\n"
12632           "movaps %%xmm9, %%xmm3\n"
12633           "addps %%xmm11, %%xmm1\n"
12634           "subps %%xmm11, %%xmm3\n"
12635           "movaps %%xmm12, %%xmm4\n"
12636           "movaps %%xmm12, %%xmm6\n"
12637           "addps %%xmm14, %%xmm4\n"
12638           "subps %%xmm14, %%xmm6\n"
12639           "movaps %%xmm13, %%xmm5\n"
12640           "movaps %%xmm13, %%xmm7\n"
12641           "addps %%xmm15, %%xmm5\n"
12642           "subps %%xmm15, %%xmm7\n"
12643           "movaps %%xmm0, %%xmm8\n"
12644           "movaps %%xmm0, %%xmm12\n"
12645           "addps %%xmm4, %%xmm8\n"
12646           "subps %%xmm4, %%xmm12\n"
12647           "movaps %%xmm1, %%xmm9\n"
12648           "movaps %%xmm1, %%xmm13\n"
12649           "addps %%xmm5, %%xmm9\n"
12650           "subps %%xmm5, %%xmm13\n"
12651           "movaps %%xmm2, %%xmm10\n"
12652           "movaps %%xmm2, %%xmm14\n"
12653           "addps %%xmm6, %%xmm10\n"
12654           "subps %%xmm6, %%xmm14\n"
12655           "movaps %%xmm3, %%xmm11\n"
12656           "movaps %%xmm3, %%xmm15\n"
12657           "addps %%xmm7, %%xmm11\n"
12658           "subps %%xmm7, %%xmm15\n"
12659           "movups %%xmm8, (%0)\n"
12660           "movups %%xmm9, (%1)\n"
12661           "movups %%xmm10, (%2)\n"
12662           "movups %%xmm11, (%3)\n"
12663           "movups %%xmm12, (%4)\n"
12664           "movups %%xmm13, (%5)\n"
12665           "movups %%xmm14, (%6)\n"
12666           "movups %%xmm15, (%7)\n"
12667           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
12668         );
12669       }
12670     }
12671     return;
12672   }
12673   if (depth == 21) {
12674     helper_float_29_recursive(buf + 0, 18);
12675     helper_float_29_recursive(buf + 262144, 18);
12676     helper_float_29_recursive(buf + 524288, 18);
12677     helper_float_29_recursive(buf + 786432, 18);
12678     helper_float_29_recursive(buf + 1048576, 18);
12679     helper_float_29_recursive(buf + 1310720, 18);
12680     helper_float_29_recursive(buf + 1572864, 18);
12681     helper_float_29_recursive(buf + 1835008, 18);
12682     for (int j = 0; j < 2097152; j += 2097152) {
12683       for (int k = 0; k < 262144; k += 4) {
12684         __asm__ volatile (
12685           "movups (%0), %%xmm0\n"
12686           "movups (%1), %%xmm1\n"
12687           "movups (%2), %%xmm2\n"
12688           "movups (%3), %%xmm3\n"
12689           "movups (%4), %%xmm4\n"
12690           "movups (%5), %%xmm5\n"
12691           "movups (%6), %%xmm6\n"
12692           "movups (%7), %%xmm7\n"
12693           "movaps %%xmm0, %%xmm8\n"
12694           "movaps %%xmm0, %%xmm9\n"
12695           "addps %%xmm1, %%xmm8\n"
12696           "subps %%xmm1, %%xmm9\n"
12697           "movaps %%xmm2, %%xmm10\n"
12698           "movaps %%xmm2, %%xmm11\n"
12699           "addps %%xmm3, %%xmm10\n"
12700           "subps %%xmm3, %%xmm11\n"
12701           "movaps %%xmm4, %%xmm12\n"
12702           "movaps %%xmm4, %%xmm13\n"
12703           "addps %%xmm5, %%xmm12\n"
12704           "subps %%xmm5, %%xmm13\n"
12705           "movaps %%xmm6, %%xmm14\n"
12706           "movaps %%xmm6, %%xmm15\n"
12707           "addps %%xmm7, %%xmm14\n"
12708           "subps %%xmm7, %%xmm15\n"
12709           "movaps %%xmm8, %%xmm0\n"
12710           "movaps %%xmm8, %%xmm2\n"
12711           "addps %%xmm10, %%xmm0\n"
12712           "subps %%xmm10, %%xmm2\n"
12713           "movaps %%xmm9, %%xmm1\n"
12714           "movaps %%xmm9, %%xmm3\n"
12715           "addps %%xmm11, %%xmm1\n"
12716           "subps %%xmm11, %%xmm3\n"
12717           "movaps %%xmm12, %%xmm4\n"
12718           "movaps %%xmm12, %%xmm6\n"
12719           "addps %%xmm14, %%xmm4\n"
12720           "subps %%xmm14, %%xmm6\n"
12721           "movaps %%xmm13, %%xmm5\n"
12722           "movaps %%xmm13, %%xmm7\n"
12723           "addps %%xmm15, %%xmm5\n"
12724           "subps %%xmm15, %%xmm7\n"
12725           "movaps %%xmm0, %%xmm8\n"
12726           "movaps %%xmm0, %%xmm12\n"
12727           "addps %%xmm4, %%xmm8\n"
12728           "subps %%xmm4, %%xmm12\n"
12729           "movaps %%xmm1, %%xmm9\n"
12730           "movaps %%xmm1, %%xmm13\n"
12731           "addps %%xmm5, %%xmm9\n"
12732           "subps %%xmm5, %%xmm13\n"
12733           "movaps %%xmm2, %%xmm10\n"
12734           "movaps %%xmm2, %%xmm14\n"
12735           "addps %%xmm6, %%xmm10\n"
12736           "subps %%xmm6, %%xmm14\n"
12737           "movaps %%xmm3, %%xmm11\n"
12738           "movaps %%xmm3, %%xmm15\n"
12739           "addps %%xmm7, %%xmm11\n"
12740           "subps %%xmm7, %%xmm15\n"
12741           "movups %%xmm8, (%0)\n"
12742           "movups %%xmm9, (%1)\n"
12743           "movups %%xmm10, (%2)\n"
12744           "movups %%xmm11, (%3)\n"
12745           "movups %%xmm12, (%4)\n"
12746           "movups %%xmm13, (%5)\n"
12747           "movups %%xmm14, (%6)\n"
12748           "movups %%xmm15, (%7)\n"
12749           :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
12750         );
12751       }
12752     }
12753     return;
12754   }
12755   if (depth == 24) {
12756     helper_float_29_recursive(buf + 0, 21);
12757     helper_float_29_recursive(buf + 2097152, 21);
12758     helper_float_29_recursive(buf + 4194304, 21);
12759     helper_float_29_recursive(buf + 6291456, 21);
12760     helper_float_29_recursive(buf + 8388608, 21);
12761     helper_float_29_recursive(buf + 10485760, 21);
12762     helper_float_29_recursive(buf + 12582912, 21);
12763     helper_float_29_recursive(buf + 14680064, 21);
12764     for (int j = 0; j < 16777216; j += 16777216) {
12765       for (int k = 0; k < 2097152; k += 4) {
12766         __asm__ volatile (
12767           "movups (%0), %%xmm0\n"
12768           "movups (%1), %%xmm1\n"
12769           "movups (%2), %%xmm2\n"
12770           "movups (%3), %%xmm3\n"
12771           "movups (%4), %%xmm4\n"
12772           "movups (%5), %%xmm5\n"
12773           "movups (%6), %%xmm6\n"
12774           "movups (%7), %%xmm7\n"
12775           "movaps %%xmm0, %%xmm8\n"
12776           "movaps %%xmm0, %%xmm9\n"
12777           "addps %%xmm1, %%xmm8\n"
12778           "subps %%xmm1, %%xmm9\n"
12779           "movaps %%xmm2, %%xmm10\n"
12780           "movaps %%xmm2, %%xmm11\n"
12781           "addps %%xmm3, %%xmm10\n"
12782           "subps %%xmm3, %%xmm11\n"
12783           "movaps %%xmm4, %%xmm12\n"
12784           "movaps %%xmm4, %%xmm13\n"
12785           "addps %%xmm5, %%xmm12\n"
12786           "subps %%xmm5, %%xmm13\n"
12787           "movaps %%xmm6, %%xmm14\n"
12788           "movaps %%xmm6, %%xmm15\n"
12789           "addps %%xmm7, %%xmm14\n"
12790           "subps %%xmm7, %%xmm15\n"
12791           "movaps %%xmm8, %%xmm0\n"
12792           "movaps %%xmm8, %%xmm2\n"
12793           "addps %%xmm10, %%xmm0\n"
12794           "subps %%xmm10, %%xmm2\n"
12795           "movaps %%xmm9, %%xmm1\n"
12796           "movaps %%xmm9, %%xmm3\n"
12797           "addps %%xmm11, %%xmm1\n"
12798           "subps %%xmm11, %%xmm3\n"
12799           "movaps %%xmm12, %%xmm4\n"
12800           "movaps %%xmm12, %%xmm6\n"
12801           "addps %%xmm14, %%xmm4\n"
12802           "subps %%xmm14, %%xmm6\n"
12803           "movaps %%xmm13, %%xmm5\n"
12804           "movaps %%xmm13, %%xmm7\n"
12805           "addps %%xmm15, %%xmm5\n"
12806           "subps %%xmm15, %%xmm7\n"
12807           "movaps %%xmm0, %%xmm8\n"
12808           "movaps %%xmm0, %%xmm12\n"
12809           "addps %%xmm4, %%xmm8\n"
12810           "subps %%xmm4, %%xmm12\n"
12811           "movaps %%xmm1, %%xmm9\n"
12812           "movaps %%xmm1, %%xmm13\n"
12813           "addps %%xmm5, %%xmm9\n"
12814           "subps %%xmm5, %%xmm13\n"
12815           "movaps %%xmm2, %%xmm10\n"
12816           "movaps %%xmm2, %%xmm14\n"
12817           "addps %%xmm6, %%xmm10\n"
12818           "subps %%xmm6, %%xmm14\n"
12819           "movaps %%xmm3, %%xmm11\n"
12820           "movaps %%xmm3, %%xmm15\n"
12821           "addps %%xmm7, %%xmm11\n"
12822           "subps %%xmm7, %%xmm15\n"
12823           "movups %%xmm8, (%0)\n"
12824           "movups %%xmm9, (%1)\n"
12825           "movups %%xmm10, (%2)\n"
12826           "movups %%xmm11, (%3)\n"
12827           "movups %%xmm12, (%4)\n"
12828           "movups %%xmm13, (%5)\n"
12829           "movups %%xmm14, (%6)\n"
12830           "movups %%xmm15, (%7)\n"
12831           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
12832         );
12833       }
12834     }
12835     return;
12836   }
12837   if (depth == 27) {
12838     helper_float_29_recursive(buf + 0, 24);
12839     helper_float_29_recursive(buf + 16777216, 24);
12840     helper_float_29_recursive(buf + 33554432, 24);
12841     helper_float_29_recursive(buf + 50331648, 24);
12842     helper_float_29_recursive(buf + 67108864, 24);
12843     helper_float_29_recursive(buf + 83886080, 24);
12844     helper_float_29_recursive(buf + 100663296, 24);
12845     helper_float_29_recursive(buf + 117440512, 24);
12846     for (int j = 0; j < 134217728; j += 134217728) {
12847       for (int k = 0; k < 16777216; k += 4) {
12848         __asm__ volatile (
12849           "movups (%0), %%xmm0\n"
12850           "movups (%1), %%xmm1\n"
12851           "movups (%2), %%xmm2\n"
12852           "movups (%3), %%xmm3\n"
12853           "movups (%4), %%xmm4\n"
12854           "movups (%5), %%xmm5\n"
12855           "movups (%6), %%xmm6\n"
12856           "movups (%7), %%xmm7\n"
12857           "movaps %%xmm0, %%xmm8\n"
12858           "movaps %%xmm0, %%xmm9\n"
12859           "addps %%xmm1, %%xmm8\n"
12860           "subps %%xmm1, %%xmm9\n"
12861           "movaps %%xmm2, %%xmm10\n"
12862           "movaps %%xmm2, %%xmm11\n"
12863           "addps %%xmm3, %%xmm10\n"
12864           "subps %%xmm3, %%xmm11\n"
12865           "movaps %%xmm4, %%xmm12\n"
12866           "movaps %%xmm4, %%xmm13\n"
12867           "addps %%xmm5, %%xmm12\n"
12868           "subps %%xmm5, %%xmm13\n"
12869           "movaps %%xmm6, %%xmm14\n"
12870           "movaps %%xmm6, %%xmm15\n"
12871           "addps %%xmm7, %%xmm14\n"
12872           "subps %%xmm7, %%xmm15\n"
12873           "movaps %%xmm8, %%xmm0\n"
12874           "movaps %%xmm8, %%xmm2\n"
12875           "addps %%xmm10, %%xmm0\n"
12876           "subps %%xmm10, %%xmm2\n"
12877           "movaps %%xmm9, %%xmm1\n"
12878           "movaps %%xmm9, %%xmm3\n"
12879           "addps %%xmm11, %%xmm1\n"
12880           "subps %%xmm11, %%xmm3\n"
12881           "movaps %%xmm12, %%xmm4\n"
12882           "movaps %%xmm12, %%xmm6\n"
12883           "addps %%xmm14, %%xmm4\n"
12884           "subps %%xmm14, %%xmm6\n"
12885           "movaps %%xmm13, %%xmm5\n"
12886           "movaps %%xmm13, %%xmm7\n"
12887           "addps %%xmm15, %%xmm5\n"
12888           "subps %%xmm15, %%xmm7\n"
12889           "movaps %%xmm0, %%xmm8\n"
12890           "movaps %%xmm0, %%xmm12\n"
12891           "addps %%xmm4, %%xmm8\n"
12892           "subps %%xmm4, %%xmm12\n"
12893           "movaps %%xmm1, %%xmm9\n"
12894           "movaps %%xmm1, %%xmm13\n"
12895           "addps %%xmm5, %%xmm9\n"
12896           "subps %%xmm5, %%xmm13\n"
12897           "movaps %%xmm2, %%xmm10\n"
12898           "movaps %%xmm2, %%xmm14\n"
12899           "addps %%xmm6, %%xmm10\n"
12900           "subps %%xmm6, %%xmm14\n"
12901           "movaps %%xmm3, %%xmm11\n"
12902           "movaps %%xmm3, %%xmm15\n"
12903           "addps %%xmm7, %%xmm11\n"
12904           "subps %%xmm7, %%xmm15\n"
12905           "movups %%xmm8, (%0)\n"
12906           "movups %%xmm9, (%1)\n"
12907           "movups %%xmm10, (%2)\n"
12908           "movups %%xmm11, (%3)\n"
12909           "movups %%xmm12, (%4)\n"
12910           "movups %%xmm13, (%5)\n"
12911           "movups %%xmm14, (%6)\n"
12912           "movups %%xmm15, (%7)\n"
12913           :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
12914         );
12915       }
12916     }
12917     return;
12918   }
12919   if (depth == 29) {
12920     helper_float_29_recursive(buf + 0, 27);
12921     helper_float_29_recursive(buf + 134217728, 27);
12922     helper_float_29_recursive(buf + 268435456, 27);
12923     helper_float_29_recursive(buf + 402653184, 27);
12924     for (int j = 0; j < 536870912; j += 536870912) {
12925       for (int k = 0; k < 134217728; k += 4) {
12926         __asm__ volatile (
12927           "movups (%0), %%xmm0\n"
12928           "movups (%1), %%xmm1\n"
12929           "movups (%2), %%xmm2\n"
12930           "movups (%3), %%xmm3\n"
12931           "movaps %%xmm0, %%xmm8\n"
12932           "movaps %%xmm0, %%xmm9\n"
12933           "addps %%xmm1, %%xmm8\n"
12934           "subps %%xmm1, %%xmm9\n"
12935           "movaps %%xmm2, %%xmm10\n"
12936           "movaps %%xmm2, %%xmm11\n"
12937           "addps %%xmm3, %%xmm10\n"
12938           "subps %%xmm3, %%xmm11\n"
12939           "movaps %%xmm8, %%xmm0\n"
12940           "movaps %%xmm8, %%xmm2\n"
12941           "addps %%xmm10, %%xmm0\n"
12942           "subps %%xmm10, %%xmm2\n"
12943           "movaps %%xmm9, %%xmm1\n"
12944           "movaps %%xmm9, %%xmm3\n"
12945           "addps %%xmm11, %%xmm1\n"
12946           "subps %%xmm11, %%xmm3\n"
12947           "movups %%xmm0, (%0)\n"
12948           "movups %%xmm1, (%1)\n"
12949           "movups %%xmm2, (%2)\n"
12950           "movups %%xmm3, (%3)\n"
12951           :: "r"(buf + j + k + 0), "r"(buf + j + k + 134217728), "r"(buf + j + k + 268435456), "r"(buf + j + k + 402653184) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
12952         );
12953       }
12954     }
12955     return;
12956   }
12957 }
void helper_float_29(float *buf);
/* In-place transform of a 2^29-element float buffer: entry point that
 * dispatches to the recursive worker at full depth 29.
 * NOTE(review): presumably a Fast Hadamard Transform (per this generated
 * file's naming); the worker's contract is defined elsewhere in the file.
 * buf must hold at least 536870912 floats — the deepest case visible in
 * helper_float_29_recursive indexes up to buf + 536870911. */
void helper_float_29(float *buf) {
  helper_float_29_recursive(buf, 29);
}
12962 void helper_float_30_recursive(float *buf, int depth);
helper_float_30_recursive(float * buf,int depth)12963 void helper_float_30_recursive(float *buf, int depth) {
12964   if (depth == 12) {
12965     for (int j = 0; j < 4096; j += 32) {
12966       for (int k = 0; k < 4; k += 4) {
12967         __asm__ volatile (
12968           "movups (%0), %%xmm0\n"
12969           "movups (%1), %%xmm1\n"
12970           "movups (%2), %%xmm2\n"
12971           "movups (%3), %%xmm3\n"
12972           "movups (%4), %%xmm4\n"
12973           "movups (%5), %%xmm5\n"
12974           "movups (%6), %%xmm6\n"
12975           "movups (%7), %%xmm7\n"
12976           "movaps %%xmm0, %%xmm8\n"
12977           "shufps $160, %%xmm8, %%xmm8\n"
12978           "shufps $245, %%xmm0, %%xmm0\n"
12979           "xorps %%xmm9, %%xmm9\n"
12980           "subps %%xmm0, %%xmm9\n"
12981           "addsubps %%xmm9, %%xmm8\n"
12982           "movaps %%xmm8, %%xmm0\n"
12983           "movaps %%xmm1, %%xmm8\n"
12984           "shufps $160, %%xmm8, %%xmm8\n"
12985           "shufps $245, %%xmm1, %%xmm1\n"
12986           "xorps %%xmm9, %%xmm9\n"
12987           "subps %%xmm1, %%xmm9\n"
12988           "addsubps %%xmm9, %%xmm8\n"
12989           "movaps %%xmm8, %%xmm1\n"
12990           "movaps %%xmm2, %%xmm8\n"
12991           "shufps $160, %%xmm8, %%xmm8\n"
12992           "shufps $245, %%xmm2, %%xmm2\n"
12993           "xorps %%xmm9, %%xmm9\n"
12994           "subps %%xmm2, %%xmm9\n"
12995           "addsubps %%xmm9, %%xmm8\n"
12996           "movaps %%xmm8, %%xmm2\n"
12997           "movaps %%xmm3, %%xmm8\n"
12998           "shufps $160, %%xmm8, %%xmm8\n"
12999           "shufps $245, %%xmm3, %%xmm3\n"
13000           "xorps %%xmm9, %%xmm9\n"
13001           "subps %%xmm3, %%xmm9\n"
13002           "addsubps %%xmm9, %%xmm8\n"
13003           "movaps %%xmm8, %%xmm3\n"
13004           "movaps %%xmm4, %%xmm8\n"
13005           "shufps $160, %%xmm8, %%xmm8\n"
13006           "shufps $245, %%xmm4, %%xmm4\n"
13007           "xorps %%xmm9, %%xmm9\n"
13008           "subps %%xmm4, %%xmm9\n"
13009           "addsubps %%xmm9, %%xmm8\n"
13010           "movaps %%xmm8, %%xmm4\n"
13011           "movaps %%xmm5, %%xmm8\n"
13012           "shufps $160, %%xmm8, %%xmm8\n"
13013           "shufps $245, %%xmm5, %%xmm5\n"
13014           "xorps %%xmm9, %%xmm9\n"
13015           "subps %%xmm5, %%xmm9\n"
13016           "addsubps %%xmm9, %%xmm8\n"
13017           "movaps %%xmm8, %%xmm5\n"
13018           "movaps %%xmm6, %%xmm8\n"
13019           "shufps $160, %%xmm8, %%xmm8\n"
13020           "shufps $245, %%xmm6, %%xmm6\n"
13021           "xorps %%xmm9, %%xmm9\n"
13022           "subps %%xmm6, %%xmm9\n"
13023           "addsubps %%xmm9, %%xmm8\n"
13024           "movaps %%xmm8, %%xmm6\n"
13025           "movaps %%xmm7, %%xmm8\n"
13026           "shufps $160, %%xmm8, %%xmm8\n"
13027           "shufps $245, %%xmm7, %%xmm7\n"
13028           "xorps %%xmm9, %%xmm9\n"
13029           "subps %%xmm7, %%xmm9\n"
13030           "addsubps %%xmm9, %%xmm8\n"
13031           "movaps %%xmm8, %%xmm7\n"
13032           "movaps %%xmm0, %%xmm8\n"
13033           "shufps $68, %%xmm8, %%xmm8\n"
13034           "xorps %%xmm9, %%xmm9\n"
13035           "movaps %%xmm0, %%xmm10\n"
13036           "shufps $14, %%xmm9, %%xmm10\n"
13037           "movaps %%xmm0, %%xmm11\n"
13038           "shufps $224, %%xmm11, %%xmm9\n"
13039           "addps %%xmm8, %%xmm10\n"
13040           "subps %%xmm9, %%xmm10\n"
13041           "movaps %%xmm10, %%xmm0\n"
13042           "movaps %%xmm1, %%xmm8\n"
13043           "shufps $68, %%xmm8, %%xmm8\n"
13044           "xorps %%xmm9, %%xmm9\n"
13045           "movaps %%xmm1, %%xmm10\n"
13046           "shufps $14, %%xmm9, %%xmm10\n"
13047           "movaps %%xmm1, %%xmm11\n"
13048           "shufps $224, %%xmm11, %%xmm9\n"
13049           "addps %%xmm8, %%xmm10\n"
13050           "subps %%xmm9, %%xmm10\n"
13051           "movaps %%xmm10, %%xmm1\n"
13052           "movaps %%xmm2, %%xmm8\n"
13053           "shufps $68, %%xmm8, %%xmm8\n"
13054           "xorps %%xmm9, %%xmm9\n"
13055           "movaps %%xmm2, %%xmm10\n"
13056           "shufps $14, %%xmm9, %%xmm10\n"
13057           "movaps %%xmm2, %%xmm11\n"
13058           "shufps $224, %%xmm11, %%xmm9\n"
13059           "addps %%xmm8, %%xmm10\n"
13060           "subps %%xmm9, %%xmm10\n"
13061           "movaps %%xmm10, %%xmm2\n"
13062           "movaps %%xmm3, %%xmm8\n"
13063           "shufps $68, %%xmm8, %%xmm8\n"
13064           "xorps %%xmm9, %%xmm9\n"
13065           "movaps %%xmm3, %%xmm10\n"
13066           "shufps $14, %%xmm9, %%xmm10\n"
13067           "movaps %%xmm3, %%xmm11\n"
13068           "shufps $224, %%xmm11, %%xmm9\n"
13069           "addps %%xmm8, %%xmm10\n"
13070           "subps %%xmm9, %%xmm10\n"
13071           "movaps %%xmm10, %%xmm3\n"
13072           "movaps %%xmm4, %%xmm8\n"
13073           "shufps $68, %%xmm8, %%xmm8\n"
13074           "xorps %%xmm9, %%xmm9\n"
13075           "movaps %%xmm4, %%xmm10\n"
13076           "shufps $14, %%xmm9, %%xmm10\n"
13077           "movaps %%xmm4, %%xmm11\n"
13078           "shufps $224, %%xmm11, %%xmm9\n"
13079           "addps %%xmm8, %%xmm10\n"
13080           "subps %%xmm9, %%xmm10\n"
13081           "movaps %%xmm10, %%xmm4\n"
13082           "movaps %%xmm5, %%xmm8\n"
13083           "shufps $68, %%xmm8, %%xmm8\n"
13084           "xorps %%xmm9, %%xmm9\n"
13085           "movaps %%xmm5, %%xmm10\n"
13086           "shufps $14, %%xmm9, %%xmm10\n"
13087           "movaps %%xmm5, %%xmm11\n"
13088           "shufps $224, %%xmm11, %%xmm9\n"
13089           "addps %%xmm8, %%xmm10\n"
13090           "subps %%xmm9, %%xmm10\n"
13091           "movaps %%xmm10, %%xmm5\n"
13092           "movaps %%xmm6, %%xmm8\n"
13093           "shufps $68, %%xmm8, %%xmm8\n"
13094           "xorps %%xmm9, %%xmm9\n"
13095           "movaps %%xmm6, %%xmm10\n"
13096           "shufps $14, %%xmm9, %%xmm10\n"
13097           "movaps %%xmm6, %%xmm11\n"
13098           "shufps $224, %%xmm11, %%xmm9\n"
13099           "addps %%xmm8, %%xmm10\n"
13100           "subps %%xmm9, %%xmm10\n"
13101           "movaps %%xmm10, %%xmm6\n"
13102           "movaps %%xmm7, %%xmm8\n"
13103           "shufps $68, %%xmm8, %%xmm8\n"
13104           "xorps %%xmm9, %%xmm9\n"
13105           "movaps %%xmm7, %%xmm10\n"
13106           "shufps $14, %%xmm9, %%xmm10\n"
13107           "movaps %%xmm7, %%xmm11\n"
13108           "shufps $224, %%xmm11, %%xmm9\n"
13109           "addps %%xmm8, %%xmm10\n"
13110           "subps %%xmm9, %%xmm10\n"
13111           "movaps %%xmm10, %%xmm7\n"
13112           "movaps %%xmm0, %%xmm8\n"
13113           "movaps %%xmm0, %%xmm9\n"
13114           "addps %%xmm1, %%xmm8\n"
13115           "subps %%xmm1, %%xmm9\n"
13116           "movaps %%xmm2, %%xmm10\n"
13117           "movaps %%xmm2, %%xmm11\n"
13118           "addps %%xmm3, %%xmm10\n"
13119           "subps %%xmm3, %%xmm11\n"
13120           "movaps %%xmm4, %%xmm12\n"
13121           "movaps %%xmm4, %%xmm13\n"
13122           "addps %%xmm5, %%xmm12\n"
13123           "subps %%xmm5, %%xmm13\n"
13124           "movaps %%xmm6, %%xmm14\n"
13125           "movaps %%xmm6, %%xmm15\n"
13126           "addps %%xmm7, %%xmm14\n"
13127           "subps %%xmm7, %%xmm15\n"
13128           "movaps %%xmm8, %%xmm0\n"
13129           "movaps %%xmm8, %%xmm2\n"
13130           "addps %%xmm10, %%xmm0\n"
13131           "subps %%xmm10, %%xmm2\n"
13132           "movaps %%xmm9, %%xmm1\n"
13133           "movaps %%xmm9, %%xmm3\n"
13134           "addps %%xmm11, %%xmm1\n"
13135           "subps %%xmm11, %%xmm3\n"
13136           "movaps %%xmm12, %%xmm4\n"
13137           "movaps %%xmm12, %%xmm6\n"
13138           "addps %%xmm14, %%xmm4\n"
13139           "subps %%xmm14, %%xmm6\n"
13140           "movaps %%xmm13, %%xmm5\n"
13141           "movaps %%xmm13, %%xmm7\n"
13142           "addps %%xmm15, %%xmm5\n"
13143           "subps %%xmm15, %%xmm7\n"
13144           "movaps %%xmm0, %%xmm8\n"
13145           "movaps %%xmm0, %%xmm12\n"
13146           "addps %%xmm4, %%xmm8\n"
13147           "subps %%xmm4, %%xmm12\n"
13148           "movaps %%xmm1, %%xmm9\n"
13149           "movaps %%xmm1, %%xmm13\n"
13150           "addps %%xmm5, %%xmm9\n"
13151           "subps %%xmm5, %%xmm13\n"
13152           "movaps %%xmm2, %%xmm10\n"
13153           "movaps %%xmm2, %%xmm14\n"
13154           "addps %%xmm6, %%xmm10\n"
13155           "subps %%xmm6, %%xmm14\n"
13156           "movaps %%xmm3, %%xmm11\n"
13157           "movaps %%xmm3, %%xmm15\n"
13158           "addps %%xmm7, %%xmm11\n"
13159           "subps %%xmm7, %%xmm15\n"
13160           "movups %%xmm8, (%0)\n"
13161           "movups %%xmm9, (%1)\n"
13162           "movups %%xmm10, (%2)\n"
13163           "movups %%xmm11, (%3)\n"
13164           "movups %%xmm12, (%4)\n"
13165           "movups %%xmm13, (%5)\n"
13166           "movups %%xmm14, (%6)\n"
13167           "movups %%xmm15, (%7)\n"
13168           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
13169         );
13170       }
13171     }
13172     for (int j = 0; j < 4096; j += 256) {
13173       for (int k = 0; k < 32; k += 4) {
13174         __asm__ volatile (
13175           "movups (%0), %%xmm0\n"
13176           "movups (%1), %%xmm1\n"
13177           "movups (%2), %%xmm2\n"
13178           "movups (%3), %%xmm3\n"
13179           "movups (%4), %%xmm4\n"
13180           "movups (%5), %%xmm5\n"
13181           "movups (%6), %%xmm6\n"
13182           "movups (%7), %%xmm7\n"
13183           "movaps %%xmm0, %%xmm8\n"
13184           "movaps %%xmm0, %%xmm9\n"
13185           "addps %%xmm1, %%xmm8\n"
13186           "subps %%xmm1, %%xmm9\n"
13187           "movaps %%xmm2, %%xmm10\n"
13188           "movaps %%xmm2, %%xmm11\n"
13189           "addps %%xmm3, %%xmm10\n"
13190           "subps %%xmm3, %%xmm11\n"
13191           "movaps %%xmm4, %%xmm12\n"
13192           "movaps %%xmm4, %%xmm13\n"
13193           "addps %%xmm5, %%xmm12\n"
13194           "subps %%xmm5, %%xmm13\n"
13195           "movaps %%xmm6, %%xmm14\n"
13196           "movaps %%xmm6, %%xmm15\n"
13197           "addps %%xmm7, %%xmm14\n"
13198           "subps %%xmm7, %%xmm15\n"
13199           "movaps %%xmm8, %%xmm0\n"
13200           "movaps %%xmm8, %%xmm2\n"
13201           "addps %%xmm10, %%xmm0\n"
13202           "subps %%xmm10, %%xmm2\n"
13203           "movaps %%xmm9, %%xmm1\n"
13204           "movaps %%xmm9, %%xmm3\n"
13205           "addps %%xmm11, %%xmm1\n"
13206           "subps %%xmm11, %%xmm3\n"
13207           "movaps %%xmm12, %%xmm4\n"
13208           "movaps %%xmm12, %%xmm6\n"
13209           "addps %%xmm14, %%xmm4\n"
13210           "subps %%xmm14, %%xmm6\n"
13211           "movaps %%xmm13, %%xmm5\n"
13212           "movaps %%xmm13, %%xmm7\n"
13213           "addps %%xmm15, %%xmm5\n"
13214           "subps %%xmm15, %%xmm7\n"
13215           "movaps %%xmm0, %%xmm8\n"
13216           "movaps %%xmm0, %%xmm12\n"
13217           "addps %%xmm4, %%xmm8\n"
13218           "subps %%xmm4, %%xmm12\n"
13219           "movaps %%xmm1, %%xmm9\n"
13220           "movaps %%xmm1, %%xmm13\n"
13221           "addps %%xmm5, %%xmm9\n"
13222           "subps %%xmm5, %%xmm13\n"
13223           "movaps %%xmm2, %%xmm10\n"
13224           "movaps %%xmm2, %%xmm14\n"
13225           "addps %%xmm6, %%xmm10\n"
13226           "subps %%xmm6, %%xmm14\n"
13227           "movaps %%xmm3, %%xmm11\n"
13228           "movaps %%xmm3, %%xmm15\n"
13229           "addps %%xmm7, %%xmm11\n"
13230           "subps %%xmm7, %%xmm15\n"
13231           "movups %%xmm8, (%0)\n"
13232           "movups %%xmm9, (%1)\n"
13233           "movups %%xmm10, (%2)\n"
13234           "movups %%xmm11, (%3)\n"
13235           "movups %%xmm12, (%4)\n"
13236           "movups %%xmm13, (%5)\n"
13237           "movups %%xmm14, (%6)\n"
13238           "movups %%xmm15, (%7)\n"
13239           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
13240         );
13241       }
13242     }
13243     for (int j = 0; j < 4096; j += 2048) {
13244       for (int k = 0; k < 256; k += 4) {
13245         __asm__ volatile (
13246           "movups (%0), %%xmm0\n"
13247           "movups (%1), %%xmm1\n"
13248           "movups (%2), %%xmm2\n"
13249           "movups (%3), %%xmm3\n"
13250           "movups (%4), %%xmm4\n"
13251           "movups (%5), %%xmm5\n"
13252           "movups (%6), %%xmm6\n"
13253           "movups (%7), %%xmm7\n"
13254           "movaps %%xmm0, %%xmm8\n"
13255           "movaps %%xmm0, %%xmm9\n"
13256           "addps %%xmm1, %%xmm8\n"
13257           "subps %%xmm1, %%xmm9\n"
13258           "movaps %%xmm2, %%xmm10\n"
13259           "movaps %%xmm2, %%xmm11\n"
13260           "addps %%xmm3, %%xmm10\n"
13261           "subps %%xmm3, %%xmm11\n"
13262           "movaps %%xmm4, %%xmm12\n"
13263           "movaps %%xmm4, %%xmm13\n"
13264           "addps %%xmm5, %%xmm12\n"
13265           "subps %%xmm5, %%xmm13\n"
13266           "movaps %%xmm6, %%xmm14\n"
13267           "movaps %%xmm6, %%xmm15\n"
13268           "addps %%xmm7, %%xmm14\n"
13269           "subps %%xmm7, %%xmm15\n"
13270           "movaps %%xmm8, %%xmm0\n"
13271           "movaps %%xmm8, %%xmm2\n"
13272           "addps %%xmm10, %%xmm0\n"
13273           "subps %%xmm10, %%xmm2\n"
13274           "movaps %%xmm9, %%xmm1\n"
13275           "movaps %%xmm9, %%xmm3\n"
13276           "addps %%xmm11, %%xmm1\n"
13277           "subps %%xmm11, %%xmm3\n"
13278           "movaps %%xmm12, %%xmm4\n"
13279           "movaps %%xmm12, %%xmm6\n"
13280           "addps %%xmm14, %%xmm4\n"
13281           "subps %%xmm14, %%xmm6\n"
13282           "movaps %%xmm13, %%xmm5\n"
13283           "movaps %%xmm13, %%xmm7\n"
13284           "addps %%xmm15, %%xmm5\n"
13285           "subps %%xmm15, %%xmm7\n"
13286           "movaps %%xmm0, %%xmm8\n"
13287           "movaps %%xmm0, %%xmm12\n"
13288           "addps %%xmm4, %%xmm8\n"
13289           "subps %%xmm4, %%xmm12\n"
13290           "movaps %%xmm1, %%xmm9\n"
13291           "movaps %%xmm1, %%xmm13\n"
13292           "addps %%xmm5, %%xmm9\n"
13293           "subps %%xmm5, %%xmm13\n"
13294           "movaps %%xmm2, %%xmm10\n"
13295           "movaps %%xmm2, %%xmm14\n"
13296           "addps %%xmm6, %%xmm10\n"
13297           "subps %%xmm6, %%xmm14\n"
13298           "movaps %%xmm3, %%xmm11\n"
13299           "movaps %%xmm3, %%xmm15\n"
13300           "addps %%xmm7, %%xmm11\n"
13301           "subps %%xmm7, %%xmm15\n"
13302           "movups %%xmm8, (%0)\n"
13303           "movups %%xmm9, (%1)\n"
13304           "movups %%xmm10, (%2)\n"
13305           "movups %%xmm11, (%3)\n"
13306           "movups %%xmm12, (%4)\n"
13307           "movups %%xmm13, (%5)\n"
13308           "movups %%xmm14, (%6)\n"
13309           "movups %%xmm15, (%7)\n"
13310           :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
13311         );
13312       }
13313     }
13314     for (int j = 0; j < 4096; j += 4096) {
13315       for (int k = 0; k < 2048; k += 4) {
13316         __asm__ volatile (
13317           "movups (%0), %%xmm0\n"
13318           "movups (%1), %%xmm1\n"
13319           "movaps %%xmm0, %%xmm8\n"
13320           "movaps %%xmm0, %%xmm9\n"
13321           "addps %%xmm1, %%xmm8\n"
13322           "subps %%xmm1, %%xmm9\n"
13323           "movups %%xmm8, (%0)\n"
13324           "movups %%xmm9, (%1)\n"
13325           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
13326         );
13327       }
13328     }
13329     return;
13330   }
13331   if (depth == 15) {
13332     helper_float_30_recursive(buf + 0, 12);
13333     helper_float_30_recursive(buf + 4096, 12);
13334     helper_float_30_recursive(buf + 8192, 12);
13335     helper_float_30_recursive(buf + 12288, 12);
13336     helper_float_30_recursive(buf + 16384, 12);
13337     helper_float_30_recursive(buf + 20480, 12);
13338     helper_float_30_recursive(buf + 24576, 12);
13339     helper_float_30_recursive(buf + 28672, 12);
13340     for (int j = 0; j < 32768; j += 32768) {
13341       for (int k = 0; k < 4096; k += 4) {
13342         __asm__ volatile (
13343           "movups (%0), %%xmm0\n"
13344           "movups (%1), %%xmm1\n"
13345           "movups (%2), %%xmm2\n"
13346           "movups (%3), %%xmm3\n"
13347           "movups (%4), %%xmm4\n"
13348           "movups (%5), %%xmm5\n"
13349           "movups (%6), %%xmm6\n"
13350           "movups (%7), %%xmm7\n"
13351           "movaps %%xmm0, %%xmm8\n"
13352           "movaps %%xmm0, %%xmm9\n"
13353           "addps %%xmm1, %%xmm8\n"
13354           "subps %%xmm1, %%xmm9\n"
13355           "movaps %%xmm2, %%xmm10\n"
13356           "movaps %%xmm2, %%xmm11\n"
13357           "addps %%xmm3, %%xmm10\n"
13358           "subps %%xmm3, %%xmm11\n"
13359           "movaps %%xmm4, %%xmm12\n"
13360           "movaps %%xmm4, %%xmm13\n"
13361           "addps %%xmm5, %%xmm12\n"
13362           "subps %%xmm5, %%xmm13\n"
13363           "movaps %%xmm6, %%xmm14\n"
13364           "movaps %%xmm6, %%xmm15\n"
13365           "addps %%xmm7, %%xmm14\n"
13366           "subps %%xmm7, %%xmm15\n"
13367           "movaps %%xmm8, %%xmm0\n"
13368           "movaps %%xmm8, %%xmm2\n"
13369           "addps %%xmm10, %%xmm0\n"
13370           "subps %%xmm10, %%xmm2\n"
13371           "movaps %%xmm9, %%xmm1\n"
13372           "movaps %%xmm9, %%xmm3\n"
13373           "addps %%xmm11, %%xmm1\n"
13374           "subps %%xmm11, %%xmm3\n"
13375           "movaps %%xmm12, %%xmm4\n"
13376           "movaps %%xmm12, %%xmm6\n"
13377           "addps %%xmm14, %%xmm4\n"
13378           "subps %%xmm14, %%xmm6\n"
13379           "movaps %%xmm13, %%xmm5\n"
13380           "movaps %%xmm13, %%xmm7\n"
13381           "addps %%xmm15, %%xmm5\n"
13382           "subps %%xmm15, %%xmm7\n"
13383           "movaps %%xmm0, %%xmm8\n"
13384           "movaps %%xmm0, %%xmm12\n"
13385           "addps %%xmm4, %%xmm8\n"
13386           "subps %%xmm4, %%xmm12\n"
13387           "movaps %%xmm1, %%xmm9\n"
13388           "movaps %%xmm1, %%xmm13\n"
13389           "addps %%xmm5, %%xmm9\n"
13390           "subps %%xmm5, %%xmm13\n"
13391           "movaps %%xmm2, %%xmm10\n"
13392           "movaps %%xmm2, %%xmm14\n"
13393           "addps %%xmm6, %%xmm10\n"
13394           "subps %%xmm6, %%xmm14\n"
13395           "movaps %%xmm3, %%xmm11\n"
13396           "movaps %%xmm3, %%xmm15\n"
13397           "addps %%xmm7, %%xmm11\n"
13398           "subps %%xmm7, %%xmm15\n"
13399           "movups %%xmm8, (%0)\n"
13400           "movups %%xmm9, (%1)\n"
13401           "movups %%xmm10, (%2)\n"
13402           "movups %%xmm11, (%3)\n"
13403           "movups %%xmm12, (%4)\n"
13404           "movups %%xmm13, (%5)\n"
13405           "movups %%xmm14, (%6)\n"
13406           "movups %%xmm15, (%7)\n"
13407           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
13408         );
13409       }
13410     }
13411     return;
13412   }
13413   if (depth == 18) {
13414     helper_float_30_recursive(buf + 0, 15);
13415     helper_float_30_recursive(buf + 32768, 15);
13416     helper_float_30_recursive(buf + 65536, 15);
13417     helper_float_30_recursive(buf + 98304, 15);
13418     helper_float_30_recursive(buf + 131072, 15);
13419     helper_float_30_recursive(buf + 163840, 15);
13420     helper_float_30_recursive(buf + 196608, 15);
13421     helper_float_30_recursive(buf + 229376, 15);
13422     for (int j = 0; j < 262144; j += 262144) {
13423       for (int k = 0; k < 32768; k += 4) {
13424         __asm__ volatile (
13425           "movups (%0), %%xmm0\n"
13426           "movups (%1), %%xmm1\n"
13427           "movups (%2), %%xmm2\n"
13428           "movups (%3), %%xmm3\n"
13429           "movups (%4), %%xmm4\n"
13430           "movups (%5), %%xmm5\n"
13431           "movups (%6), %%xmm6\n"
13432           "movups (%7), %%xmm7\n"
13433           "movaps %%xmm0, %%xmm8\n"
13434           "movaps %%xmm0, %%xmm9\n"
13435           "addps %%xmm1, %%xmm8\n"
13436           "subps %%xmm1, %%xmm9\n"
13437           "movaps %%xmm2, %%xmm10\n"
13438           "movaps %%xmm2, %%xmm11\n"
13439           "addps %%xmm3, %%xmm10\n"
13440           "subps %%xmm3, %%xmm11\n"
13441           "movaps %%xmm4, %%xmm12\n"
13442           "movaps %%xmm4, %%xmm13\n"
13443           "addps %%xmm5, %%xmm12\n"
13444           "subps %%xmm5, %%xmm13\n"
13445           "movaps %%xmm6, %%xmm14\n"
13446           "movaps %%xmm6, %%xmm15\n"
13447           "addps %%xmm7, %%xmm14\n"
13448           "subps %%xmm7, %%xmm15\n"
13449           "movaps %%xmm8, %%xmm0\n"
13450           "movaps %%xmm8, %%xmm2\n"
13451           "addps %%xmm10, %%xmm0\n"
13452           "subps %%xmm10, %%xmm2\n"
13453           "movaps %%xmm9, %%xmm1\n"
13454           "movaps %%xmm9, %%xmm3\n"
13455           "addps %%xmm11, %%xmm1\n"
13456           "subps %%xmm11, %%xmm3\n"
13457           "movaps %%xmm12, %%xmm4\n"
13458           "movaps %%xmm12, %%xmm6\n"
13459           "addps %%xmm14, %%xmm4\n"
13460           "subps %%xmm14, %%xmm6\n"
13461           "movaps %%xmm13, %%xmm5\n"
13462           "movaps %%xmm13, %%xmm7\n"
13463           "addps %%xmm15, %%xmm5\n"
13464           "subps %%xmm15, %%xmm7\n"
13465           "movaps %%xmm0, %%xmm8\n"
13466           "movaps %%xmm0, %%xmm12\n"
13467           "addps %%xmm4, %%xmm8\n"
13468           "subps %%xmm4, %%xmm12\n"
13469           "movaps %%xmm1, %%xmm9\n"
13470           "movaps %%xmm1, %%xmm13\n"
13471           "addps %%xmm5, %%xmm9\n"
13472           "subps %%xmm5, %%xmm13\n"
13473           "movaps %%xmm2, %%xmm10\n"
13474           "movaps %%xmm2, %%xmm14\n"
13475           "addps %%xmm6, %%xmm10\n"
13476           "subps %%xmm6, %%xmm14\n"
13477           "movaps %%xmm3, %%xmm11\n"
13478           "movaps %%xmm3, %%xmm15\n"
13479           "addps %%xmm7, %%xmm11\n"
13480           "subps %%xmm7, %%xmm15\n"
13481           "movups %%xmm8, (%0)\n"
13482           "movups %%xmm9, (%1)\n"
13483           "movups %%xmm10, (%2)\n"
13484           "movups %%xmm11, (%3)\n"
13485           "movups %%xmm12, (%4)\n"
13486           "movups %%xmm13, (%5)\n"
13487           "movups %%xmm14, (%6)\n"
13488           "movups %%xmm15, (%7)\n"
13489           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
13490         );
13491       }
13492     }
13493     return;
13494   }
13495   if (depth == 21) {
13496     helper_float_30_recursive(buf + 0, 18);
13497     helper_float_30_recursive(buf + 262144, 18);
13498     helper_float_30_recursive(buf + 524288, 18);
13499     helper_float_30_recursive(buf + 786432, 18);
13500     helper_float_30_recursive(buf + 1048576, 18);
13501     helper_float_30_recursive(buf + 1310720, 18);
13502     helper_float_30_recursive(buf + 1572864, 18);
13503     helper_float_30_recursive(buf + 1835008, 18);
13504     for (int j = 0; j < 2097152; j += 2097152) {
13505       for (int k = 0; k < 262144; k += 4) {
13506         __asm__ volatile (
13507           "movups (%0), %%xmm0\n"
13508           "movups (%1), %%xmm1\n"
13509           "movups (%2), %%xmm2\n"
13510           "movups (%3), %%xmm3\n"
13511           "movups (%4), %%xmm4\n"
13512           "movups (%5), %%xmm5\n"
13513           "movups (%6), %%xmm6\n"
13514           "movups (%7), %%xmm7\n"
13515           "movaps %%xmm0, %%xmm8\n"
13516           "movaps %%xmm0, %%xmm9\n"
13517           "addps %%xmm1, %%xmm8\n"
13518           "subps %%xmm1, %%xmm9\n"
13519           "movaps %%xmm2, %%xmm10\n"
13520           "movaps %%xmm2, %%xmm11\n"
13521           "addps %%xmm3, %%xmm10\n"
13522           "subps %%xmm3, %%xmm11\n"
13523           "movaps %%xmm4, %%xmm12\n"
13524           "movaps %%xmm4, %%xmm13\n"
13525           "addps %%xmm5, %%xmm12\n"
13526           "subps %%xmm5, %%xmm13\n"
13527           "movaps %%xmm6, %%xmm14\n"
13528           "movaps %%xmm6, %%xmm15\n"
13529           "addps %%xmm7, %%xmm14\n"
13530           "subps %%xmm7, %%xmm15\n"
13531           "movaps %%xmm8, %%xmm0\n"
13532           "movaps %%xmm8, %%xmm2\n"
13533           "addps %%xmm10, %%xmm0\n"
13534           "subps %%xmm10, %%xmm2\n"
13535           "movaps %%xmm9, %%xmm1\n"
13536           "movaps %%xmm9, %%xmm3\n"
13537           "addps %%xmm11, %%xmm1\n"
13538           "subps %%xmm11, %%xmm3\n"
13539           "movaps %%xmm12, %%xmm4\n"
13540           "movaps %%xmm12, %%xmm6\n"
13541           "addps %%xmm14, %%xmm4\n"
13542           "subps %%xmm14, %%xmm6\n"
13543           "movaps %%xmm13, %%xmm5\n"
13544           "movaps %%xmm13, %%xmm7\n"
13545           "addps %%xmm15, %%xmm5\n"
13546           "subps %%xmm15, %%xmm7\n"
13547           "movaps %%xmm0, %%xmm8\n"
13548           "movaps %%xmm0, %%xmm12\n"
13549           "addps %%xmm4, %%xmm8\n"
13550           "subps %%xmm4, %%xmm12\n"
13551           "movaps %%xmm1, %%xmm9\n"
13552           "movaps %%xmm1, %%xmm13\n"
13553           "addps %%xmm5, %%xmm9\n"
13554           "subps %%xmm5, %%xmm13\n"
13555           "movaps %%xmm2, %%xmm10\n"
13556           "movaps %%xmm2, %%xmm14\n"
13557           "addps %%xmm6, %%xmm10\n"
13558           "subps %%xmm6, %%xmm14\n"
13559           "movaps %%xmm3, %%xmm11\n"
13560           "movaps %%xmm3, %%xmm15\n"
13561           "addps %%xmm7, %%xmm11\n"
13562           "subps %%xmm7, %%xmm15\n"
13563           "movups %%xmm8, (%0)\n"
13564           "movups %%xmm9, (%1)\n"
13565           "movups %%xmm10, (%2)\n"
13566           "movups %%xmm11, (%3)\n"
13567           "movups %%xmm12, (%4)\n"
13568           "movups %%xmm13, (%5)\n"
13569           "movups %%xmm14, (%6)\n"
13570           "movups %%xmm15, (%7)\n"
13571           :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
13572         );
13573       }
13574     }
13575     return;
13576   }
13577   if (depth == 24) {
13578     helper_float_30_recursive(buf + 0, 21);
13579     helper_float_30_recursive(buf + 2097152, 21);
13580     helper_float_30_recursive(buf + 4194304, 21);
13581     helper_float_30_recursive(buf + 6291456, 21);
13582     helper_float_30_recursive(buf + 8388608, 21);
13583     helper_float_30_recursive(buf + 10485760, 21);
13584     helper_float_30_recursive(buf + 12582912, 21);
13585     helper_float_30_recursive(buf + 14680064, 21);
13586     for (int j = 0; j < 16777216; j += 16777216) {
13587       for (int k = 0; k < 2097152; k += 4) {
13588         __asm__ volatile (
13589           "movups (%0), %%xmm0\n"
13590           "movups (%1), %%xmm1\n"
13591           "movups (%2), %%xmm2\n"
13592           "movups (%3), %%xmm3\n"
13593           "movups (%4), %%xmm4\n"
13594           "movups (%5), %%xmm5\n"
13595           "movups (%6), %%xmm6\n"
13596           "movups (%7), %%xmm7\n"
13597           "movaps %%xmm0, %%xmm8\n"
13598           "movaps %%xmm0, %%xmm9\n"
13599           "addps %%xmm1, %%xmm8\n"
13600           "subps %%xmm1, %%xmm9\n"
13601           "movaps %%xmm2, %%xmm10\n"
13602           "movaps %%xmm2, %%xmm11\n"
13603           "addps %%xmm3, %%xmm10\n"
13604           "subps %%xmm3, %%xmm11\n"
13605           "movaps %%xmm4, %%xmm12\n"
13606           "movaps %%xmm4, %%xmm13\n"
13607           "addps %%xmm5, %%xmm12\n"
13608           "subps %%xmm5, %%xmm13\n"
13609           "movaps %%xmm6, %%xmm14\n"
13610           "movaps %%xmm6, %%xmm15\n"
13611           "addps %%xmm7, %%xmm14\n"
13612           "subps %%xmm7, %%xmm15\n"
13613           "movaps %%xmm8, %%xmm0\n"
13614           "movaps %%xmm8, %%xmm2\n"
13615           "addps %%xmm10, %%xmm0\n"
13616           "subps %%xmm10, %%xmm2\n"
13617           "movaps %%xmm9, %%xmm1\n"
13618           "movaps %%xmm9, %%xmm3\n"
13619           "addps %%xmm11, %%xmm1\n"
13620           "subps %%xmm11, %%xmm3\n"
13621           "movaps %%xmm12, %%xmm4\n"
13622           "movaps %%xmm12, %%xmm6\n"
13623           "addps %%xmm14, %%xmm4\n"
13624           "subps %%xmm14, %%xmm6\n"
13625           "movaps %%xmm13, %%xmm5\n"
13626           "movaps %%xmm13, %%xmm7\n"
13627           "addps %%xmm15, %%xmm5\n"
13628           "subps %%xmm15, %%xmm7\n"
13629           "movaps %%xmm0, %%xmm8\n"
13630           "movaps %%xmm0, %%xmm12\n"
13631           "addps %%xmm4, %%xmm8\n"
13632           "subps %%xmm4, %%xmm12\n"
13633           "movaps %%xmm1, %%xmm9\n"
13634           "movaps %%xmm1, %%xmm13\n"
13635           "addps %%xmm5, %%xmm9\n"
13636           "subps %%xmm5, %%xmm13\n"
13637           "movaps %%xmm2, %%xmm10\n"
13638           "movaps %%xmm2, %%xmm14\n"
13639           "addps %%xmm6, %%xmm10\n"
13640           "subps %%xmm6, %%xmm14\n"
13641           "movaps %%xmm3, %%xmm11\n"
13642           "movaps %%xmm3, %%xmm15\n"
13643           "addps %%xmm7, %%xmm11\n"
13644           "subps %%xmm7, %%xmm15\n"
13645           "movups %%xmm8, (%0)\n"
13646           "movups %%xmm9, (%1)\n"
13647           "movups %%xmm10, (%2)\n"
13648           "movups %%xmm11, (%3)\n"
13649           "movups %%xmm12, (%4)\n"
13650           "movups %%xmm13, (%5)\n"
13651           "movups %%xmm14, (%6)\n"
13652           "movups %%xmm15, (%7)\n"
13653           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
13654         );
13655       }
13656     }
13657     return;
13658   }
13659   if (depth == 27) {
13660     helper_float_30_recursive(buf + 0, 24);
13661     helper_float_30_recursive(buf + 16777216, 24);
13662     helper_float_30_recursive(buf + 33554432, 24);
13663     helper_float_30_recursive(buf + 50331648, 24);
13664     helper_float_30_recursive(buf + 67108864, 24);
13665     helper_float_30_recursive(buf + 83886080, 24);
13666     helper_float_30_recursive(buf + 100663296, 24);
13667     helper_float_30_recursive(buf + 117440512, 24);
13668     for (int j = 0; j < 134217728; j += 134217728) {
13669       for (int k = 0; k < 16777216; k += 4) {
13670         __asm__ volatile (
13671           "movups (%0), %%xmm0\n"
13672           "movups (%1), %%xmm1\n"
13673           "movups (%2), %%xmm2\n"
13674           "movups (%3), %%xmm3\n"
13675           "movups (%4), %%xmm4\n"
13676           "movups (%5), %%xmm5\n"
13677           "movups (%6), %%xmm6\n"
13678           "movups (%7), %%xmm7\n"
13679           "movaps %%xmm0, %%xmm8\n"
13680           "movaps %%xmm0, %%xmm9\n"
13681           "addps %%xmm1, %%xmm8\n"
13682           "subps %%xmm1, %%xmm9\n"
13683           "movaps %%xmm2, %%xmm10\n"
13684           "movaps %%xmm2, %%xmm11\n"
13685           "addps %%xmm3, %%xmm10\n"
13686           "subps %%xmm3, %%xmm11\n"
13687           "movaps %%xmm4, %%xmm12\n"
13688           "movaps %%xmm4, %%xmm13\n"
13689           "addps %%xmm5, %%xmm12\n"
13690           "subps %%xmm5, %%xmm13\n"
13691           "movaps %%xmm6, %%xmm14\n"
13692           "movaps %%xmm6, %%xmm15\n"
13693           "addps %%xmm7, %%xmm14\n"
13694           "subps %%xmm7, %%xmm15\n"
13695           "movaps %%xmm8, %%xmm0\n"
13696           "movaps %%xmm8, %%xmm2\n"
13697           "addps %%xmm10, %%xmm0\n"
13698           "subps %%xmm10, %%xmm2\n"
13699           "movaps %%xmm9, %%xmm1\n"
13700           "movaps %%xmm9, %%xmm3\n"
13701           "addps %%xmm11, %%xmm1\n"
13702           "subps %%xmm11, %%xmm3\n"
13703           "movaps %%xmm12, %%xmm4\n"
13704           "movaps %%xmm12, %%xmm6\n"
13705           "addps %%xmm14, %%xmm4\n"
13706           "subps %%xmm14, %%xmm6\n"
13707           "movaps %%xmm13, %%xmm5\n"
13708           "movaps %%xmm13, %%xmm7\n"
13709           "addps %%xmm15, %%xmm5\n"
13710           "subps %%xmm15, %%xmm7\n"
13711           "movaps %%xmm0, %%xmm8\n"
13712           "movaps %%xmm0, %%xmm12\n"
13713           "addps %%xmm4, %%xmm8\n"
13714           "subps %%xmm4, %%xmm12\n"
13715           "movaps %%xmm1, %%xmm9\n"
13716           "movaps %%xmm1, %%xmm13\n"
13717           "addps %%xmm5, %%xmm9\n"
13718           "subps %%xmm5, %%xmm13\n"
13719           "movaps %%xmm2, %%xmm10\n"
13720           "movaps %%xmm2, %%xmm14\n"
13721           "addps %%xmm6, %%xmm10\n"
13722           "subps %%xmm6, %%xmm14\n"
13723           "movaps %%xmm3, %%xmm11\n"
13724           "movaps %%xmm3, %%xmm15\n"
13725           "addps %%xmm7, %%xmm11\n"
13726           "subps %%xmm7, %%xmm15\n"
13727           "movups %%xmm8, (%0)\n"
13728           "movups %%xmm9, (%1)\n"
13729           "movups %%xmm10, (%2)\n"
13730           "movups %%xmm11, (%3)\n"
13731           "movups %%xmm12, (%4)\n"
13732           "movups %%xmm13, (%5)\n"
13733           "movups %%xmm14, (%6)\n"
13734           "movups %%xmm15, (%7)\n"
13735           :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
13736         );
13737       }
13738     }
13739     return;
13740   }
13741   if (depth == 30) {
13742     helper_float_30_recursive(buf + 0, 27);
13743     helper_float_30_recursive(buf + 134217728, 27);
13744     helper_float_30_recursive(buf + 268435456, 27);
13745     helper_float_30_recursive(buf + 402653184, 27);
13746     helper_float_30_recursive(buf + 536870912, 27);
13747     helper_float_30_recursive(buf + 671088640, 27);
13748     helper_float_30_recursive(buf + 805306368, 27);
13749     helper_float_30_recursive(buf + 939524096, 27);
13750     for (int j = 0; j < 1073741824; j += 1073741824) {
13751       for (int k = 0; k < 134217728; k += 4) {
13752         __asm__ volatile (
13753           "movups (%0), %%xmm0\n"
13754           "movups (%1), %%xmm1\n"
13755           "movups (%2), %%xmm2\n"
13756           "movups (%3), %%xmm3\n"
13757           "movups (%4), %%xmm4\n"
13758           "movups (%5), %%xmm5\n"
13759           "movups (%6), %%xmm6\n"
13760           "movups (%7), %%xmm7\n"
13761           "movaps %%xmm0, %%xmm8\n"
13762           "movaps %%xmm0, %%xmm9\n"
13763           "addps %%xmm1, %%xmm8\n"
13764           "subps %%xmm1, %%xmm9\n"
13765           "movaps %%xmm2, %%xmm10\n"
13766           "movaps %%xmm2, %%xmm11\n"
13767           "addps %%xmm3, %%xmm10\n"
13768           "subps %%xmm3, %%xmm11\n"
13769           "movaps %%xmm4, %%xmm12\n"
13770           "movaps %%xmm4, %%xmm13\n"
13771           "addps %%xmm5, %%xmm12\n"
13772           "subps %%xmm5, %%xmm13\n"
13773           "movaps %%xmm6, %%xmm14\n"
13774           "movaps %%xmm6, %%xmm15\n"
13775           "addps %%xmm7, %%xmm14\n"
13776           "subps %%xmm7, %%xmm15\n"
13777           "movaps %%xmm8, %%xmm0\n"
13778           "movaps %%xmm8, %%xmm2\n"
13779           "addps %%xmm10, %%xmm0\n"
13780           "subps %%xmm10, %%xmm2\n"
13781           "movaps %%xmm9, %%xmm1\n"
13782           "movaps %%xmm9, %%xmm3\n"
13783           "addps %%xmm11, %%xmm1\n"
13784           "subps %%xmm11, %%xmm3\n"
13785           "movaps %%xmm12, %%xmm4\n"
13786           "movaps %%xmm12, %%xmm6\n"
13787           "addps %%xmm14, %%xmm4\n"
13788           "subps %%xmm14, %%xmm6\n"
13789           "movaps %%xmm13, %%xmm5\n"
13790           "movaps %%xmm13, %%xmm7\n"
13791           "addps %%xmm15, %%xmm5\n"
13792           "subps %%xmm15, %%xmm7\n"
13793           "movaps %%xmm0, %%xmm8\n"
13794           "movaps %%xmm0, %%xmm12\n"
13795           "addps %%xmm4, %%xmm8\n"
13796           "subps %%xmm4, %%xmm12\n"
13797           "movaps %%xmm1, %%xmm9\n"
13798           "movaps %%xmm1, %%xmm13\n"
13799           "addps %%xmm5, %%xmm9\n"
13800           "subps %%xmm5, %%xmm13\n"
13801           "movaps %%xmm2, %%xmm10\n"
13802           "movaps %%xmm2, %%xmm14\n"
13803           "addps %%xmm6, %%xmm10\n"
13804           "subps %%xmm6, %%xmm14\n"
13805           "movaps %%xmm3, %%xmm11\n"
13806           "movaps %%xmm3, %%xmm15\n"
13807           "addps %%xmm7, %%xmm11\n"
13808           "subps %%xmm7, %%xmm15\n"
13809           "movups %%xmm8, (%0)\n"
13810           "movups %%xmm9, (%1)\n"
13811           "movups %%xmm10, (%2)\n"
13812           "movups %%xmm11, (%3)\n"
13813           "movups %%xmm12, (%4)\n"
13814           "movups %%xmm13, (%5)\n"
13815           "movups %%xmm14, (%6)\n"
13816           "movups %%xmm15, (%7)\n"
13817           :: "r"(buf + j + k + 0), "r"(buf + j + k + 134217728), "r"(buf + j + k + 268435456), "r"(buf + j + k + 402653184), "r"(buf + j + k + 536870912), "r"(buf + j + k + 671088640), "r"(buf + j + k + 805306368), "r"(buf + j + k + 939524096) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
13818         );
13819       }
13820     }
13821     return;
13822   }
13823 }
void helper_float_30(float *buf);
/* Entry point for the 2^30-point float transform: delegates to the
 * recursive implementation at full depth (30).  buf must hold 2^30
 * floats; the transform is performed in place. */
void helper_float_30(float *buf) {
  helper_float_30_recursive(buf, 30);
}
/*
 * In-place transform dispatcher for float buffers.
 *
 * buf   - pointer to 2^log_n floats, transformed in place.
 * log_n - log2 of the buffer length.
 *
 * Returns 0 on success, or 1 when log_n is outside the supported
 * range [0, 30].  log_n == 0 is a no-op (a length-1 transform is
 * the identity).
 */
int fht_float(float *buf, int log_n) {
  switch (log_n) {
    case 0:
      /* Length-1 transform: nothing to do. */
      return 0;
    case 1:  helper_float_1(buf);  return 0;
    case 2:  helper_float_2(buf);  return 0;
    case 3:  helper_float_3(buf);  return 0;
    case 4:  helper_float_4(buf);  return 0;
    case 5:  helper_float_5(buf);  return 0;
    case 6:  helper_float_6(buf);  return 0;
    case 7:  helper_float_7(buf);  return 0;
    case 8:  helper_float_8(buf);  return 0;
    case 9:  helper_float_9(buf);  return 0;
    case 10: helper_float_10(buf); return 0;
    case 11: helper_float_11(buf); return 0;
    case 12: helper_float_12(buf); return 0;
    case 13: helper_float_13(buf); return 0;
    case 14: helper_float_14(buf); return 0;
    case 15: helper_float_15(buf); return 0;
    case 16: helper_float_16(buf); return 0;
    case 17: helper_float_17(buf); return 0;
    case 18: helper_float_18(buf); return 0;
    case 19: helper_float_19(buf); return 0;
    case 20: helper_float_20(buf); return 0;
    case 21: helper_float_21(buf); return 0;
    case 22: helper_float_22(buf); return 0;
    case 23: helper_float_23(buf); return 0;
    case 24: helper_float_24(buf); return 0;
    case 25: helper_float_25(buf); return 0;
    case 26: helper_float_26(buf); return 0;
    case 27: helper_float_27(buf); return 0;
    case 28: helper_float_28(buf); return 0;
    case 29: helper_float_29(buf); return 0;
    case 30: helper_float_30(buf); return 0;
    default:
      /* Unsupported size. */
      return 1;
  }
}
static inline void helper_double_1(double *buf);
/*
 * Hadamard transform of 2 doubles (log_n == 1), machine-generated SSE kernel.
 * Replaces the pair buf = [a, b] with [a + b, a - b] in place.
 */
static inline void helper_double_1(double *buf) {
  /* Loop runs exactly once (j = 0) — artifact of the code generator. */
  for (int j = 0; j < 2; j += 2) {
    __asm__ volatile (
      "movupd (%0), %%xmm0\n"          /* xmm0 = [a, b] (unaligned load) */
      "movapd %%xmm0, %%xmm8\n"
      "haddpd %%xmm8, %%xmm8\n"        /* xmm8 = [a+b, a+b] */
      "movapd %%xmm0, %%xmm9\n"
      "hsubpd %%xmm9, %%xmm9\n"        /* xmm9 = [a-b, a-b] */
      "blendpd $1, %%xmm8, %%xmm9\n"   /* xmm9 = [a+b, a-b]: in-register butterfly */
      "movapd %%xmm9, %%xmm0\n"
      "movupd %%xmm0, (%0)\n"          /* store result back in place */
      :: "r"(buf + j) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
    );
  }
}
void helper_double_2_recursive(double *buf, int depth);
/*
 * Hadamard transform of 4 doubles (log_n == 2), machine-generated SSE kernel.
 * Only depth == 2 is handled; any other depth falls through as a no-op
 * (the generator only emits the depths this entry point is called with).
 */
void helper_double_2_recursive(double *buf, int depth) {
  if (depth == 2) {
    /* Both loops run exactly once (j = 0, k = 0) — generator artifact. */
    for (int j = 0; j < 4; j += 4) {
      for (int k = 0; k < 2; k += 2) {
        __asm__ volatile (
          /* Load two adjacent pairs of doubles. */
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          /* In-register butterfly on each pair: [a, b] -> [a+b, a-b]
             via haddpd/hsubpd then blendpd merging the two halves. */
          "movapd %%xmm0, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm0\n"
          "movapd %%xmm1, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm1, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm1\n"
          /* Cross butterfly between the two pairs: sum to (%0), diff to (%1). */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_double_2(double *buf);
/* Public entry: in-place Hadamard transform of 4 doubles (log_n == 2).
 * Thin wrapper dispatching to the generated recursive kernel. */
void helper_double_2(double *buf) {
  helper_double_2_recursive(buf, /*depth=*/2);
}
void helper_double_3_recursive(double *buf, int depth);
/*
 * Hadamard transform of 8 doubles (log_n == 3), machine-generated SSE kernel.
 * Only depth == 3 is handled; any other depth falls through as a no-op.
 * Processes all 8 elements in one asm statement: a size-2 butterfly inside
 * each of the four xmm pairs, then two cross stages (radix-4 combine).
 */
void helper_double_3_recursive(double *buf, int depth) {
  if (depth == 3) {
    /* Both loops run exactly once (j = 0, k = 0) — generator artifact. */
    for (int j = 0; j < 8; j += 8) {
      for (int k = 0; k < 2; k += 2) {
        __asm__ volatile (
          /* Load four adjacent pairs. */
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          /* Stage 0: in-register butterfly on each pair: [a,b] -> [a+b, a-b]. */
          "movapd %%xmm0, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm0\n"
          "movapd %%xmm1, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm1, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm2, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm2, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm2\n"
          "movapd %%xmm3, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm3, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm3\n"
          /* Stage 1: butterflies between register pairs (0,1) and (2,3). */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          /* Stage 2: butterflies between (8,10) and (9,11). */
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          /* Store the transformed 8 doubles back in place. */
          "movupd %%xmm0, (%0)\n"
          "movupd %%xmm1, (%1)\n"
          "movupd %%xmm2, (%2)\n"
          "movupd %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_double_3(double *buf);
/* Public entry: in-place Hadamard transform of 8 doubles (log_n == 3).
 * Thin wrapper dispatching to the generated recursive kernel. */
void helper_double_3(double *buf) {
  helper_double_3_recursive(buf, /*depth=*/3);
}
static inline void helper_double_4(double *buf);
/*
 * Hadamard transform of 16 doubles (log_n == 4), machine-generated SSE kernel.
 * One asm statement holds all 16 elements in xmm0..xmm7: a size-2 butterfly
 * inside each register, then three cross stages (full radix-8 combine of the
 * eight registers).
 */
static inline void helper_double_4(double *buf) {
  /* Both loops run exactly once (j = 0, k = 0) — generator artifact. */
  for (int j = 0; j < 16; j += 16) {
    for (int k = 0; k < 2; k += 2) {
      __asm__ volatile (
        /* Load eight adjacent pairs. */
        "movupd (%0), %%xmm0\n"
        "movupd (%1), %%xmm1\n"
        "movupd (%2), %%xmm2\n"
        "movupd (%3), %%xmm3\n"
        "movupd (%4), %%xmm4\n"
        "movupd (%5), %%xmm5\n"
        "movupd (%6), %%xmm6\n"
        "movupd (%7), %%xmm7\n"
        /* Stage 0: in-register butterfly on each pair: [a,b] -> [a+b, a-b]. */
        "movapd %%xmm0, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm0, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm0\n"
        "movapd %%xmm1, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm1, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm1\n"
        "movapd %%xmm2, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm2, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm2\n"
        "movapd %%xmm3, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm3, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm3\n"
        "movapd %%xmm4, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm4, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm4\n"
        "movapd %%xmm5, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm5, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm5\n"
        "movapd %%xmm6, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm6, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm6\n"
        "movapd %%xmm7, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm7, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm7\n"
        /* Stage 1: butterflies between register pairs (0,1) (2,3) (4,5) (6,7). */
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm9\n"
        "addpd %%xmm1, %%xmm8\n"
        "subpd %%xmm1, %%xmm9\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm11\n"
        "addpd %%xmm3, %%xmm10\n"
        "subpd %%xmm3, %%xmm11\n"
        "movapd %%xmm4, %%xmm12\n"
        "movapd %%xmm4, %%xmm13\n"
        "addpd %%xmm5, %%xmm12\n"
        "subpd %%xmm5, %%xmm13\n"
        "movapd %%xmm6, %%xmm14\n"
        "movapd %%xmm6, %%xmm15\n"
        "addpd %%xmm7, %%xmm14\n"
        "subpd %%xmm7, %%xmm15\n"
        /* Stage 2: butterflies at register distance 2. */
        "movapd %%xmm8, %%xmm0\n"
        "movapd %%xmm8, %%xmm2\n"
        "addpd %%xmm10, %%xmm0\n"
        "subpd %%xmm10, %%xmm2\n"
        "movapd %%xmm9, %%xmm1\n"
        "movapd %%xmm9, %%xmm3\n"
        "addpd %%xmm11, %%xmm1\n"
        "subpd %%xmm11, %%xmm3\n"
        "movapd %%xmm12, %%xmm4\n"
        "movapd %%xmm12, %%xmm6\n"
        "addpd %%xmm14, %%xmm4\n"
        "subpd %%xmm14, %%xmm6\n"
        "movapd %%xmm13, %%xmm5\n"
        "movapd %%xmm13, %%xmm7\n"
        "addpd %%xmm15, %%xmm5\n"
        "subpd %%xmm15, %%xmm7\n"
        /* Stage 3: butterflies at register distance 4. */
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm12\n"
        "addpd %%xmm4, %%xmm8\n"
        "subpd %%xmm4, %%xmm12\n"
        "movapd %%xmm1, %%xmm9\n"
        "movapd %%xmm1, %%xmm13\n"
        "addpd %%xmm5, %%xmm9\n"
        "subpd %%xmm5, %%xmm13\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm14\n"
        "addpd %%xmm6, %%xmm10\n"
        "subpd %%xmm6, %%xmm14\n"
        "movapd %%xmm3, %%xmm11\n"
        "movapd %%xmm3, %%xmm15\n"
        "addpd %%xmm7, %%xmm11\n"
        "subpd %%xmm7, %%xmm15\n"
        /* Store the transformed 16 doubles back in place. */
        "movupd %%xmm8, (%0)\n"
        "movupd %%xmm9, (%1)\n"
        "movupd %%xmm10, (%2)\n"
        "movupd %%xmm11, (%3)\n"
        "movupd %%xmm12, (%4)\n"
        "movupd %%xmm13, (%5)\n"
        "movupd %%xmm14, (%6)\n"
        "movupd %%xmm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
      );
    }
  }
}
void helper_double_5_recursive(double *buf, int depth);
/*
 * Hadamard transform of 32 doubles (log_n == 5), machine-generated SSE kernel.
 * Handles exactly two depths: depth == 2 is the size-4 base case; depth == 5
 * runs eight size-4 base transforms over the 32-element buffer and then
 * combines them with a radix-8 butterfly pass at stride 4. Any other depth
 * falls through as a no-op.
 */
void helper_double_5_recursive(double *buf, int depth) {
  if (depth == 2) {
    /* Size-4 base case (same kernel as helper_double_2_recursive). */
    for (int j = 0; j < 4; j += 4) {
      for (int k = 0; k < 2; k += 2) {
        __asm__ volatile (
          /* Load two adjacent pairs. */
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          /* In-register butterfly on each pair: [a,b] -> [a+b, a-b]. */
          "movapd %%xmm0, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm0\n"
          "movapd %%xmm1, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm1, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm1\n"
          /* Cross butterfly between the two pairs. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 5) {
    /* Transform each of the eight size-4 sub-blocks first. */
    helper_double_5_recursive(buf + 0, 2);
    helper_double_5_recursive(buf + 4, 2);
    helper_double_5_recursive(buf + 8, 2);
    helper_double_5_recursive(buf + 12, 2);
    helper_double_5_recursive(buf + 16, 2);
    helper_double_5_recursive(buf + 20, 2);
    helper_double_5_recursive(buf + 24, 2);
    helper_double_5_recursive(buf + 28, 2);
    /* Combine pass: radix-8 butterflies across the sub-blocks (stride 4);
       the k loop covers both xmm-widths (2 doubles) of each sub-block. */
    for (int j = 0; j < 32; j += 32) {
      for (int k = 0; k < 4; k += 2) {
        __asm__ volatile (
          /* Load one 2-double slice from each of the eight sub-blocks. */
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          /* Stage 1: butterflies between register pairs (0,1) (2,3) (4,5) (6,7). */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          /* Stage 2: butterflies at register distance 2. */
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          /* Stage 3: butterflies at register distance 4. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          /* Store results back to the eight sub-block slices. */
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_double_5(double *buf);
/* Public entry: in-place Hadamard transform of 32 doubles (log_n == 5).
 * Thin wrapper dispatching to the generated recursive kernel. */
void helper_double_5(double *buf) {
  helper_double_5_recursive(buf, /*depth=*/5);
}
static inline void helper_double_6(double *buf);
/*
 * Hadamard transform of 64 doubles (log_n == 6), machine-generated SSE kernel.
 * Pass 1 applies the full size-16 transform to each of the four 16-element
 * sub-blocks (same kernel as helper_double_4); pass 2 combines them with
 * radix-4 butterflies at stride 16.
 */
static inline void helper_double_6(double *buf) {
  /* Pass 1: size-16 transform on sub-blocks at j = 0, 16, 32, 48. */
  for (int j = 0; j < 64; j += 16) {
    for (int k = 0; k < 2; k += 2) {
      __asm__ volatile (
        /* Load eight adjacent pairs of the current 16-element sub-block. */
        "movupd (%0), %%xmm0\n"
        "movupd (%1), %%xmm1\n"
        "movupd (%2), %%xmm2\n"
        "movupd (%3), %%xmm3\n"
        "movupd (%4), %%xmm4\n"
        "movupd (%5), %%xmm5\n"
        "movupd (%6), %%xmm6\n"
        "movupd (%7), %%xmm7\n"
        /* Stage 0: in-register butterfly on each pair: [a,b] -> [a+b, a-b]. */
        "movapd %%xmm0, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm0, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm0\n"
        "movapd %%xmm1, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm1, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm1\n"
        "movapd %%xmm2, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm2, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm2\n"
        "movapd %%xmm3, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm3, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm3\n"
        "movapd %%xmm4, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm4, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm4\n"
        "movapd %%xmm5, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm5, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm5\n"
        "movapd %%xmm6, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm6, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm6\n"
        "movapd %%xmm7, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm7, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm7\n"
        /* Stage 1: butterflies between register pairs. */
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm9\n"
        "addpd %%xmm1, %%xmm8\n"
        "subpd %%xmm1, %%xmm9\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm11\n"
        "addpd %%xmm3, %%xmm10\n"
        "subpd %%xmm3, %%xmm11\n"
        "movapd %%xmm4, %%xmm12\n"
        "movapd %%xmm4, %%xmm13\n"
        "addpd %%xmm5, %%xmm12\n"
        "subpd %%xmm5, %%xmm13\n"
        "movapd %%xmm6, %%xmm14\n"
        "movapd %%xmm6, %%xmm15\n"
        "addpd %%xmm7, %%xmm14\n"
        "subpd %%xmm7, %%xmm15\n"
        /* Stage 2: butterflies at register distance 2. */
        "movapd %%xmm8, %%xmm0\n"
        "movapd %%xmm8, %%xmm2\n"
        "addpd %%xmm10, %%xmm0\n"
        "subpd %%xmm10, %%xmm2\n"
        "movapd %%xmm9, %%xmm1\n"
        "movapd %%xmm9, %%xmm3\n"
        "addpd %%xmm11, %%xmm1\n"
        "subpd %%xmm11, %%xmm3\n"
        "movapd %%xmm12, %%xmm4\n"
        "movapd %%xmm12, %%xmm6\n"
        "addpd %%xmm14, %%xmm4\n"
        "subpd %%xmm14, %%xmm6\n"
        "movapd %%xmm13, %%xmm5\n"
        "movapd %%xmm13, %%xmm7\n"
        "addpd %%xmm15, %%xmm5\n"
        "subpd %%xmm15, %%xmm7\n"
        /* Stage 3: butterflies at register distance 4. */
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm12\n"
        "addpd %%xmm4, %%xmm8\n"
        "subpd %%xmm4, %%xmm12\n"
        "movapd %%xmm1, %%xmm9\n"
        "movapd %%xmm1, %%xmm13\n"
        "addpd %%xmm5, %%xmm9\n"
        "subpd %%xmm5, %%xmm13\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm14\n"
        "addpd %%xmm6, %%xmm10\n"
        "subpd %%xmm6, %%xmm14\n"
        "movapd %%xmm3, %%xmm11\n"
        "movapd %%xmm3, %%xmm15\n"
        "addpd %%xmm7, %%xmm11\n"
        "subpd %%xmm7, %%xmm15\n"
        /* Store the transformed sub-block back in place. */
        "movupd %%xmm8, (%0)\n"
        "movupd %%xmm9, (%1)\n"
        "movupd %%xmm10, (%2)\n"
        "movupd %%xmm11, (%3)\n"
        "movupd %%xmm12, (%4)\n"
        "movupd %%xmm13, (%5)\n"
        "movupd %%xmm14, (%6)\n"
        "movupd %%xmm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
      );
    }
  }
  /* Pass 2: radix-4 combine of the four sub-blocks at stride 16. */
  for (int j = 0; j < 64; j += 64) {
    for (int k = 0; k < 16; k += 2) {
      __asm__ volatile (
        /* Load one 2-double slice from each of the four sub-blocks. */
        "movupd (%0), %%xmm0\n"
        "movupd (%1), %%xmm1\n"
        "movupd (%2), %%xmm2\n"
        "movupd (%3), %%xmm3\n"
        /* Stage 1: butterflies between register pairs (0,1) and (2,3). */
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm9\n"
        "addpd %%xmm1, %%xmm8\n"
        "subpd %%xmm1, %%xmm9\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm11\n"
        "addpd %%xmm3, %%xmm10\n"
        "subpd %%xmm3, %%xmm11\n"
        /* Stage 2: butterflies at register distance 2. */
        "movapd %%xmm8, %%xmm0\n"
        "movapd %%xmm8, %%xmm2\n"
        "addpd %%xmm10, %%xmm0\n"
        "subpd %%xmm10, %%xmm2\n"
        "movapd %%xmm9, %%xmm1\n"
        "movapd %%xmm9, %%xmm3\n"
        "addpd %%xmm11, %%xmm1\n"
        "subpd %%xmm11, %%xmm3\n"
        /* Store results back to the four sub-block slices. */
        "movupd %%xmm0, (%0)\n"
        "movupd %%xmm1, (%1)\n"
        "movupd %%xmm2, (%2)\n"
        "movupd %%xmm3, (%3)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
      );
    }
  }
}
static inline void helper_double_7(double *buf);
/*
 * Hadamard transform of 128 doubles (log_n == 7), machine-generated SSE
 * kernel. Pass 1 applies the full size-16 transform to each of the eight
 * 16-element sub-blocks (same kernel as helper_double_4); pass 2 combines
 * them with radix-8 butterflies at stride 16.
 */
static inline void helper_double_7(double *buf) {
  /* Pass 1: size-16 transform on sub-blocks at j = 0, 16, ..., 112. */
  for (int j = 0; j < 128; j += 16) {
    for (int k = 0; k < 2; k += 2) {
      __asm__ volatile (
        /* Load eight adjacent pairs of the current 16-element sub-block. */
        "movupd (%0), %%xmm0\n"
        "movupd (%1), %%xmm1\n"
        "movupd (%2), %%xmm2\n"
        "movupd (%3), %%xmm3\n"
        "movupd (%4), %%xmm4\n"
        "movupd (%5), %%xmm5\n"
        "movupd (%6), %%xmm6\n"
        "movupd (%7), %%xmm7\n"
        /* Stage 0: in-register butterfly on each pair: [a,b] -> [a+b, a-b]. */
        "movapd %%xmm0, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm0, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm0\n"
        "movapd %%xmm1, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm1, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm1\n"
        "movapd %%xmm2, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm2, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm2\n"
        "movapd %%xmm3, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm3, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm3\n"
        "movapd %%xmm4, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm4, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm4\n"
        "movapd %%xmm5, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm5, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm5\n"
        "movapd %%xmm6, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm6, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm6\n"
        "movapd %%xmm7, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm7, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm7\n"
        /* Stage 1: butterflies between register pairs. */
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm9\n"
        "addpd %%xmm1, %%xmm8\n"
        "subpd %%xmm1, %%xmm9\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm11\n"
        "addpd %%xmm3, %%xmm10\n"
        "subpd %%xmm3, %%xmm11\n"
        "movapd %%xmm4, %%xmm12\n"
        "movapd %%xmm4, %%xmm13\n"
        "addpd %%xmm5, %%xmm12\n"
        "subpd %%xmm5, %%xmm13\n"
        "movapd %%xmm6, %%xmm14\n"
        "movapd %%xmm6, %%xmm15\n"
        "addpd %%xmm7, %%xmm14\n"
        "subpd %%xmm7, %%xmm15\n"
        /* Stage 2: butterflies at register distance 2. */
        "movapd %%xmm8, %%xmm0\n"
        "movapd %%xmm8, %%xmm2\n"
        "addpd %%xmm10, %%xmm0\n"
        "subpd %%xmm10, %%xmm2\n"
        "movapd %%xmm9, %%xmm1\n"
        "movapd %%xmm9, %%xmm3\n"
        "addpd %%xmm11, %%xmm1\n"
        "subpd %%xmm11, %%xmm3\n"
        "movapd %%xmm12, %%xmm4\n"
        "movapd %%xmm12, %%xmm6\n"
        "addpd %%xmm14, %%xmm4\n"
        "subpd %%xmm14, %%xmm6\n"
        "movapd %%xmm13, %%xmm5\n"
        "movapd %%xmm13, %%xmm7\n"
        "addpd %%xmm15, %%xmm5\n"
        "subpd %%xmm15, %%xmm7\n"
        /* Stage 3: butterflies at register distance 4. */
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm12\n"
        "addpd %%xmm4, %%xmm8\n"
        "subpd %%xmm4, %%xmm12\n"
        "movapd %%xmm1, %%xmm9\n"
        "movapd %%xmm1, %%xmm13\n"
        "addpd %%xmm5, %%xmm9\n"
        "subpd %%xmm5, %%xmm13\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm14\n"
        "addpd %%xmm6, %%xmm10\n"
        "subpd %%xmm6, %%xmm14\n"
        "movapd %%xmm3, %%xmm11\n"
        "movapd %%xmm3, %%xmm15\n"
        "addpd %%xmm7, %%xmm11\n"
        "subpd %%xmm7, %%xmm15\n"
        /* Store the transformed sub-block back in place. */
        "movupd %%xmm8, (%0)\n"
        "movupd %%xmm9, (%1)\n"
        "movupd %%xmm10, (%2)\n"
        "movupd %%xmm11, (%3)\n"
        "movupd %%xmm12, (%4)\n"
        "movupd %%xmm13, (%5)\n"
        "movupd %%xmm14, (%6)\n"
        "movupd %%xmm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
      );
    }
  }
  /* Pass 2: radix-8 combine of the eight sub-blocks at stride 16. */
  for (int j = 0; j < 128; j += 128) {
    for (int k = 0; k < 16; k += 2) {
      __asm__ volatile (
        /* Load one 2-double slice from each of the eight sub-blocks. */
        "movupd (%0), %%xmm0\n"
        "movupd (%1), %%xmm1\n"
        "movupd (%2), %%xmm2\n"
        "movupd (%3), %%xmm3\n"
        "movupd (%4), %%xmm4\n"
        "movupd (%5), %%xmm5\n"
        "movupd (%6), %%xmm6\n"
        "movupd (%7), %%xmm7\n"
        /* Stage 1: butterflies between register pairs. */
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm9\n"
        "addpd %%xmm1, %%xmm8\n"
        "subpd %%xmm1, %%xmm9\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm11\n"
        "addpd %%xmm3, %%xmm10\n"
        "subpd %%xmm3, %%xmm11\n"
        "movapd %%xmm4, %%xmm12\n"
        "movapd %%xmm4, %%xmm13\n"
        "addpd %%xmm5, %%xmm12\n"
        "subpd %%xmm5, %%xmm13\n"
        "movapd %%xmm6, %%xmm14\n"
        "movapd %%xmm6, %%xmm15\n"
        "addpd %%xmm7, %%xmm14\n"
        "subpd %%xmm7, %%xmm15\n"
        /* Stage 2: butterflies at register distance 2. */
        "movapd %%xmm8, %%xmm0\n"
        "movapd %%xmm8, %%xmm2\n"
        "addpd %%xmm10, %%xmm0\n"
        "subpd %%xmm10, %%xmm2\n"
        "movapd %%xmm9, %%xmm1\n"
        "movapd %%xmm9, %%xmm3\n"
        "addpd %%xmm11, %%xmm1\n"
        "subpd %%xmm11, %%xmm3\n"
        "movapd %%xmm12, %%xmm4\n"
        "movapd %%xmm12, %%xmm6\n"
        "addpd %%xmm14, %%xmm4\n"
        "subpd %%xmm14, %%xmm6\n"
        "movapd %%xmm13, %%xmm5\n"
        "movapd %%xmm13, %%xmm7\n"
        "addpd %%xmm15, %%xmm5\n"
        "subpd %%xmm15, %%xmm7\n"
        /* Stage 3: butterflies at register distance 4. */
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm12\n"
        "addpd %%xmm4, %%xmm8\n"
        "subpd %%xmm4, %%xmm12\n"
        "movapd %%xmm1, %%xmm9\n"
        "movapd %%xmm1, %%xmm13\n"
        "addpd %%xmm5, %%xmm9\n"
        "subpd %%xmm5, %%xmm13\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm14\n"
        "addpd %%xmm6, %%xmm10\n"
        "subpd %%xmm6, %%xmm14\n"
        "movapd %%xmm3, %%xmm11\n"
        "movapd %%xmm3, %%xmm15\n"
        "addpd %%xmm7, %%xmm11\n"
        "subpd %%xmm7, %%xmm15\n"
        /* Store results back to the eight sub-block slices. */
        "movupd %%xmm8, (%0)\n"
        "movupd %%xmm9, (%1)\n"
        "movupd %%xmm10, (%2)\n"
        "movupd %%xmm11, (%3)\n"
        "movupd %%xmm12, (%4)\n"
        "movupd %%xmm13, (%5)\n"
        "movupd %%xmm14, (%6)\n"
        "movupd %%xmm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
      );
    }
  }
}
14659 void helper_double_8_recursive(double *buf, int depth);
/*
 * Recursive unnormalized Walsh-Hadamard transform over 2^depth doubles.
 *
 * Auto-generated SSE code (uses SSE3 haddpd/hsubpd and SSE4.1 blendpd;
 * xmm8-xmm15 require x86-64). Only depths 2, 5 and 8 are handled; the
 * top-level entry point is helper_double_8(), which passes depth == 8.
 * Any other depth value falls through and does nothing.
 *
 * buf must hold at least 2^depth doubles; it is transformed in place.
 */
void helper_double_8_recursive(double *buf, int depth) {
  if (depth == 2) {
    /* Base case: size-4 transform on buf[0..3], held in two xmm registers. */
    for (int j = 0; j < 4; j += 4) {
      for (int k = 0; k < 2; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          /* In-register size-2 butterfly: haddpd gives a+b, hsubpd gives
             a-b; blendpd $1 merges them into (a+b, a-b). */
          "movapd %%xmm0, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm0\n"
          "movapd %%xmm1, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm1, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm1\n"
          /* Cross-register butterfly between the two 2-lane halves. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 5) {
    /* Eight size-4 sub-transforms, then a radix-8 combine across the
       eight 4-double sub-blocks (pointers spaced 4 doubles apart). */
    helper_double_8_recursive(buf + 0, 2);
    helper_double_8_recursive(buf + 4, 2);
    helper_double_8_recursive(buf + 8, 2);
    helper_double_8_recursive(buf + 12, 2);
    helper_double_8_recursive(buf + 16, 2);
    helper_double_8_recursive(buf + 20, 2);
    helper_double_8_recursive(buf + 24, 2);
    helper_double_8_recursive(buf + 28, 2);
    for (int j = 0; j < 32; j += 32) {
      for (int k = 0; k < 4; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          /* Three butterfly layers = one radix-8 combine of xmm0..xmm7. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 8) {
    /* Eight size-32 sub-transforms, then a radix-8 combine across the
       eight 32-double sub-blocks (pointers spaced 32 doubles apart).
       The asm body is identical to the depth == 5 combine; only the
       pointer offsets differ. */
    helper_double_8_recursive(buf + 0, 5);
    helper_double_8_recursive(buf + 32, 5);
    helper_double_8_recursive(buf + 64, 5);
    helper_double_8_recursive(buf + 96, 5);
    helper_double_8_recursive(buf + 128, 5);
    helper_double_8_recursive(buf + 160, 5);
    helper_double_8_recursive(buf + 192, 5);
    helper_double_8_recursive(buf + 224, 5);
    for (int j = 0; j < 256; j += 256) {
      for (int k = 0; k < 32; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
14856 void helper_double_8(double *buf);
/* In-place unnormalized Walsh-Hadamard transform of a 2^8 = 256 element
 * double buffer. Thin wrapper around the recursive worker. */
void helper_double_8(double *buf) {
  enum { LOG_N = 8 }; /* buf holds 1 << LOG_N doubles */
  helper_double_8_recursive(buf, LOG_N);
}
14860 void helper_double_9_recursive(double *buf, int depth);
/*
 * Recursive unnormalized Walsh-Hadamard transform over 2^depth doubles.
 *
 * Auto-generated SSE code (uses SSE3 haddpd/hsubpd and SSE4.1 blendpd;
 * xmm8-xmm15 require x86-64). Only depths 6 and 9 are handled; the
 * top-level entry point is helper_double_9(), which passes depth == 9.
 * Any other depth value falls through and does nothing.
 *
 * buf must hold at least 2^depth doubles; it is transformed in place.
 */
void helper_double_9_recursive(double *buf, int depth) {
  if (depth == 6) {
    /* Stage 1: for each 16-double chunk, load eight 2-lane registers,
       do the in-register size-2 butterflies (haddpd/hsubpd/blendpd
       produce (a+b, a-b)), then a radix-8 combine -> size-16 transform. */
    for (int j = 0; j < 64; j += 16) {
      for (int k = 0; k < 2; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm0\n"
          "movapd %%xmm1, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm1, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm2, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm2, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm2\n"
          "movapd %%xmm3, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm3, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm3\n"
          "movapd %%xmm4, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm4, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm4\n"
          "movapd %%xmm5, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm5, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm5\n"
          "movapd %%xmm6, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm6, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm6\n"
          "movapd %%xmm7, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm7, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm7\n"
          /* Three butterfly layers = radix-8 combine of xmm0..xmm7. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Stage 2: radix-4 combine across the four 16-double sub-blocks
       (pointers spaced 16 doubles apart) -> full size-64 transform. */
    for (int j = 0; j < 64; j += 64) {
      for (int k = 0; k < 16; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movupd %%xmm0, (%0)\n"
          "movupd %%xmm1, (%1)\n"
          "movupd %%xmm2, (%2)\n"
          "movupd %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 9) {
    /* Eight size-64 sub-transforms, then a radix-8 combine across the
       eight 64-double sub-blocks (pointers spaced 64 doubles apart). */
    helper_double_9_recursive(buf + 0, 6);
    helper_double_9_recursive(buf + 64, 6);
    helper_double_9_recursive(buf + 128, 6);
    helper_double_9_recursive(buf + 192, 6);
    helper_double_9_recursive(buf + 256, 6);
    helper_double_9_recursive(buf + 320, 6);
    helper_double_9_recursive(buf + 384, 6);
    helper_double_9_recursive(buf + 448, 6);
    for (int j = 0; j < 512; j += 512) {
      for (int k = 0; k < 64; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
15098 void helper_double_9(double *buf);
/* In-place unnormalized Walsh-Hadamard transform of a 2^9 = 512 element
 * double buffer. Thin wrapper around the recursive worker. */
void helper_double_9(double *buf) {
  enum { LOG_N = 9 }; /* buf holds 1 << LOG_N doubles */
  helper_double_9_recursive(buf, LOG_N);
}
15102 void helper_double_10_recursive(double *buf, int depth);
/*
 * Unnormalized Walsh-Hadamard transform over 2^10 = 1024 doubles.
 *
 * Auto-generated SSE code (uses SSE3 haddpd/hsubpd and SSE4.1 blendpd;
 * xmm8-xmm15 require x86-64). Despite the "_recursive" name, only
 * depth == 10 is handled and it never recurses: the transform is done
 * in three iterative passes. The entry point helper_double_10() passes
 * depth == 10; any other depth falls through and does nothing.
 *
 * buf must hold at least 1024 doubles; it is transformed in place.
 */
void helper_double_10_recursive(double *buf, int depth) {
  if (depth == 10) {
    /* Pass 1: for each 16-double chunk, in-register size-2 butterflies
       (haddpd/hsubpd/blendpd produce (a+b, a-b)) followed by a radix-8
       combine -> size-16 transform per chunk. */
    for (int j = 0; j < 1024; j += 16) {
      for (int k = 0; k < 2; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm0\n"
          "movapd %%xmm1, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm1, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm2, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm2, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm2\n"
          "movapd %%xmm3, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm3, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm3\n"
          "movapd %%xmm4, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm4, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm4\n"
          "movapd %%xmm5, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm5, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm5\n"
          "movapd %%xmm6, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm6, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm6\n"
          "movapd %%xmm7, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm7, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm7\n"
          /* Three butterfly layers = radix-8 combine of xmm0..xmm7. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 2: within each 128-double block, radix-8 combine across the
       eight size-16 sub-blocks (pointers spaced 16 doubles apart). */
    for (int j = 0; j < 1024; j += 128) {
      for (int k = 0; k < 16; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 3: final radix-8 combine across the eight 128-double blocks
       (pointers spaced 128 doubles apart) -> full size-1024 transform. */
    for (int j = 0; j < 1024; j += 1024) {
      for (int k = 0; k < 128; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
15369 void helper_double_10(double *buf);
/* In-place unnormalized Walsh-Hadamard transform of a 2^10 = 1024 element
 * double buffer. Thin wrapper around the recursive worker. */
void helper_double_10(double *buf) {
  enum { LOG_N = 10 }; /* buf holds 1 << LOG_N doubles */
  helper_double_10_recursive(buf, LOG_N);
}
/*
 * In-place recursive butterfly pass over a block of 2^depth doubles;
 * the helper_double_11 wrapper calls this with depth == 11 (2048 doubles).
 *
 * buf   - start of the (sub)block to transform.  All memory accesses use
 *         unaligned SSE loads/stores (movupd), so only natural double
 *         alignment is assumed.
 * depth - log2 of the current block length.  Only the generated depths
 *         2, 5, 8 and 11 are handled; any other value falls through and
 *         does nothing.
 *
 * At each non-base level the block is split into eight sub-blocks that are
 * transformed recursively, then merged with three rounds of add/sub
 * butterflies (a radix-8 combine) in SSE2 inline assembly, processing two
 * doubles (one xmm register) per pointer per iteration.
 */
void helper_double_11_recursive(double *buf, int depth);
void helper_double_11_recursive(double *buf, int depth) {
  if (depth == 2) {
    /* Base case: one 4-double block held in xmm0/xmm1. */
    for (int j = 0; j < 4; j += 4) {
      for (int k = 0; k < 2; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          /* Intra-register stage on xmm0 = (a, b): haddpd -> (a+b, a+b),
             hsubpd -> (a-b, a-b); blendpd $1 merges to (a+b, a-b). */
          "movapd %%xmm0, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm0\n"
          /* Same intra-register stage on xmm1. */
          "movapd %%xmm1, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm1, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm1\n"
          /* Cross-register butterfly: (x0, x1) -> (x0+x1, x0-x1). */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 5) {
    /* 32 doubles: eight size-4 sub-transforms, then a radix-8 combine of
       the eight sub-blocks (stride 4). */
    helper_double_11_recursive(buf + 0, 2);
    helper_double_11_recursive(buf + 4, 2);
    helper_double_11_recursive(buf + 8, 2);
    helper_double_11_recursive(buf + 12, 2);
    helper_double_11_recursive(buf + 16, 2);
    helper_double_11_recursive(buf + 20, 2);
    helper_double_11_recursive(buf + 24, 2);
    helper_double_11_recursive(buf + 28, 2);
    for (int j = 0; j < 32; j += 32) {
      for (int k = 0; k < 4; k += 2) {
        __asm__ volatile (
          /* One register (two doubles) from each of the 8 sub-blocks. */
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          /* Round 1: butterflies between neighbors (0,1)(2,3)(4,5)(6,7). */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          /* Round 2: butterflies at distance 2. */
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          /* Round 3: butterflies at distance 4; results in xmm8..xmm15. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          /* Write back to the same eight locations. */
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 8) {
    /* 256 doubles: eight size-32 sub-transforms, then a radix-8 combine
       with stride 32.  Same butterfly sequence as depth == 5, only the
       pointer offsets differ. */
    helper_double_11_recursive(buf + 0, 5);
    helper_double_11_recursive(buf + 32, 5);
    helper_double_11_recursive(buf + 64, 5);
    helper_double_11_recursive(buf + 96, 5);
    helper_double_11_recursive(buf + 128, 5);
    helper_double_11_recursive(buf + 160, 5);
    helper_double_11_recursive(buf + 192, 5);
    helper_double_11_recursive(buf + 224, 5);
    for (int j = 0; j < 256; j += 256) {
      for (int k = 0; k < 32; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          /* Round 1: neighbor butterflies. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          /* Round 2: distance-2 butterflies. */
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          /* Round 3: distance-4 butterflies. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 11) {
    /* 2048 doubles: eight size-256 sub-transforms, then a radix-8 combine
       with stride 256. */
    helper_double_11_recursive(buf + 0, 8);
    helper_double_11_recursive(buf + 256, 8);
    helper_double_11_recursive(buf + 512, 8);
    helper_double_11_recursive(buf + 768, 8);
    helper_double_11_recursive(buf + 1024, 8);
    helper_double_11_recursive(buf + 1280, 8);
    helper_double_11_recursive(buf + 1536, 8);
    helper_double_11_recursive(buf + 1792, 8);
    for (int j = 0; j < 2048; j += 2048) {
      for (int k = 0; k < 256; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          /* Round 1: neighbor butterflies. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          /* Round 2: distance-2 butterflies. */
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          /* Round 3: distance-4 butterflies. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
/* Transform a block of 2^11 = 2048 doubles in place: thin wrapper that
   starts the recursive driver at its top depth. */
void helper_double_11(double *data);
void helper_double_11(double *data) {
  helper_double_11_recursive(data, 11);
}
/*
 * In-place recursive butterfly pass over a block of 2^depth doubles;
 * the helper_double_12 wrapper calls this with depth == 12 (4096 doubles).
 *
 * buf   - start of the (sub)block to transform; accessed with unaligned
 *         SSE loads/stores (movupd).
 * depth - log2 of the current block length.  Only the generated depths
 *         10 and 12 are handled; any other value falls through and does
 *         nothing.
 *
 * depth == 10 is a fully unrolled 1024-double transform done as three
 * passes of SSE2 inline assembly (4 + 3 + 3 levels); depth == 12 runs four
 * such sub-transforms and merges them with a radix-4 combine (2 levels).
 */
void helper_double_12_recursive(double *buf, int depth);
void helper_double_12_recursive(double *buf, int depth) {
  if (depth == 10) {
    /* Pass 1 (levels 1-4): within each 16-double chunk, do the size-2
       stage inside every xmm register, then a radix-8 combine of the
       eight registers (stride 2). */
    for (int j = 0; j < 1024; j += 16) {
      for (int k = 0; k < 2; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          /* Intra-register stage on each of xmm0..xmm7: a pair (a, b)
             becomes (a+b, a-b) via haddpd/hsubpd merged with blendpd. */
          "movapd %%xmm0, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm0\n"
          "movapd %%xmm1, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm1, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm2, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm2, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm2\n"
          "movapd %%xmm3, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm3, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm3\n"
          "movapd %%xmm4, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm4, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm4\n"
          "movapd %%xmm5, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm5, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm5\n"
          "movapd %%xmm6, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm6, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm6\n"
          "movapd %%xmm7, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm7, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm7\n"
          /* Radix-8 combine, round 1: neighbor butterflies. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          /* Round 2: distance-2 butterflies. */
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          /* Round 3: distance-4 butterflies; results in xmm8..xmm15. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 2 (levels 5-7): radix-8 combine with stride 16 inside each
       128-double chunk. */
    for (int j = 0; j < 1024; j += 128) {
      for (int k = 0; k < 16; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          /* Round 1: neighbor butterflies. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          /* Round 2: distance-2 butterflies. */
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          /* Round 3: distance-4 butterflies. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 3 (levels 8-10): radix-8 combine with stride 128 across the
       whole 1024-double block. */
    for (int j = 0; j < 1024; j += 1024) {
      for (int k = 0; k < 128; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          /* Round 1: neighbor butterflies. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          /* Round 2: distance-2 butterflies. */
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          /* Round 3: distance-4 butterflies. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 12) {
    /* 4096 doubles: four size-1024 sub-transforms, then a radix-4 combine
       (two butterfly rounds) with stride 1024. */
    helper_double_12_recursive(buf + 0, 10);
    helper_double_12_recursive(buf + 1024, 10);
    helper_double_12_recursive(buf + 2048, 10);
    helper_double_12_recursive(buf + 3072, 10);
    for (int j = 0; j < 4096; j += 4096) {
      for (int k = 0; k < 1024; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          /* Round 1: butterflies (0,1) and (2,3). */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          /* Round 2: butterflies at distance 2. */
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movupd %%xmm0, (%0)\n"
          "movupd %%xmm1, (%1)\n"
          "movupd %%xmm2, (%2)\n"
          "movupd %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
/* Transform a block of 2^12 = 4096 doubles in place: thin wrapper that
   starts the recursive driver at its top depth. */
void helper_double_12(double *data);
void helper_double_12(double *data) {
  helper_double_12_recursive(data, 12);
}
15965 static inline void helper_double_13(double *buf);
helper_double_13(double * buf)15966 static inline void helper_double_13(double *buf) {
15967   for (int j = 0; j < 8192; j += 16) {
15968     for (int k = 0; k < 2; k += 2) {
15969       __asm__ volatile (
15970         "movupd (%0), %%xmm0\n"
15971         "movupd (%1), %%xmm1\n"
15972         "movupd (%2), %%xmm2\n"
15973         "movupd (%3), %%xmm3\n"
15974         "movupd (%4), %%xmm4\n"
15975         "movupd (%5), %%xmm5\n"
15976         "movupd (%6), %%xmm6\n"
15977         "movupd (%7), %%xmm7\n"
15978         "movapd %%xmm0, %%xmm8\n"
15979         "haddpd %%xmm8, %%xmm8\n"
15980         "movapd %%xmm0, %%xmm9\n"
15981         "hsubpd %%xmm9, %%xmm9\n"
15982         "blendpd $1, %%xmm8, %%xmm9\n"
15983         "movapd %%xmm9, %%xmm0\n"
15984         "movapd %%xmm1, %%xmm8\n"
15985         "haddpd %%xmm8, %%xmm8\n"
15986         "movapd %%xmm1, %%xmm9\n"
15987         "hsubpd %%xmm9, %%xmm9\n"
15988         "blendpd $1, %%xmm8, %%xmm9\n"
15989         "movapd %%xmm9, %%xmm1\n"
15990         "movapd %%xmm2, %%xmm8\n"
15991         "haddpd %%xmm8, %%xmm8\n"
15992         "movapd %%xmm2, %%xmm9\n"
15993         "hsubpd %%xmm9, %%xmm9\n"
15994         "blendpd $1, %%xmm8, %%xmm9\n"
15995         "movapd %%xmm9, %%xmm2\n"
15996         "movapd %%xmm3, %%xmm8\n"
15997         "haddpd %%xmm8, %%xmm8\n"
15998         "movapd %%xmm3, %%xmm9\n"
15999         "hsubpd %%xmm9, %%xmm9\n"
16000         "blendpd $1, %%xmm8, %%xmm9\n"
16001         "movapd %%xmm9, %%xmm3\n"
16002         "movapd %%xmm4, %%xmm8\n"
16003         "haddpd %%xmm8, %%xmm8\n"
16004         "movapd %%xmm4, %%xmm9\n"
16005         "hsubpd %%xmm9, %%xmm9\n"
16006         "blendpd $1, %%xmm8, %%xmm9\n"
16007         "movapd %%xmm9, %%xmm4\n"
16008         "movapd %%xmm5, %%xmm8\n"
16009         "haddpd %%xmm8, %%xmm8\n"
16010         "movapd %%xmm5, %%xmm9\n"
16011         "hsubpd %%xmm9, %%xmm9\n"
16012         "blendpd $1, %%xmm8, %%xmm9\n"
16013         "movapd %%xmm9, %%xmm5\n"
16014         "movapd %%xmm6, %%xmm8\n"
16015         "haddpd %%xmm8, %%xmm8\n"
16016         "movapd %%xmm6, %%xmm9\n"
16017         "hsubpd %%xmm9, %%xmm9\n"
16018         "blendpd $1, %%xmm8, %%xmm9\n"
16019         "movapd %%xmm9, %%xmm6\n"
16020         "movapd %%xmm7, %%xmm8\n"
16021         "haddpd %%xmm8, %%xmm8\n"
16022         "movapd %%xmm7, %%xmm9\n"
16023         "hsubpd %%xmm9, %%xmm9\n"
16024         "blendpd $1, %%xmm8, %%xmm9\n"
16025         "movapd %%xmm9, %%xmm7\n"
16026         "movapd %%xmm0, %%xmm8\n"
16027         "movapd %%xmm0, %%xmm9\n"
16028         "addpd %%xmm1, %%xmm8\n"
16029         "subpd %%xmm1, %%xmm9\n"
16030         "movapd %%xmm2, %%xmm10\n"
16031         "movapd %%xmm2, %%xmm11\n"
16032         "addpd %%xmm3, %%xmm10\n"
16033         "subpd %%xmm3, %%xmm11\n"
16034         "movapd %%xmm4, %%xmm12\n"
16035         "movapd %%xmm4, %%xmm13\n"
16036         "addpd %%xmm5, %%xmm12\n"
16037         "subpd %%xmm5, %%xmm13\n"
16038         "movapd %%xmm6, %%xmm14\n"
16039         "movapd %%xmm6, %%xmm15\n"
16040         "addpd %%xmm7, %%xmm14\n"
16041         "subpd %%xmm7, %%xmm15\n"
16042         "movapd %%xmm8, %%xmm0\n"
16043         "movapd %%xmm8, %%xmm2\n"
16044         "addpd %%xmm10, %%xmm0\n"
16045         "subpd %%xmm10, %%xmm2\n"
16046         "movapd %%xmm9, %%xmm1\n"
16047         "movapd %%xmm9, %%xmm3\n"
16048         "addpd %%xmm11, %%xmm1\n"
16049         "subpd %%xmm11, %%xmm3\n"
16050         "movapd %%xmm12, %%xmm4\n"
16051         "movapd %%xmm12, %%xmm6\n"
16052         "addpd %%xmm14, %%xmm4\n"
16053         "subpd %%xmm14, %%xmm6\n"
16054         "movapd %%xmm13, %%xmm5\n"
16055         "movapd %%xmm13, %%xmm7\n"
16056         "addpd %%xmm15, %%xmm5\n"
16057         "subpd %%xmm15, %%xmm7\n"
16058         "movapd %%xmm0, %%xmm8\n"
16059         "movapd %%xmm0, %%xmm12\n"
16060         "addpd %%xmm4, %%xmm8\n"
16061         "subpd %%xmm4, %%xmm12\n"
16062         "movapd %%xmm1, %%xmm9\n"
16063         "movapd %%xmm1, %%xmm13\n"
16064         "addpd %%xmm5, %%xmm9\n"
16065         "subpd %%xmm5, %%xmm13\n"
16066         "movapd %%xmm2, %%xmm10\n"
16067         "movapd %%xmm2, %%xmm14\n"
16068         "addpd %%xmm6, %%xmm10\n"
16069         "subpd %%xmm6, %%xmm14\n"
16070         "movapd %%xmm3, %%xmm11\n"
16071         "movapd %%xmm3, %%xmm15\n"
16072         "addpd %%xmm7, %%xmm11\n"
16073         "subpd %%xmm7, %%xmm15\n"
16074         "movupd %%xmm8, (%0)\n"
16075         "movupd %%xmm9, (%1)\n"
16076         "movupd %%xmm10, (%2)\n"
16077         "movupd %%xmm11, (%3)\n"
16078         "movupd %%xmm12, (%4)\n"
16079         "movupd %%xmm13, (%5)\n"
16080         "movupd %%xmm14, (%6)\n"
16081         "movupd %%xmm15, (%7)\n"
16082         :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
16083       );
16084     }
16085   }
16086   for (int j = 0; j < 8192; j += 128) {
16087     for (int k = 0; k < 16; k += 2) {
16088       __asm__ volatile (
16089         "movupd (%0), %%xmm0\n"
16090         "movupd (%1), %%xmm1\n"
16091         "movupd (%2), %%xmm2\n"
16092         "movupd (%3), %%xmm3\n"
16093         "movupd (%4), %%xmm4\n"
16094         "movupd (%5), %%xmm5\n"
16095         "movupd (%6), %%xmm6\n"
16096         "movupd (%7), %%xmm7\n"
16097         "movapd %%xmm0, %%xmm8\n"
16098         "movapd %%xmm0, %%xmm9\n"
16099         "addpd %%xmm1, %%xmm8\n"
16100         "subpd %%xmm1, %%xmm9\n"
16101         "movapd %%xmm2, %%xmm10\n"
16102         "movapd %%xmm2, %%xmm11\n"
16103         "addpd %%xmm3, %%xmm10\n"
16104         "subpd %%xmm3, %%xmm11\n"
16105         "movapd %%xmm4, %%xmm12\n"
16106         "movapd %%xmm4, %%xmm13\n"
16107         "addpd %%xmm5, %%xmm12\n"
16108         "subpd %%xmm5, %%xmm13\n"
16109         "movapd %%xmm6, %%xmm14\n"
16110         "movapd %%xmm6, %%xmm15\n"
16111         "addpd %%xmm7, %%xmm14\n"
16112         "subpd %%xmm7, %%xmm15\n"
16113         "movapd %%xmm8, %%xmm0\n"
16114         "movapd %%xmm8, %%xmm2\n"
16115         "addpd %%xmm10, %%xmm0\n"
16116         "subpd %%xmm10, %%xmm2\n"
16117         "movapd %%xmm9, %%xmm1\n"
16118         "movapd %%xmm9, %%xmm3\n"
16119         "addpd %%xmm11, %%xmm1\n"
16120         "subpd %%xmm11, %%xmm3\n"
16121         "movapd %%xmm12, %%xmm4\n"
16122         "movapd %%xmm12, %%xmm6\n"
16123         "addpd %%xmm14, %%xmm4\n"
16124         "subpd %%xmm14, %%xmm6\n"
16125         "movapd %%xmm13, %%xmm5\n"
16126         "movapd %%xmm13, %%xmm7\n"
16127         "addpd %%xmm15, %%xmm5\n"
16128         "subpd %%xmm15, %%xmm7\n"
16129         "movapd %%xmm0, %%xmm8\n"
16130         "movapd %%xmm0, %%xmm12\n"
16131         "addpd %%xmm4, %%xmm8\n"
16132         "subpd %%xmm4, %%xmm12\n"
16133         "movapd %%xmm1, %%xmm9\n"
16134         "movapd %%xmm1, %%xmm13\n"
16135         "addpd %%xmm5, %%xmm9\n"
16136         "subpd %%xmm5, %%xmm13\n"
16137         "movapd %%xmm2, %%xmm10\n"
16138         "movapd %%xmm2, %%xmm14\n"
16139         "addpd %%xmm6, %%xmm10\n"
16140         "subpd %%xmm6, %%xmm14\n"
16141         "movapd %%xmm3, %%xmm11\n"
16142         "movapd %%xmm3, %%xmm15\n"
16143         "addpd %%xmm7, %%xmm11\n"
16144         "subpd %%xmm7, %%xmm15\n"
16145         "movupd %%xmm8, (%0)\n"
16146         "movupd %%xmm9, (%1)\n"
16147         "movupd %%xmm10, (%2)\n"
16148         "movupd %%xmm11, (%3)\n"
16149         "movupd %%xmm12, (%4)\n"
16150         "movupd %%xmm13, (%5)\n"
16151         "movupd %%xmm14, (%6)\n"
16152         "movupd %%xmm15, (%7)\n"
16153         :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
16154       );
16155     }
16156   }
16157   for (int j = 0; j < 8192; j += 1024) {
16158     for (int k = 0; k < 128; k += 2) {
16159       __asm__ volatile (
16160         "movupd (%0), %%xmm0\n"
16161         "movupd (%1), %%xmm1\n"
16162         "movupd (%2), %%xmm2\n"
16163         "movupd (%3), %%xmm3\n"
16164         "movupd (%4), %%xmm4\n"
16165         "movupd (%5), %%xmm5\n"
16166         "movupd (%6), %%xmm6\n"
16167         "movupd (%7), %%xmm7\n"
16168         "movapd %%xmm0, %%xmm8\n"
16169         "movapd %%xmm0, %%xmm9\n"
16170         "addpd %%xmm1, %%xmm8\n"
16171         "subpd %%xmm1, %%xmm9\n"
16172         "movapd %%xmm2, %%xmm10\n"
16173         "movapd %%xmm2, %%xmm11\n"
16174         "addpd %%xmm3, %%xmm10\n"
16175         "subpd %%xmm3, %%xmm11\n"
16176         "movapd %%xmm4, %%xmm12\n"
16177         "movapd %%xmm4, %%xmm13\n"
16178         "addpd %%xmm5, %%xmm12\n"
16179         "subpd %%xmm5, %%xmm13\n"
16180         "movapd %%xmm6, %%xmm14\n"
16181         "movapd %%xmm6, %%xmm15\n"
16182         "addpd %%xmm7, %%xmm14\n"
16183         "subpd %%xmm7, %%xmm15\n"
16184         "movapd %%xmm8, %%xmm0\n"
16185         "movapd %%xmm8, %%xmm2\n"
16186         "addpd %%xmm10, %%xmm0\n"
16187         "subpd %%xmm10, %%xmm2\n"
16188         "movapd %%xmm9, %%xmm1\n"
16189         "movapd %%xmm9, %%xmm3\n"
16190         "addpd %%xmm11, %%xmm1\n"
16191         "subpd %%xmm11, %%xmm3\n"
16192         "movapd %%xmm12, %%xmm4\n"
16193         "movapd %%xmm12, %%xmm6\n"
16194         "addpd %%xmm14, %%xmm4\n"
16195         "subpd %%xmm14, %%xmm6\n"
16196         "movapd %%xmm13, %%xmm5\n"
16197         "movapd %%xmm13, %%xmm7\n"
16198         "addpd %%xmm15, %%xmm5\n"
16199         "subpd %%xmm15, %%xmm7\n"
16200         "movapd %%xmm0, %%xmm8\n"
16201         "movapd %%xmm0, %%xmm12\n"
16202         "addpd %%xmm4, %%xmm8\n"
16203         "subpd %%xmm4, %%xmm12\n"
16204         "movapd %%xmm1, %%xmm9\n"
16205         "movapd %%xmm1, %%xmm13\n"
16206         "addpd %%xmm5, %%xmm9\n"
16207         "subpd %%xmm5, %%xmm13\n"
16208         "movapd %%xmm2, %%xmm10\n"
16209         "movapd %%xmm2, %%xmm14\n"
16210         "addpd %%xmm6, %%xmm10\n"
16211         "subpd %%xmm6, %%xmm14\n"
16212         "movapd %%xmm3, %%xmm11\n"
16213         "movapd %%xmm3, %%xmm15\n"
16214         "addpd %%xmm7, %%xmm11\n"
16215         "subpd %%xmm7, %%xmm15\n"
16216         "movupd %%xmm8, (%0)\n"
16217         "movupd %%xmm9, (%1)\n"
16218         "movupd %%xmm10, (%2)\n"
16219         "movupd %%xmm11, (%3)\n"
16220         "movupd %%xmm12, (%4)\n"
16221         "movupd %%xmm13, (%5)\n"
16222         "movupd %%xmm14, (%6)\n"
16223         "movupd %%xmm15, (%7)\n"
16224         :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
16225       );
16226     }
16227   }
16228   for (int j = 0; j < 8192; j += 8192) {
16229     for (int k = 0; k < 1024; k += 2) {
16230       __asm__ volatile (
16231         "movupd (%0), %%xmm0\n"
16232         "movupd (%1), %%xmm1\n"
16233         "movupd (%2), %%xmm2\n"
16234         "movupd (%3), %%xmm3\n"
16235         "movupd (%4), %%xmm4\n"
16236         "movupd (%5), %%xmm5\n"
16237         "movupd (%6), %%xmm6\n"
16238         "movupd (%7), %%xmm7\n"
16239         "movapd %%xmm0, %%xmm8\n"
16240         "movapd %%xmm0, %%xmm9\n"
16241         "addpd %%xmm1, %%xmm8\n"
16242         "subpd %%xmm1, %%xmm9\n"
16243         "movapd %%xmm2, %%xmm10\n"
16244         "movapd %%xmm2, %%xmm11\n"
16245         "addpd %%xmm3, %%xmm10\n"
16246         "subpd %%xmm3, %%xmm11\n"
16247         "movapd %%xmm4, %%xmm12\n"
16248         "movapd %%xmm4, %%xmm13\n"
16249         "addpd %%xmm5, %%xmm12\n"
16250         "subpd %%xmm5, %%xmm13\n"
16251         "movapd %%xmm6, %%xmm14\n"
16252         "movapd %%xmm6, %%xmm15\n"
16253         "addpd %%xmm7, %%xmm14\n"
16254         "subpd %%xmm7, %%xmm15\n"
16255         "movapd %%xmm8, %%xmm0\n"
16256         "movapd %%xmm8, %%xmm2\n"
16257         "addpd %%xmm10, %%xmm0\n"
16258         "subpd %%xmm10, %%xmm2\n"
16259         "movapd %%xmm9, %%xmm1\n"
16260         "movapd %%xmm9, %%xmm3\n"
16261         "addpd %%xmm11, %%xmm1\n"
16262         "subpd %%xmm11, %%xmm3\n"
16263         "movapd %%xmm12, %%xmm4\n"
16264         "movapd %%xmm12, %%xmm6\n"
16265         "addpd %%xmm14, %%xmm4\n"
16266         "subpd %%xmm14, %%xmm6\n"
16267         "movapd %%xmm13, %%xmm5\n"
16268         "movapd %%xmm13, %%xmm7\n"
16269         "addpd %%xmm15, %%xmm5\n"
16270         "subpd %%xmm15, %%xmm7\n"
16271         "movapd %%xmm0, %%xmm8\n"
16272         "movapd %%xmm0, %%xmm12\n"
16273         "addpd %%xmm4, %%xmm8\n"
16274         "subpd %%xmm4, %%xmm12\n"
16275         "movapd %%xmm1, %%xmm9\n"
16276         "movapd %%xmm1, %%xmm13\n"
16277         "addpd %%xmm5, %%xmm9\n"
16278         "subpd %%xmm5, %%xmm13\n"
16279         "movapd %%xmm2, %%xmm10\n"
16280         "movapd %%xmm2, %%xmm14\n"
16281         "addpd %%xmm6, %%xmm10\n"
16282         "subpd %%xmm6, %%xmm14\n"
16283         "movapd %%xmm3, %%xmm11\n"
16284         "movapd %%xmm3, %%xmm15\n"
16285         "addpd %%xmm7, %%xmm11\n"
16286         "subpd %%xmm7, %%xmm15\n"
16287         "movupd %%xmm8, (%0)\n"
16288         "movupd %%xmm9, (%1)\n"
16289         "movupd %%xmm10, (%2)\n"
16290         "movupd %%xmm11, (%3)\n"
16291         "movupd %%xmm12, (%4)\n"
16292         "movupd %%xmm13, (%5)\n"
16293         "movupd %%xmm14, (%6)\n"
16294         "movupd %%xmm15, (%7)\n"
16295         :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
16296       );
16297     }
16298   }
16299 }
void helper_double_14_recursive(double *buf, int depth);
/*
 * In-place unnormalized fast Walsh-Hadamard transform over 2^depth doubles,
 * using SSE2/SSE3 inline assembly (2 doubles per xmm register).
 *
 * This is auto-generated code: only the depths actually reachable from
 * helper_double_14 (9, 12 and 14) are handled.  Any other depth value falls
 * through every `if` and the call is a silent no-op -- presumably safe for
 * the generator's call graph, but callers must not pass other depths.
 *
 * buf   - pointer to 2^depth doubles, overwritten with the transform.
 * depth - log2 of the transform length; must be 9, 12 or 14.
 */
void helper_double_14_recursive(double *buf, int depth) {
  if (depth == 9) {
    /* Base case: full 512-point transform, done as three fused passes.
     *
     * Pass 1 fuses butterfly levels of stride 1, 2, 4 and 8.  Each asm
     * block loads 16 consecutive doubles into xmm0-xmm7; the
     * haddpd/hsubpd + blendpd triple performs the stride-1 butterfly
     * inside each register (low lane = sum, high lane = difference),
     * then three rounds of addpd/subpd do the stride-2/4/8 butterflies
     * across registers.  Inner k-loop runs exactly once (kept by the
     * generator for structural uniformity). */
    for (int j = 0; j < 512; j += 16) {
      for (int k = 0; k < 2; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          /* Stride-1 butterfly within each register: xmmN := {a+b, a-b}. */
          "movapd %%xmm0, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm0\n"
          "movapd %%xmm1, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm1, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm2, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm2, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm2\n"
          "movapd %%xmm3, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm3, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm3\n"
          "movapd %%xmm4, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm4, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm4\n"
          "movapd %%xmm5, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm5, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm5\n"
          "movapd %%xmm6, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm6, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm6\n"
          "movapd %%xmm7, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm7, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm7\n"
          /* Stride-2 butterflies (register pairs 0/1, 2/3, 4/5, 6/7). */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          /* Stride-4 butterflies. */
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          /* Stride-8 butterflies, results land in xmm8-xmm15. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 2 fuses butterfly levels of stride 16, 32 and 64: operand
     * pointers are 16 doubles apart, and the three addpd/subpd rounds
     * cover the three levels.  k walks the 16-double vectors inside
     * each 128-double group. */
    for (int j = 0; j < 512; j += 128) {
      for (int k = 0; k < 16; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 3: the remaining strides 128 and 256 (radix-4, so only four
     * registers are needed), completing the 512-point transform.
     * Outer loop runs once; kept by the generator for uniformity. */
    for (int j = 0; j < 512; j += 512) {
      for (int k = 0; k < 128; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movupd %%xmm0, (%0)\n"
          "movupd %%xmm1, (%1)\n"
          "movupd %%xmm2, (%2)\n"
          "movupd %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 12) {
    /* 4096 = 8 x 512: transform each 512-double sub-block, then one
     * fused radix-8 pass over strides 512, 1024 and 2048. */
    helper_double_14_recursive(buf + 0, 9);
    helper_double_14_recursive(buf + 512, 9);
    helper_double_14_recursive(buf + 1024, 9);
    helper_double_14_recursive(buf + 1536, 9);
    helper_double_14_recursive(buf + 2048, 9);
    helper_double_14_recursive(buf + 2560, 9);
    helper_double_14_recursive(buf + 3072, 9);
    helper_double_14_recursive(buf + 3584, 9);
    for (int j = 0; j < 4096; j += 4096) {
      for (int k = 0; k < 512; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 14) {
    /* 16384 = 4 x 4096: transform each 4096-double sub-block, then one
     * radix-4 pass over strides 4096 and 8192. */
    helper_double_14_recursive(buf + 0, 12);
    helper_double_14_recursive(buf + 4096, 12);
    helper_double_14_recursive(buf + 8192, 12);
    helper_double_14_recursive(buf + 12288, 12);
    for (int j = 0; j < 16384; j += 16384) {
      for (int k = 0; k < 4096; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movupd %%xmm0, (%0)\n"
          "movupd %%xmm1, (%1)\n"
          "movupd %%xmm2, (%2)\n"
          "movupd %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_double_14(double *buf);
/*
 * Public entry point: in-place unnormalized Walsh-Hadamard transform of
 * 2^14 = 16384 doubles.  Simply hands the whole buffer to the recursive
 * worker at the top-level depth.
 */
void helper_double_14(double *buf) {
  const int top_depth = 14; /* log2 of the transform length */
  helper_double_14_recursive(buf, top_depth);
}
16651 void helper_double_15_recursive(double *buf, int depth);
helper_double_15_recursive(double * buf,int depth)16652 void helper_double_15_recursive(double *buf, int depth) {
16653   if (depth == 10) {
16654     for (int j = 0; j < 1024; j += 16) {
16655       for (int k = 0; k < 2; k += 2) {
16656         __asm__ volatile (
16657           "movupd (%0), %%xmm0\n"
16658           "movupd (%1), %%xmm1\n"
16659           "movupd (%2), %%xmm2\n"
16660           "movupd (%3), %%xmm3\n"
16661           "movupd (%4), %%xmm4\n"
16662           "movupd (%5), %%xmm5\n"
16663           "movupd (%6), %%xmm6\n"
16664           "movupd (%7), %%xmm7\n"
16665           "movapd %%xmm0, %%xmm8\n"
16666           "haddpd %%xmm8, %%xmm8\n"
16667           "movapd %%xmm0, %%xmm9\n"
16668           "hsubpd %%xmm9, %%xmm9\n"
16669           "blendpd $1, %%xmm8, %%xmm9\n"
16670           "movapd %%xmm9, %%xmm0\n"
16671           "movapd %%xmm1, %%xmm8\n"
16672           "haddpd %%xmm8, %%xmm8\n"
16673           "movapd %%xmm1, %%xmm9\n"
16674           "hsubpd %%xmm9, %%xmm9\n"
16675           "blendpd $1, %%xmm8, %%xmm9\n"
16676           "movapd %%xmm9, %%xmm1\n"
16677           "movapd %%xmm2, %%xmm8\n"
16678           "haddpd %%xmm8, %%xmm8\n"
16679           "movapd %%xmm2, %%xmm9\n"
16680           "hsubpd %%xmm9, %%xmm9\n"
16681           "blendpd $1, %%xmm8, %%xmm9\n"
16682           "movapd %%xmm9, %%xmm2\n"
16683           "movapd %%xmm3, %%xmm8\n"
16684           "haddpd %%xmm8, %%xmm8\n"
16685           "movapd %%xmm3, %%xmm9\n"
16686           "hsubpd %%xmm9, %%xmm9\n"
16687           "blendpd $1, %%xmm8, %%xmm9\n"
16688           "movapd %%xmm9, %%xmm3\n"
16689           "movapd %%xmm4, %%xmm8\n"
16690           "haddpd %%xmm8, %%xmm8\n"
16691           "movapd %%xmm4, %%xmm9\n"
16692           "hsubpd %%xmm9, %%xmm9\n"
16693           "blendpd $1, %%xmm8, %%xmm9\n"
16694           "movapd %%xmm9, %%xmm4\n"
16695           "movapd %%xmm5, %%xmm8\n"
16696           "haddpd %%xmm8, %%xmm8\n"
16697           "movapd %%xmm5, %%xmm9\n"
16698           "hsubpd %%xmm9, %%xmm9\n"
16699           "blendpd $1, %%xmm8, %%xmm9\n"
16700           "movapd %%xmm9, %%xmm5\n"
16701           "movapd %%xmm6, %%xmm8\n"
16702           "haddpd %%xmm8, %%xmm8\n"
16703           "movapd %%xmm6, %%xmm9\n"
16704           "hsubpd %%xmm9, %%xmm9\n"
16705           "blendpd $1, %%xmm8, %%xmm9\n"
16706           "movapd %%xmm9, %%xmm6\n"
16707           "movapd %%xmm7, %%xmm8\n"
16708           "haddpd %%xmm8, %%xmm8\n"
16709           "movapd %%xmm7, %%xmm9\n"
16710           "hsubpd %%xmm9, %%xmm9\n"
16711           "blendpd $1, %%xmm8, %%xmm9\n"
16712           "movapd %%xmm9, %%xmm7\n"
16713           "movapd %%xmm0, %%xmm8\n"
16714           "movapd %%xmm0, %%xmm9\n"
16715           "addpd %%xmm1, %%xmm8\n"
16716           "subpd %%xmm1, %%xmm9\n"
16717           "movapd %%xmm2, %%xmm10\n"
16718           "movapd %%xmm2, %%xmm11\n"
16719           "addpd %%xmm3, %%xmm10\n"
16720           "subpd %%xmm3, %%xmm11\n"
16721           "movapd %%xmm4, %%xmm12\n"
16722           "movapd %%xmm4, %%xmm13\n"
16723           "addpd %%xmm5, %%xmm12\n"
16724           "subpd %%xmm5, %%xmm13\n"
16725           "movapd %%xmm6, %%xmm14\n"
16726           "movapd %%xmm6, %%xmm15\n"
16727           "addpd %%xmm7, %%xmm14\n"
16728           "subpd %%xmm7, %%xmm15\n"
16729           "movapd %%xmm8, %%xmm0\n"
16730           "movapd %%xmm8, %%xmm2\n"
16731           "addpd %%xmm10, %%xmm0\n"
16732           "subpd %%xmm10, %%xmm2\n"
16733           "movapd %%xmm9, %%xmm1\n"
16734           "movapd %%xmm9, %%xmm3\n"
16735           "addpd %%xmm11, %%xmm1\n"
16736           "subpd %%xmm11, %%xmm3\n"
16737           "movapd %%xmm12, %%xmm4\n"
16738           "movapd %%xmm12, %%xmm6\n"
16739           "addpd %%xmm14, %%xmm4\n"
16740           "subpd %%xmm14, %%xmm6\n"
16741           "movapd %%xmm13, %%xmm5\n"
16742           "movapd %%xmm13, %%xmm7\n"
16743           "addpd %%xmm15, %%xmm5\n"
16744           "subpd %%xmm15, %%xmm7\n"
16745           "movapd %%xmm0, %%xmm8\n"
16746           "movapd %%xmm0, %%xmm12\n"
16747           "addpd %%xmm4, %%xmm8\n"
16748           "subpd %%xmm4, %%xmm12\n"
16749           "movapd %%xmm1, %%xmm9\n"
16750           "movapd %%xmm1, %%xmm13\n"
16751           "addpd %%xmm5, %%xmm9\n"
16752           "subpd %%xmm5, %%xmm13\n"
16753           "movapd %%xmm2, %%xmm10\n"
16754           "movapd %%xmm2, %%xmm14\n"
16755           "addpd %%xmm6, %%xmm10\n"
16756           "subpd %%xmm6, %%xmm14\n"
16757           "movapd %%xmm3, %%xmm11\n"
16758           "movapd %%xmm3, %%xmm15\n"
16759           "addpd %%xmm7, %%xmm11\n"
16760           "subpd %%xmm7, %%xmm15\n"
16761           "movupd %%xmm8, (%0)\n"
16762           "movupd %%xmm9, (%1)\n"
16763           "movupd %%xmm10, (%2)\n"
16764           "movupd %%xmm11, (%3)\n"
16765           "movupd %%xmm12, (%4)\n"
16766           "movupd %%xmm13, (%5)\n"
16767           "movupd %%xmm14, (%6)\n"
16768           "movupd %%xmm15, (%7)\n"
16769           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
16770         );
16771       }
16772     }
16773     for (int j = 0; j < 1024; j += 128) {
16774       for (int k = 0; k < 16; k += 2) {
16775         __asm__ volatile (
16776           "movupd (%0), %%xmm0\n"
16777           "movupd (%1), %%xmm1\n"
16778           "movupd (%2), %%xmm2\n"
16779           "movupd (%3), %%xmm3\n"
16780           "movupd (%4), %%xmm4\n"
16781           "movupd (%5), %%xmm5\n"
16782           "movupd (%6), %%xmm6\n"
16783           "movupd (%7), %%xmm7\n"
16784           "movapd %%xmm0, %%xmm8\n"
16785           "movapd %%xmm0, %%xmm9\n"
16786           "addpd %%xmm1, %%xmm8\n"
16787           "subpd %%xmm1, %%xmm9\n"
16788           "movapd %%xmm2, %%xmm10\n"
16789           "movapd %%xmm2, %%xmm11\n"
16790           "addpd %%xmm3, %%xmm10\n"
16791           "subpd %%xmm3, %%xmm11\n"
16792           "movapd %%xmm4, %%xmm12\n"
16793           "movapd %%xmm4, %%xmm13\n"
16794           "addpd %%xmm5, %%xmm12\n"
16795           "subpd %%xmm5, %%xmm13\n"
16796           "movapd %%xmm6, %%xmm14\n"
16797           "movapd %%xmm6, %%xmm15\n"
16798           "addpd %%xmm7, %%xmm14\n"
16799           "subpd %%xmm7, %%xmm15\n"
16800           "movapd %%xmm8, %%xmm0\n"
16801           "movapd %%xmm8, %%xmm2\n"
16802           "addpd %%xmm10, %%xmm0\n"
16803           "subpd %%xmm10, %%xmm2\n"
16804           "movapd %%xmm9, %%xmm1\n"
16805           "movapd %%xmm9, %%xmm3\n"
16806           "addpd %%xmm11, %%xmm1\n"
16807           "subpd %%xmm11, %%xmm3\n"
16808           "movapd %%xmm12, %%xmm4\n"
16809           "movapd %%xmm12, %%xmm6\n"
16810           "addpd %%xmm14, %%xmm4\n"
16811           "subpd %%xmm14, %%xmm6\n"
16812           "movapd %%xmm13, %%xmm5\n"
16813           "movapd %%xmm13, %%xmm7\n"
16814           "addpd %%xmm15, %%xmm5\n"
16815           "subpd %%xmm15, %%xmm7\n"
16816           "movapd %%xmm0, %%xmm8\n"
16817           "movapd %%xmm0, %%xmm12\n"
16818           "addpd %%xmm4, %%xmm8\n"
16819           "subpd %%xmm4, %%xmm12\n"
16820           "movapd %%xmm1, %%xmm9\n"
16821           "movapd %%xmm1, %%xmm13\n"
16822           "addpd %%xmm5, %%xmm9\n"
16823           "subpd %%xmm5, %%xmm13\n"
16824           "movapd %%xmm2, %%xmm10\n"
16825           "movapd %%xmm2, %%xmm14\n"
16826           "addpd %%xmm6, %%xmm10\n"
16827           "subpd %%xmm6, %%xmm14\n"
16828           "movapd %%xmm3, %%xmm11\n"
16829           "movapd %%xmm3, %%xmm15\n"
16830           "addpd %%xmm7, %%xmm11\n"
16831           "subpd %%xmm7, %%xmm15\n"
16832           "movupd %%xmm8, (%0)\n"
16833           "movupd %%xmm9, (%1)\n"
16834           "movupd %%xmm10, (%2)\n"
16835           "movupd %%xmm11, (%3)\n"
16836           "movupd %%xmm12, (%4)\n"
16837           "movupd %%xmm13, (%5)\n"
16838           "movupd %%xmm14, (%6)\n"
16839           "movupd %%xmm15, (%7)\n"
16840           :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
16841         );
16842       }
16843     }
16844     for (int j = 0; j < 1024; j += 1024) {
16845       for (int k = 0; k < 128; k += 2) {
16846         __asm__ volatile (
16847           "movupd (%0), %%xmm0\n"
16848           "movupd (%1), %%xmm1\n"
16849           "movupd (%2), %%xmm2\n"
16850           "movupd (%3), %%xmm3\n"
16851           "movupd (%4), %%xmm4\n"
16852           "movupd (%5), %%xmm5\n"
16853           "movupd (%6), %%xmm6\n"
16854           "movupd (%7), %%xmm7\n"
16855           "movapd %%xmm0, %%xmm8\n"
16856           "movapd %%xmm0, %%xmm9\n"
16857           "addpd %%xmm1, %%xmm8\n"
16858           "subpd %%xmm1, %%xmm9\n"
16859           "movapd %%xmm2, %%xmm10\n"
16860           "movapd %%xmm2, %%xmm11\n"
16861           "addpd %%xmm3, %%xmm10\n"
16862           "subpd %%xmm3, %%xmm11\n"
16863           "movapd %%xmm4, %%xmm12\n"
16864           "movapd %%xmm4, %%xmm13\n"
16865           "addpd %%xmm5, %%xmm12\n"
16866           "subpd %%xmm5, %%xmm13\n"
16867           "movapd %%xmm6, %%xmm14\n"
16868           "movapd %%xmm6, %%xmm15\n"
16869           "addpd %%xmm7, %%xmm14\n"
16870           "subpd %%xmm7, %%xmm15\n"
16871           "movapd %%xmm8, %%xmm0\n"
16872           "movapd %%xmm8, %%xmm2\n"
16873           "addpd %%xmm10, %%xmm0\n"
16874           "subpd %%xmm10, %%xmm2\n"
16875           "movapd %%xmm9, %%xmm1\n"
16876           "movapd %%xmm9, %%xmm3\n"
16877           "addpd %%xmm11, %%xmm1\n"
16878           "subpd %%xmm11, %%xmm3\n"
16879           "movapd %%xmm12, %%xmm4\n"
16880           "movapd %%xmm12, %%xmm6\n"
16881           "addpd %%xmm14, %%xmm4\n"
16882           "subpd %%xmm14, %%xmm6\n"
16883           "movapd %%xmm13, %%xmm5\n"
16884           "movapd %%xmm13, %%xmm7\n"
16885           "addpd %%xmm15, %%xmm5\n"
16886           "subpd %%xmm15, %%xmm7\n"
16887           "movapd %%xmm0, %%xmm8\n"
16888           "movapd %%xmm0, %%xmm12\n"
16889           "addpd %%xmm4, %%xmm8\n"
16890           "subpd %%xmm4, %%xmm12\n"
16891           "movapd %%xmm1, %%xmm9\n"
16892           "movapd %%xmm1, %%xmm13\n"
16893           "addpd %%xmm5, %%xmm9\n"
16894           "subpd %%xmm5, %%xmm13\n"
16895           "movapd %%xmm2, %%xmm10\n"
16896           "movapd %%xmm2, %%xmm14\n"
16897           "addpd %%xmm6, %%xmm10\n"
16898           "subpd %%xmm6, %%xmm14\n"
16899           "movapd %%xmm3, %%xmm11\n"
16900           "movapd %%xmm3, %%xmm15\n"
16901           "addpd %%xmm7, %%xmm11\n"
16902           "subpd %%xmm7, %%xmm15\n"
16903           "movupd %%xmm8, (%0)\n"
16904           "movupd %%xmm9, (%1)\n"
16905           "movupd %%xmm10, (%2)\n"
16906           "movupd %%xmm11, (%3)\n"
16907           "movupd %%xmm12, (%4)\n"
16908           "movupd %%xmm13, (%5)\n"
16909           "movupd %%xmm14, (%6)\n"
16910           "movupd %%xmm15, (%7)\n"
16911           :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
16912         );
16913       }
16914     }
16915     return;
16916   }
16917   if (depth == 13) {
16918     helper_double_15_recursive(buf + 0, 10);
16919     helper_double_15_recursive(buf + 1024, 10);
16920     helper_double_15_recursive(buf + 2048, 10);
16921     helper_double_15_recursive(buf + 3072, 10);
16922     helper_double_15_recursive(buf + 4096, 10);
16923     helper_double_15_recursive(buf + 5120, 10);
16924     helper_double_15_recursive(buf + 6144, 10);
16925     helper_double_15_recursive(buf + 7168, 10);
16926     for (int j = 0; j < 8192; j += 8192) {
16927       for (int k = 0; k < 1024; k += 2) {
16928         __asm__ volatile (
16929           "movupd (%0), %%xmm0\n"
16930           "movupd (%1), %%xmm1\n"
16931           "movupd (%2), %%xmm2\n"
16932           "movupd (%3), %%xmm3\n"
16933           "movupd (%4), %%xmm4\n"
16934           "movupd (%5), %%xmm5\n"
16935           "movupd (%6), %%xmm6\n"
16936           "movupd (%7), %%xmm7\n"
16937           "movapd %%xmm0, %%xmm8\n"
16938           "movapd %%xmm0, %%xmm9\n"
16939           "addpd %%xmm1, %%xmm8\n"
16940           "subpd %%xmm1, %%xmm9\n"
16941           "movapd %%xmm2, %%xmm10\n"
16942           "movapd %%xmm2, %%xmm11\n"
16943           "addpd %%xmm3, %%xmm10\n"
16944           "subpd %%xmm3, %%xmm11\n"
16945           "movapd %%xmm4, %%xmm12\n"
16946           "movapd %%xmm4, %%xmm13\n"
16947           "addpd %%xmm5, %%xmm12\n"
16948           "subpd %%xmm5, %%xmm13\n"
16949           "movapd %%xmm6, %%xmm14\n"
16950           "movapd %%xmm6, %%xmm15\n"
16951           "addpd %%xmm7, %%xmm14\n"
16952           "subpd %%xmm7, %%xmm15\n"
16953           "movapd %%xmm8, %%xmm0\n"
16954           "movapd %%xmm8, %%xmm2\n"
16955           "addpd %%xmm10, %%xmm0\n"
16956           "subpd %%xmm10, %%xmm2\n"
16957           "movapd %%xmm9, %%xmm1\n"
16958           "movapd %%xmm9, %%xmm3\n"
16959           "addpd %%xmm11, %%xmm1\n"
16960           "subpd %%xmm11, %%xmm3\n"
16961           "movapd %%xmm12, %%xmm4\n"
16962           "movapd %%xmm12, %%xmm6\n"
16963           "addpd %%xmm14, %%xmm4\n"
16964           "subpd %%xmm14, %%xmm6\n"
16965           "movapd %%xmm13, %%xmm5\n"
16966           "movapd %%xmm13, %%xmm7\n"
16967           "addpd %%xmm15, %%xmm5\n"
16968           "subpd %%xmm15, %%xmm7\n"
16969           "movapd %%xmm0, %%xmm8\n"
16970           "movapd %%xmm0, %%xmm12\n"
16971           "addpd %%xmm4, %%xmm8\n"
16972           "subpd %%xmm4, %%xmm12\n"
16973           "movapd %%xmm1, %%xmm9\n"
16974           "movapd %%xmm1, %%xmm13\n"
16975           "addpd %%xmm5, %%xmm9\n"
16976           "subpd %%xmm5, %%xmm13\n"
16977           "movapd %%xmm2, %%xmm10\n"
16978           "movapd %%xmm2, %%xmm14\n"
16979           "addpd %%xmm6, %%xmm10\n"
16980           "subpd %%xmm6, %%xmm14\n"
16981           "movapd %%xmm3, %%xmm11\n"
16982           "movapd %%xmm3, %%xmm15\n"
16983           "addpd %%xmm7, %%xmm11\n"
16984           "subpd %%xmm7, %%xmm15\n"
16985           "movupd %%xmm8, (%0)\n"
16986           "movupd %%xmm9, (%1)\n"
16987           "movupd %%xmm10, (%2)\n"
16988           "movupd %%xmm11, (%3)\n"
16989           "movupd %%xmm12, (%4)\n"
16990           "movupd %%xmm13, (%5)\n"
16991           "movupd %%xmm14, (%6)\n"
16992           "movupd %%xmm15, (%7)\n"
16993           :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
16994         );
16995       }
16996     }
16997     return;
16998   }
16999   if (depth == 15) {
17000     helper_double_15_recursive(buf + 0, 13);
17001     helper_double_15_recursive(buf + 8192, 13);
17002     helper_double_15_recursive(buf + 16384, 13);
17003     helper_double_15_recursive(buf + 24576, 13);
17004     for (int j = 0; j < 32768; j += 32768) {
17005       for (int k = 0; k < 8192; k += 2) {
17006         __asm__ volatile (
17007           "movupd (%0), %%xmm0\n"
17008           "movupd (%1), %%xmm1\n"
17009           "movupd (%2), %%xmm2\n"
17010           "movupd (%3), %%xmm3\n"
17011           "movapd %%xmm0, %%xmm8\n"
17012           "movapd %%xmm0, %%xmm9\n"
17013           "addpd %%xmm1, %%xmm8\n"
17014           "subpd %%xmm1, %%xmm9\n"
17015           "movapd %%xmm2, %%xmm10\n"
17016           "movapd %%xmm2, %%xmm11\n"
17017           "addpd %%xmm3, %%xmm10\n"
17018           "subpd %%xmm3, %%xmm11\n"
17019           "movapd %%xmm8, %%xmm0\n"
17020           "movapd %%xmm8, %%xmm2\n"
17021           "addpd %%xmm10, %%xmm0\n"
17022           "subpd %%xmm10, %%xmm2\n"
17023           "movapd %%xmm9, %%xmm1\n"
17024           "movapd %%xmm9, %%xmm3\n"
17025           "addpd %%xmm11, %%xmm1\n"
17026           "subpd %%xmm11, %%xmm3\n"
17027           "movupd %%xmm0, (%0)\n"
17028           "movupd %%xmm1, (%1)\n"
17029           "movupd %%xmm2, (%2)\n"
17030           "movupd %%xmm3, (%3)\n"
17031           :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
17032         );
17033       }
17034     }
17035     return;
17036   }
17037 }
void helper_double_15(double *buf);
/* Public entry point: in-place unnormalized transform of 2^15 doubles.
 * Simply kicks off the recursive driver at the top level (depth 15). */
void helper_double_15(double *buf) {
  helper_double_15_recursive(buf, 15);
}
void helper_double_16_recursive(double *buf, int depth);
/* One level of a machine-generated, in-place, unnormalized radix-2
 * butterfly transform (Hadamard-style: each stage replaces pairs (u, v)
 * with (u + v, u - v)) over a buffer of 2^16 doubles.
 *
 * `depth` selects which precomputed level to run; only the generated
 * depths 2, 5, 8, 11, 14 and 16 are handled — any other value falls
 * through and does nothing.  Each non-base case first recurses on
 * contiguous sub-blocks, then cross-combines those sub-blocks with SSE2
 * add/sub butterflies (addpd/subpd on 2-double xmm vectors).  The base
 * case (depth == 2) additionally does the within-register butterfly via
 * SSE3 haddpd/hsubpd merged with SSE4.1 blendpd.
 *
 * All asm blocks use movupd loads/stores (no alignment requirement on
 * `buf`), declare every xmm register plus "memory" as clobbered, and
 * take the element addresses as input operands only.
 *
 * NOTE(review): generated code — the single-iteration outer `j` loops
 * (e.g. `for (j = 0; j < 4; j += 4)`) are artifacts of the generator,
 * not logic errors. */
void helper_double_16_recursive(double *buf, int depth) {
  if (depth == 2) {
    /* Base case: 4-point transform on buf[0..3]. */
    for (int j = 0; j < 4; j += 4) {
      for (int k = 0; k < 2; k += 2) {
        __asm__ volatile (
          /* Load two 2-double vectors. */
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          /* In-register size-2 butterfly on xmm0:
           * haddpd gives (lo+hi), hsubpd gives (lo-hi); blendpd $1 keeps
           * the sum in lane 0 and the difference in lane 1. */
          "movapd %%xmm0, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm0\n"
          /* Same size-2 butterfly on xmm1. */
          "movapd %%xmm1, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm1, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm1\n"
          /* Cross butterfly between the two vectors: sum and difference. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 5) {
    /* 32-point level: eight 4-point sub-transforms, then three butterfly
     * layers combining the eight sub-blocks (stride 4). */
    helper_double_16_recursive(buf + 0, 2);
    helper_double_16_recursive(buf + 4, 2);
    helper_double_16_recursive(buf + 8, 2);
    helper_double_16_recursive(buf + 12, 2);
    helper_double_16_recursive(buf + 16, 2);
    helper_double_16_recursive(buf + 20, 2);
    helper_double_16_recursive(buf + 24, 2);
    helper_double_16_recursive(buf + 28, 2);
    for (int j = 0; j < 32; j += 32) {
      for (int k = 0; k < 4; k += 2) {
        __asm__ volatile (
          /* Load one 2-double vector from each of the 8 sub-blocks. */
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          /* Layer 1: butterflies between adjacent pairs (0,1) (2,3) (4,5) (6,7). */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          /* Layer 2: butterflies at distance 2. */
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          /* Layer 3: butterflies at distance 4; results land in xmm8-xmm15. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          /* Store back to the same 8 locations. */
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 8) {
    /* 256-point level: eight 32-point sub-transforms, then the same
     * three-layer 8-way combine with stride 32. */
    helper_double_16_recursive(buf + 0, 5);
    helper_double_16_recursive(buf + 32, 5);
    helper_double_16_recursive(buf + 64, 5);
    helper_double_16_recursive(buf + 96, 5);
    helper_double_16_recursive(buf + 128, 5);
    helper_double_16_recursive(buf + 160, 5);
    helper_double_16_recursive(buf + 192, 5);
    helper_double_16_recursive(buf + 224, 5);
    for (int j = 0; j < 256; j += 256) {
      for (int k = 0; k < 32; k += 2) {
        __asm__ volatile (
          /* Identical 8-way, 3-layer butterfly as the depth-5 case,
           * operating on stride-32 stripes. */
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 11) {
    /* 2048-point level: eight 256-point sub-transforms, then the 8-way
     * combine with stride 256. */
    helper_double_16_recursive(buf + 0, 8);
    helper_double_16_recursive(buf + 256, 8);
    helper_double_16_recursive(buf + 512, 8);
    helper_double_16_recursive(buf + 768, 8);
    helper_double_16_recursive(buf + 1024, 8);
    helper_double_16_recursive(buf + 1280, 8);
    helper_double_16_recursive(buf + 1536, 8);
    helper_double_16_recursive(buf + 1792, 8);
    for (int j = 0; j < 2048; j += 2048) {
      for (int k = 0; k < 256; k += 2) {
        __asm__ volatile (
          /* Same 8-way, 3-layer butterfly; stride-256 stripes. */
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 14) {
    /* 16384-point level: eight 2048-point sub-transforms, then the 8-way
     * combine with stride 2048. */
    helper_double_16_recursive(buf + 0, 11);
    helper_double_16_recursive(buf + 2048, 11);
    helper_double_16_recursive(buf + 4096, 11);
    helper_double_16_recursive(buf + 6144, 11);
    helper_double_16_recursive(buf + 8192, 11);
    helper_double_16_recursive(buf + 10240, 11);
    helper_double_16_recursive(buf + 12288, 11);
    helper_double_16_recursive(buf + 14336, 11);
    for (int j = 0; j < 16384; j += 16384) {
      for (int k = 0; k < 2048; k += 2) {
        __asm__ volatile (
          /* Same 8-way, 3-layer butterfly; stride-2048 stripes. */
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 16) {
    /* Top level (2^16 points): only a factor of 4 remains (16 = 14 + 2),
     * so: four 16384-point sub-transforms, then a 4-way, two-layer
     * butterfly combine with stride 16384. */
    helper_double_16_recursive(buf + 0, 14);
    helper_double_16_recursive(buf + 16384, 14);
    helper_double_16_recursive(buf + 32768, 14);
    helper_double_16_recursive(buf + 49152, 14);
    for (int j = 0; j < 65536; j += 65536) {
      for (int k = 0; k < 16384; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          /* Layer 1: butterflies (0,1) and (2,3). */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          /* Layer 2: butterflies at distance 2. */
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movupd %%xmm0, (%0)\n"
          "movupd %%xmm1, (%1)\n"
          "movupd %%xmm2, (%2)\n"
          "movupd %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_double_16(double *buf);
/* Public entry point: in-place unnormalized transform of 2^16 doubles.
 * Simply kicks off the recursive driver at the top level (depth 16). */
void helper_double_16(double *buf) {
  helper_double_16_recursive(buf, 16);
}
17445 void helper_double_17_recursive(double *buf, int depth);
helper_double_17_recursive(double * buf,int depth)17446 void helper_double_17_recursive(double *buf, int depth) {
17447   if (depth == 12) {
17448     for (int j = 0; j < 4096; j += 16) {
17449       for (int k = 0; k < 2; k += 2) {
17450         __asm__ volatile (
17451           "movupd (%0), %%xmm0\n"
17452           "movupd (%1), %%xmm1\n"
17453           "movupd (%2), %%xmm2\n"
17454           "movupd (%3), %%xmm3\n"
17455           "movupd (%4), %%xmm4\n"
17456           "movupd (%5), %%xmm5\n"
17457           "movupd (%6), %%xmm6\n"
17458           "movupd (%7), %%xmm7\n"
17459           "movapd %%xmm0, %%xmm8\n"
17460           "haddpd %%xmm8, %%xmm8\n"
17461           "movapd %%xmm0, %%xmm9\n"
17462           "hsubpd %%xmm9, %%xmm9\n"
17463           "blendpd $1, %%xmm8, %%xmm9\n"
17464           "movapd %%xmm9, %%xmm0\n"
17465           "movapd %%xmm1, %%xmm8\n"
17466           "haddpd %%xmm8, %%xmm8\n"
17467           "movapd %%xmm1, %%xmm9\n"
17468           "hsubpd %%xmm9, %%xmm9\n"
17469           "blendpd $1, %%xmm8, %%xmm9\n"
17470           "movapd %%xmm9, %%xmm1\n"
17471           "movapd %%xmm2, %%xmm8\n"
17472           "haddpd %%xmm8, %%xmm8\n"
17473           "movapd %%xmm2, %%xmm9\n"
17474           "hsubpd %%xmm9, %%xmm9\n"
17475           "blendpd $1, %%xmm8, %%xmm9\n"
17476           "movapd %%xmm9, %%xmm2\n"
17477           "movapd %%xmm3, %%xmm8\n"
17478           "haddpd %%xmm8, %%xmm8\n"
17479           "movapd %%xmm3, %%xmm9\n"
17480           "hsubpd %%xmm9, %%xmm9\n"
17481           "blendpd $1, %%xmm8, %%xmm9\n"
17482           "movapd %%xmm9, %%xmm3\n"
17483           "movapd %%xmm4, %%xmm8\n"
17484           "haddpd %%xmm8, %%xmm8\n"
17485           "movapd %%xmm4, %%xmm9\n"
17486           "hsubpd %%xmm9, %%xmm9\n"
17487           "blendpd $1, %%xmm8, %%xmm9\n"
17488           "movapd %%xmm9, %%xmm4\n"
17489           "movapd %%xmm5, %%xmm8\n"
17490           "haddpd %%xmm8, %%xmm8\n"
17491           "movapd %%xmm5, %%xmm9\n"
17492           "hsubpd %%xmm9, %%xmm9\n"
17493           "blendpd $1, %%xmm8, %%xmm9\n"
17494           "movapd %%xmm9, %%xmm5\n"
17495           "movapd %%xmm6, %%xmm8\n"
17496           "haddpd %%xmm8, %%xmm8\n"
17497           "movapd %%xmm6, %%xmm9\n"
17498           "hsubpd %%xmm9, %%xmm9\n"
17499           "blendpd $1, %%xmm8, %%xmm9\n"
17500           "movapd %%xmm9, %%xmm6\n"
17501           "movapd %%xmm7, %%xmm8\n"
17502           "haddpd %%xmm8, %%xmm8\n"
17503           "movapd %%xmm7, %%xmm9\n"
17504           "hsubpd %%xmm9, %%xmm9\n"
17505           "blendpd $1, %%xmm8, %%xmm9\n"
17506           "movapd %%xmm9, %%xmm7\n"
17507           "movapd %%xmm0, %%xmm8\n"
17508           "movapd %%xmm0, %%xmm9\n"
17509           "addpd %%xmm1, %%xmm8\n"
17510           "subpd %%xmm1, %%xmm9\n"
17511           "movapd %%xmm2, %%xmm10\n"
17512           "movapd %%xmm2, %%xmm11\n"
17513           "addpd %%xmm3, %%xmm10\n"
17514           "subpd %%xmm3, %%xmm11\n"
17515           "movapd %%xmm4, %%xmm12\n"
17516           "movapd %%xmm4, %%xmm13\n"
17517           "addpd %%xmm5, %%xmm12\n"
17518           "subpd %%xmm5, %%xmm13\n"
17519           "movapd %%xmm6, %%xmm14\n"
17520           "movapd %%xmm6, %%xmm15\n"
17521           "addpd %%xmm7, %%xmm14\n"
17522           "subpd %%xmm7, %%xmm15\n"
17523           "movapd %%xmm8, %%xmm0\n"
17524           "movapd %%xmm8, %%xmm2\n"
17525           "addpd %%xmm10, %%xmm0\n"
17526           "subpd %%xmm10, %%xmm2\n"
17527           "movapd %%xmm9, %%xmm1\n"
17528           "movapd %%xmm9, %%xmm3\n"
17529           "addpd %%xmm11, %%xmm1\n"
17530           "subpd %%xmm11, %%xmm3\n"
17531           "movapd %%xmm12, %%xmm4\n"
17532           "movapd %%xmm12, %%xmm6\n"
17533           "addpd %%xmm14, %%xmm4\n"
17534           "subpd %%xmm14, %%xmm6\n"
17535           "movapd %%xmm13, %%xmm5\n"
17536           "movapd %%xmm13, %%xmm7\n"
17537           "addpd %%xmm15, %%xmm5\n"
17538           "subpd %%xmm15, %%xmm7\n"
17539           "movapd %%xmm0, %%xmm8\n"
17540           "movapd %%xmm0, %%xmm12\n"
17541           "addpd %%xmm4, %%xmm8\n"
17542           "subpd %%xmm4, %%xmm12\n"
17543           "movapd %%xmm1, %%xmm9\n"
17544           "movapd %%xmm1, %%xmm13\n"
17545           "addpd %%xmm5, %%xmm9\n"
17546           "subpd %%xmm5, %%xmm13\n"
17547           "movapd %%xmm2, %%xmm10\n"
17548           "movapd %%xmm2, %%xmm14\n"
17549           "addpd %%xmm6, %%xmm10\n"
17550           "subpd %%xmm6, %%xmm14\n"
17551           "movapd %%xmm3, %%xmm11\n"
17552           "movapd %%xmm3, %%xmm15\n"
17553           "addpd %%xmm7, %%xmm11\n"
17554           "subpd %%xmm7, %%xmm15\n"
17555           "movupd %%xmm8, (%0)\n"
17556           "movupd %%xmm9, (%1)\n"
17557           "movupd %%xmm10, (%2)\n"
17558           "movupd %%xmm11, (%3)\n"
17559           "movupd %%xmm12, (%4)\n"
17560           "movupd %%xmm13, (%5)\n"
17561           "movupd %%xmm14, (%6)\n"
17562           "movupd %%xmm15, (%7)\n"
17563           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
17564         );
17565       }
17566     }
17567     for (int j = 0; j < 4096; j += 128) {
17568       for (int k = 0; k < 16; k += 2) {
17569         __asm__ volatile (
17570           "movupd (%0), %%xmm0\n"
17571           "movupd (%1), %%xmm1\n"
17572           "movupd (%2), %%xmm2\n"
17573           "movupd (%3), %%xmm3\n"
17574           "movupd (%4), %%xmm4\n"
17575           "movupd (%5), %%xmm5\n"
17576           "movupd (%6), %%xmm6\n"
17577           "movupd (%7), %%xmm7\n"
17578           "movapd %%xmm0, %%xmm8\n"
17579           "movapd %%xmm0, %%xmm9\n"
17580           "addpd %%xmm1, %%xmm8\n"
17581           "subpd %%xmm1, %%xmm9\n"
17582           "movapd %%xmm2, %%xmm10\n"
17583           "movapd %%xmm2, %%xmm11\n"
17584           "addpd %%xmm3, %%xmm10\n"
17585           "subpd %%xmm3, %%xmm11\n"
17586           "movapd %%xmm4, %%xmm12\n"
17587           "movapd %%xmm4, %%xmm13\n"
17588           "addpd %%xmm5, %%xmm12\n"
17589           "subpd %%xmm5, %%xmm13\n"
17590           "movapd %%xmm6, %%xmm14\n"
17591           "movapd %%xmm6, %%xmm15\n"
17592           "addpd %%xmm7, %%xmm14\n"
17593           "subpd %%xmm7, %%xmm15\n"
17594           "movapd %%xmm8, %%xmm0\n"
17595           "movapd %%xmm8, %%xmm2\n"
17596           "addpd %%xmm10, %%xmm0\n"
17597           "subpd %%xmm10, %%xmm2\n"
17598           "movapd %%xmm9, %%xmm1\n"
17599           "movapd %%xmm9, %%xmm3\n"
17600           "addpd %%xmm11, %%xmm1\n"
17601           "subpd %%xmm11, %%xmm3\n"
17602           "movapd %%xmm12, %%xmm4\n"
17603           "movapd %%xmm12, %%xmm6\n"
17604           "addpd %%xmm14, %%xmm4\n"
17605           "subpd %%xmm14, %%xmm6\n"
17606           "movapd %%xmm13, %%xmm5\n"
17607           "movapd %%xmm13, %%xmm7\n"
17608           "addpd %%xmm15, %%xmm5\n"
17609           "subpd %%xmm15, %%xmm7\n"
17610           "movapd %%xmm0, %%xmm8\n"
17611           "movapd %%xmm0, %%xmm12\n"
17612           "addpd %%xmm4, %%xmm8\n"
17613           "subpd %%xmm4, %%xmm12\n"
17614           "movapd %%xmm1, %%xmm9\n"
17615           "movapd %%xmm1, %%xmm13\n"
17616           "addpd %%xmm5, %%xmm9\n"
17617           "subpd %%xmm5, %%xmm13\n"
17618           "movapd %%xmm2, %%xmm10\n"
17619           "movapd %%xmm2, %%xmm14\n"
17620           "addpd %%xmm6, %%xmm10\n"
17621           "subpd %%xmm6, %%xmm14\n"
17622           "movapd %%xmm3, %%xmm11\n"
17623           "movapd %%xmm3, %%xmm15\n"
17624           "addpd %%xmm7, %%xmm11\n"
17625           "subpd %%xmm7, %%xmm15\n"
17626           "movupd %%xmm8, (%0)\n"
17627           "movupd %%xmm9, (%1)\n"
17628           "movupd %%xmm10, (%2)\n"
17629           "movupd %%xmm11, (%3)\n"
17630           "movupd %%xmm12, (%4)\n"
17631           "movupd %%xmm13, (%5)\n"
17632           "movupd %%xmm14, (%6)\n"
17633           "movupd %%xmm15, (%7)\n"
17634           :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
17635         );
17636       }
17637     }
17638     for (int j = 0; j < 4096; j += 1024) {
17639       for (int k = 0; k < 128; k += 2) {
17640         __asm__ volatile (
17641           "movupd (%0), %%xmm0\n"
17642           "movupd (%1), %%xmm1\n"
17643           "movupd (%2), %%xmm2\n"
17644           "movupd (%3), %%xmm3\n"
17645           "movupd (%4), %%xmm4\n"
17646           "movupd (%5), %%xmm5\n"
17647           "movupd (%6), %%xmm6\n"
17648           "movupd (%7), %%xmm7\n"
17649           "movapd %%xmm0, %%xmm8\n"
17650           "movapd %%xmm0, %%xmm9\n"
17651           "addpd %%xmm1, %%xmm8\n"
17652           "subpd %%xmm1, %%xmm9\n"
17653           "movapd %%xmm2, %%xmm10\n"
17654           "movapd %%xmm2, %%xmm11\n"
17655           "addpd %%xmm3, %%xmm10\n"
17656           "subpd %%xmm3, %%xmm11\n"
17657           "movapd %%xmm4, %%xmm12\n"
17658           "movapd %%xmm4, %%xmm13\n"
17659           "addpd %%xmm5, %%xmm12\n"
17660           "subpd %%xmm5, %%xmm13\n"
17661           "movapd %%xmm6, %%xmm14\n"
17662           "movapd %%xmm6, %%xmm15\n"
17663           "addpd %%xmm7, %%xmm14\n"
17664           "subpd %%xmm7, %%xmm15\n"
17665           "movapd %%xmm8, %%xmm0\n"
17666           "movapd %%xmm8, %%xmm2\n"
17667           "addpd %%xmm10, %%xmm0\n"
17668           "subpd %%xmm10, %%xmm2\n"
17669           "movapd %%xmm9, %%xmm1\n"
17670           "movapd %%xmm9, %%xmm3\n"
17671           "addpd %%xmm11, %%xmm1\n"
17672           "subpd %%xmm11, %%xmm3\n"
17673           "movapd %%xmm12, %%xmm4\n"
17674           "movapd %%xmm12, %%xmm6\n"
17675           "addpd %%xmm14, %%xmm4\n"
17676           "subpd %%xmm14, %%xmm6\n"
17677           "movapd %%xmm13, %%xmm5\n"
17678           "movapd %%xmm13, %%xmm7\n"
17679           "addpd %%xmm15, %%xmm5\n"
17680           "subpd %%xmm15, %%xmm7\n"
17681           "movapd %%xmm0, %%xmm8\n"
17682           "movapd %%xmm0, %%xmm12\n"
17683           "addpd %%xmm4, %%xmm8\n"
17684           "subpd %%xmm4, %%xmm12\n"
17685           "movapd %%xmm1, %%xmm9\n"
17686           "movapd %%xmm1, %%xmm13\n"
17687           "addpd %%xmm5, %%xmm9\n"
17688           "subpd %%xmm5, %%xmm13\n"
17689           "movapd %%xmm2, %%xmm10\n"
17690           "movapd %%xmm2, %%xmm14\n"
17691           "addpd %%xmm6, %%xmm10\n"
17692           "subpd %%xmm6, %%xmm14\n"
17693           "movapd %%xmm3, %%xmm11\n"
17694           "movapd %%xmm3, %%xmm15\n"
17695           "addpd %%xmm7, %%xmm11\n"
17696           "subpd %%xmm7, %%xmm15\n"
17697           "movupd %%xmm8, (%0)\n"
17698           "movupd %%xmm9, (%1)\n"
17699           "movupd %%xmm10, (%2)\n"
17700           "movupd %%xmm11, (%3)\n"
17701           "movupd %%xmm12, (%4)\n"
17702           "movupd %%xmm13, (%5)\n"
17703           "movupd %%xmm14, (%6)\n"
17704           "movupd %%xmm15, (%7)\n"
17705           :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
17706         );
17707       }
17708     }
17709     for (int j = 0; j < 4096; j += 4096) {
17710       for (int k = 0; k < 1024; k += 2) {
17711         __asm__ volatile (
17712           "movupd (%0), %%xmm0\n"
17713           "movupd (%1), %%xmm1\n"
17714           "movupd (%2), %%xmm2\n"
17715           "movupd (%3), %%xmm3\n"
17716           "movapd %%xmm0, %%xmm8\n"
17717           "movapd %%xmm0, %%xmm9\n"
17718           "addpd %%xmm1, %%xmm8\n"
17719           "subpd %%xmm1, %%xmm9\n"
17720           "movapd %%xmm2, %%xmm10\n"
17721           "movapd %%xmm2, %%xmm11\n"
17722           "addpd %%xmm3, %%xmm10\n"
17723           "subpd %%xmm3, %%xmm11\n"
17724           "movapd %%xmm8, %%xmm0\n"
17725           "movapd %%xmm8, %%xmm2\n"
17726           "addpd %%xmm10, %%xmm0\n"
17727           "subpd %%xmm10, %%xmm2\n"
17728           "movapd %%xmm9, %%xmm1\n"
17729           "movapd %%xmm9, %%xmm3\n"
17730           "addpd %%xmm11, %%xmm1\n"
17731           "subpd %%xmm11, %%xmm3\n"
17732           "movupd %%xmm0, (%0)\n"
17733           "movupd %%xmm1, (%1)\n"
17734           "movupd %%xmm2, (%2)\n"
17735           "movupd %%xmm3, (%3)\n"
17736           :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
17737         );
17738       }
17739     }
17740     return;
17741   }
17742   if (depth == 15) {
17743     helper_double_17_recursive(buf + 0, 12);
17744     helper_double_17_recursive(buf + 4096, 12);
17745     helper_double_17_recursive(buf + 8192, 12);
17746     helper_double_17_recursive(buf + 12288, 12);
17747     helper_double_17_recursive(buf + 16384, 12);
17748     helper_double_17_recursive(buf + 20480, 12);
17749     helper_double_17_recursive(buf + 24576, 12);
17750     helper_double_17_recursive(buf + 28672, 12);
17751     for (int j = 0; j < 32768; j += 32768) {
17752       for (int k = 0; k < 4096; k += 2) {
17753         __asm__ volatile (
17754           "movupd (%0), %%xmm0\n"
17755           "movupd (%1), %%xmm1\n"
17756           "movupd (%2), %%xmm2\n"
17757           "movupd (%3), %%xmm3\n"
17758           "movupd (%4), %%xmm4\n"
17759           "movupd (%5), %%xmm5\n"
17760           "movupd (%6), %%xmm6\n"
17761           "movupd (%7), %%xmm7\n"
17762           "movapd %%xmm0, %%xmm8\n"
17763           "movapd %%xmm0, %%xmm9\n"
17764           "addpd %%xmm1, %%xmm8\n"
17765           "subpd %%xmm1, %%xmm9\n"
17766           "movapd %%xmm2, %%xmm10\n"
17767           "movapd %%xmm2, %%xmm11\n"
17768           "addpd %%xmm3, %%xmm10\n"
17769           "subpd %%xmm3, %%xmm11\n"
17770           "movapd %%xmm4, %%xmm12\n"
17771           "movapd %%xmm4, %%xmm13\n"
17772           "addpd %%xmm5, %%xmm12\n"
17773           "subpd %%xmm5, %%xmm13\n"
17774           "movapd %%xmm6, %%xmm14\n"
17775           "movapd %%xmm6, %%xmm15\n"
17776           "addpd %%xmm7, %%xmm14\n"
17777           "subpd %%xmm7, %%xmm15\n"
17778           "movapd %%xmm8, %%xmm0\n"
17779           "movapd %%xmm8, %%xmm2\n"
17780           "addpd %%xmm10, %%xmm0\n"
17781           "subpd %%xmm10, %%xmm2\n"
17782           "movapd %%xmm9, %%xmm1\n"
17783           "movapd %%xmm9, %%xmm3\n"
17784           "addpd %%xmm11, %%xmm1\n"
17785           "subpd %%xmm11, %%xmm3\n"
17786           "movapd %%xmm12, %%xmm4\n"
17787           "movapd %%xmm12, %%xmm6\n"
17788           "addpd %%xmm14, %%xmm4\n"
17789           "subpd %%xmm14, %%xmm6\n"
17790           "movapd %%xmm13, %%xmm5\n"
17791           "movapd %%xmm13, %%xmm7\n"
17792           "addpd %%xmm15, %%xmm5\n"
17793           "subpd %%xmm15, %%xmm7\n"
17794           "movapd %%xmm0, %%xmm8\n"
17795           "movapd %%xmm0, %%xmm12\n"
17796           "addpd %%xmm4, %%xmm8\n"
17797           "subpd %%xmm4, %%xmm12\n"
17798           "movapd %%xmm1, %%xmm9\n"
17799           "movapd %%xmm1, %%xmm13\n"
17800           "addpd %%xmm5, %%xmm9\n"
17801           "subpd %%xmm5, %%xmm13\n"
17802           "movapd %%xmm2, %%xmm10\n"
17803           "movapd %%xmm2, %%xmm14\n"
17804           "addpd %%xmm6, %%xmm10\n"
17805           "subpd %%xmm6, %%xmm14\n"
17806           "movapd %%xmm3, %%xmm11\n"
17807           "movapd %%xmm3, %%xmm15\n"
17808           "addpd %%xmm7, %%xmm11\n"
17809           "subpd %%xmm7, %%xmm15\n"
17810           "movupd %%xmm8, (%0)\n"
17811           "movupd %%xmm9, (%1)\n"
17812           "movupd %%xmm10, (%2)\n"
17813           "movupd %%xmm11, (%3)\n"
17814           "movupd %%xmm12, (%4)\n"
17815           "movupd %%xmm13, (%5)\n"
17816           "movupd %%xmm14, (%6)\n"
17817           "movupd %%xmm15, (%7)\n"
17818           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
17819         );
17820       }
17821     }
17822     return;
17823   }
17824   if (depth == 17) {
17825     helper_double_17_recursive(buf + 0, 15);
17826     helper_double_17_recursive(buf + 32768, 15);
17827     helper_double_17_recursive(buf + 65536, 15);
17828     helper_double_17_recursive(buf + 98304, 15);
17829     for (int j = 0; j < 131072; j += 131072) {
17830       for (int k = 0; k < 32768; k += 2) {
17831         __asm__ volatile (
17832           "movupd (%0), %%xmm0\n"
17833           "movupd (%1), %%xmm1\n"
17834           "movupd (%2), %%xmm2\n"
17835           "movupd (%3), %%xmm3\n"
17836           "movapd %%xmm0, %%xmm8\n"
17837           "movapd %%xmm0, %%xmm9\n"
17838           "addpd %%xmm1, %%xmm8\n"
17839           "subpd %%xmm1, %%xmm9\n"
17840           "movapd %%xmm2, %%xmm10\n"
17841           "movapd %%xmm2, %%xmm11\n"
17842           "addpd %%xmm3, %%xmm10\n"
17843           "subpd %%xmm3, %%xmm11\n"
17844           "movapd %%xmm8, %%xmm0\n"
17845           "movapd %%xmm8, %%xmm2\n"
17846           "addpd %%xmm10, %%xmm0\n"
17847           "subpd %%xmm10, %%xmm2\n"
17848           "movapd %%xmm9, %%xmm1\n"
17849           "movapd %%xmm9, %%xmm3\n"
17850           "addpd %%xmm11, %%xmm1\n"
17851           "subpd %%xmm11, %%xmm3\n"
17852           "movupd %%xmm0, (%0)\n"
17853           "movupd %%xmm1, (%1)\n"
17854           "movupd %%xmm2, (%2)\n"
17855           "movupd %%xmm3, (%3)\n"
17856           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
17857         );
17858       }
17859     }
17860     return;
17861   }
17862 }
/*
 * In-place unnormalized fast Walsh-Hadamard transform over a buffer of
 * 2^17 = 131072 doubles.  Thin public entry point: all of the work is
 * done by the recursive SSE helper, invoked at the full depth of 17.
 * NOTE(review): buffer length must be exactly 131072 doubles — assumed
 * from the depth argument; confirm against callers.
 */
void helper_double_17(double *buf);
void helper_double_17(double *buf) {
  helper_double_17_recursive(buf, /*depth=*/17);
}
17867 void helper_double_18_recursive(double *buf, int depth);
helper_double_18_recursive(double * buf,int depth)17868 void helper_double_18_recursive(double *buf, int depth) {
17869   if (depth == 12) {
17870     for (int j = 0; j < 4096; j += 16) {
17871       for (int k = 0; k < 2; k += 2) {
17872         __asm__ volatile (
17873           "movupd (%0), %%xmm0\n"
17874           "movupd (%1), %%xmm1\n"
17875           "movupd (%2), %%xmm2\n"
17876           "movupd (%3), %%xmm3\n"
17877           "movupd (%4), %%xmm4\n"
17878           "movupd (%5), %%xmm5\n"
17879           "movupd (%6), %%xmm6\n"
17880           "movupd (%7), %%xmm7\n"
17881           "movapd %%xmm0, %%xmm8\n"
17882           "haddpd %%xmm8, %%xmm8\n"
17883           "movapd %%xmm0, %%xmm9\n"
17884           "hsubpd %%xmm9, %%xmm9\n"
17885           "blendpd $1, %%xmm8, %%xmm9\n"
17886           "movapd %%xmm9, %%xmm0\n"
17887           "movapd %%xmm1, %%xmm8\n"
17888           "haddpd %%xmm8, %%xmm8\n"
17889           "movapd %%xmm1, %%xmm9\n"
17890           "hsubpd %%xmm9, %%xmm9\n"
17891           "blendpd $1, %%xmm8, %%xmm9\n"
17892           "movapd %%xmm9, %%xmm1\n"
17893           "movapd %%xmm2, %%xmm8\n"
17894           "haddpd %%xmm8, %%xmm8\n"
17895           "movapd %%xmm2, %%xmm9\n"
17896           "hsubpd %%xmm9, %%xmm9\n"
17897           "blendpd $1, %%xmm8, %%xmm9\n"
17898           "movapd %%xmm9, %%xmm2\n"
17899           "movapd %%xmm3, %%xmm8\n"
17900           "haddpd %%xmm8, %%xmm8\n"
17901           "movapd %%xmm3, %%xmm9\n"
17902           "hsubpd %%xmm9, %%xmm9\n"
17903           "blendpd $1, %%xmm8, %%xmm9\n"
17904           "movapd %%xmm9, %%xmm3\n"
17905           "movapd %%xmm4, %%xmm8\n"
17906           "haddpd %%xmm8, %%xmm8\n"
17907           "movapd %%xmm4, %%xmm9\n"
17908           "hsubpd %%xmm9, %%xmm9\n"
17909           "blendpd $1, %%xmm8, %%xmm9\n"
17910           "movapd %%xmm9, %%xmm4\n"
17911           "movapd %%xmm5, %%xmm8\n"
17912           "haddpd %%xmm8, %%xmm8\n"
17913           "movapd %%xmm5, %%xmm9\n"
17914           "hsubpd %%xmm9, %%xmm9\n"
17915           "blendpd $1, %%xmm8, %%xmm9\n"
17916           "movapd %%xmm9, %%xmm5\n"
17917           "movapd %%xmm6, %%xmm8\n"
17918           "haddpd %%xmm8, %%xmm8\n"
17919           "movapd %%xmm6, %%xmm9\n"
17920           "hsubpd %%xmm9, %%xmm9\n"
17921           "blendpd $1, %%xmm8, %%xmm9\n"
17922           "movapd %%xmm9, %%xmm6\n"
17923           "movapd %%xmm7, %%xmm8\n"
17924           "haddpd %%xmm8, %%xmm8\n"
17925           "movapd %%xmm7, %%xmm9\n"
17926           "hsubpd %%xmm9, %%xmm9\n"
17927           "blendpd $1, %%xmm8, %%xmm9\n"
17928           "movapd %%xmm9, %%xmm7\n"
17929           "movapd %%xmm0, %%xmm8\n"
17930           "movapd %%xmm0, %%xmm9\n"
17931           "addpd %%xmm1, %%xmm8\n"
17932           "subpd %%xmm1, %%xmm9\n"
17933           "movapd %%xmm2, %%xmm10\n"
17934           "movapd %%xmm2, %%xmm11\n"
17935           "addpd %%xmm3, %%xmm10\n"
17936           "subpd %%xmm3, %%xmm11\n"
17937           "movapd %%xmm4, %%xmm12\n"
17938           "movapd %%xmm4, %%xmm13\n"
17939           "addpd %%xmm5, %%xmm12\n"
17940           "subpd %%xmm5, %%xmm13\n"
17941           "movapd %%xmm6, %%xmm14\n"
17942           "movapd %%xmm6, %%xmm15\n"
17943           "addpd %%xmm7, %%xmm14\n"
17944           "subpd %%xmm7, %%xmm15\n"
17945           "movapd %%xmm8, %%xmm0\n"
17946           "movapd %%xmm8, %%xmm2\n"
17947           "addpd %%xmm10, %%xmm0\n"
17948           "subpd %%xmm10, %%xmm2\n"
17949           "movapd %%xmm9, %%xmm1\n"
17950           "movapd %%xmm9, %%xmm3\n"
17951           "addpd %%xmm11, %%xmm1\n"
17952           "subpd %%xmm11, %%xmm3\n"
17953           "movapd %%xmm12, %%xmm4\n"
17954           "movapd %%xmm12, %%xmm6\n"
17955           "addpd %%xmm14, %%xmm4\n"
17956           "subpd %%xmm14, %%xmm6\n"
17957           "movapd %%xmm13, %%xmm5\n"
17958           "movapd %%xmm13, %%xmm7\n"
17959           "addpd %%xmm15, %%xmm5\n"
17960           "subpd %%xmm15, %%xmm7\n"
17961           "movapd %%xmm0, %%xmm8\n"
17962           "movapd %%xmm0, %%xmm12\n"
17963           "addpd %%xmm4, %%xmm8\n"
17964           "subpd %%xmm4, %%xmm12\n"
17965           "movapd %%xmm1, %%xmm9\n"
17966           "movapd %%xmm1, %%xmm13\n"
17967           "addpd %%xmm5, %%xmm9\n"
17968           "subpd %%xmm5, %%xmm13\n"
17969           "movapd %%xmm2, %%xmm10\n"
17970           "movapd %%xmm2, %%xmm14\n"
17971           "addpd %%xmm6, %%xmm10\n"
17972           "subpd %%xmm6, %%xmm14\n"
17973           "movapd %%xmm3, %%xmm11\n"
17974           "movapd %%xmm3, %%xmm15\n"
17975           "addpd %%xmm7, %%xmm11\n"
17976           "subpd %%xmm7, %%xmm15\n"
17977           "movupd %%xmm8, (%0)\n"
17978           "movupd %%xmm9, (%1)\n"
17979           "movupd %%xmm10, (%2)\n"
17980           "movupd %%xmm11, (%3)\n"
17981           "movupd %%xmm12, (%4)\n"
17982           "movupd %%xmm13, (%5)\n"
17983           "movupd %%xmm14, (%6)\n"
17984           "movupd %%xmm15, (%7)\n"
17985           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
17986         );
17987       }
17988     }
17989     for (int j = 0; j < 4096; j += 128) {
17990       for (int k = 0; k < 16; k += 2) {
17991         __asm__ volatile (
17992           "movupd (%0), %%xmm0\n"
17993           "movupd (%1), %%xmm1\n"
17994           "movupd (%2), %%xmm2\n"
17995           "movupd (%3), %%xmm3\n"
17996           "movupd (%4), %%xmm4\n"
17997           "movupd (%5), %%xmm5\n"
17998           "movupd (%6), %%xmm6\n"
17999           "movupd (%7), %%xmm7\n"
18000           "movapd %%xmm0, %%xmm8\n"
18001           "movapd %%xmm0, %%xmm9\n"
18002           "addpd %%xmm1, %%xmm8\n"
18003           "subpd %%xmm1, %%xmm9\n"
18004           "movapd %%xmm2, %%xmm10\n"
18005           "movapd %%xmm2, %%xmm11\n"
18006           "addpd %%xmm3, %%xmm10\n"
18007           "subpd %%xmm3, %%xmm11\n"
18008           "movapd %%xmm4, %%xmm12\n"
18009           "movapd %%xmm4, %%xmm13\n"
18010           "addpd %%xmm5, %%xmm12\n"
18011           "subpd %%xmm5, %%xmm13\n"
18012           "movapd %%xmm6, %%xmm14\n"
18013           "movapd %%xmm6, %%xmm15\n"
18014           "addpd %%xmm7, %%xmm14\n"
18015           "subpd %%xmm7, %%xmm15\n"
18016           "movapd %%xmm8, %%xmm0\n"
18017           "movapd %%xmm8, %%xmm2\n"
18018           "addpd %%xmm10, %%xmm0\n"
18019           "subpd %%xmm10, %%xmm2\n"
18020           "movapd %%xmm9, %%xmm1\n"
18021           "movapd %%xmm9, %%xmm3\n"
18022           "addpd %%xmm11, %%xmm1\n"
18023           "subpd %%xmm11, %%xmm3\n"
18024           "movapd %%xmm12, %%xmm4\n"
18025           "movapd %%xmm12, %%xmm6\n"
18026           "addpd %%xmm14, %%xmm4\n"
18027           "subpd %%xmm14, %%xmm6\n"
18028           "movapd %%xmm13, %%xmm5\n"
18029           "movapd %%xmm13, %%xmm7\n"
18030           "addpd %%xmm15, %%xmm5\n"
18031           "subpd %%xmm15, %%xmm7\n"
18032           "movapd %%xmm0, %%xmm8\n"
18033           "movapd %%xmm0, %%xmm12\n"
18034           "addpd %%xmm4, %%xmm8\n"
18035           "subpd %%xmm4, %%xmm12\n"
18036           "movapd %%xmm1, %%xmm9\n"
18037           "movapd %%xmm1, %%xmm13\n"
18038           "addpd %%xmm5, %%xmm9\n"
18039           "subpd %%xmm5, %%xmm13\n"
18040           "movapd %%xmm2, %%xmm10\n"
18041           "movapd %%xmm2, %%xmm14\n"
18042           "addpd %%xmm6, %%xmm10\n"
18043           "subpd %%xmm6, %%xmm14\n"
18044           "movapd %%xmm3, %%xmm11\n"
18045           "movapd %%xmm3, %%xmm15\n"
18046           "addpd %%xmm7, %%xmm11\n"
18047           "subpd %%xmm7, %%xmm15\n"
18048           "movupd %%xmm8, (%0)\n"
18049           "movupd %%xmm9, (%1)\n"
18050           "movupd %%xmm10, (%2)\n"
18051           "movupd %%xmm11, (%3)\n"
18052           "movupd %%xmm12, (%4)\n"
18053           "movupd %%xmm13, (%5)\n"
18054           "movupd %%xmm14, (%6)\n"
18055           "movupd %%xmm15, (%7)\n"
18056           :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
18057         );
18058       }
18059     }
18060     for (int j = 0; j < 4096; j += 1024) {
18061       for (int k = 0; k < 128; k += 2) {
18062         __asm__ volatile (
18063           "movupd (%0), %%xmm0\n"
18064           "movupd (%1), %%xmm1\n"
18065           "movupd (%2), %%xmm2\n"
18066           "movupd (%3), %%xmm3\n"
18067           "movupd (%4), %%xmm4\n"
18068           "movupd (%5), %%xmm5\n"
18069           "movupd (%6), %%xmm6\n"
18070           "movupd (%7), %%xmm7\n"
18071           "movapd %%xmm0, %%xmm8\n"
18072           "movapd %%xmm0, %%xmm9\n"
18073           "addpd %%xmm1, %%xmm8\n"
18074           "subpd %%xmm1, %%xmm9\n"
18075           "movapd %%xmm2, %%xmm10\n"
18076           "movapd %%xmm2, %%xmm11\n"
18077           "addpd %%xmm3, %%xmm10\n"
18078           "subpd %%xmm3, %%xmm11\n"
18079           "movapd %%xmm4, %%xmm12\n"
18080           "movapd %%xmm4, %%xmm13\n"
18081           "addpd %%xmm5, %%xmm12\n"
18082           "subpd %%xmm5, %%xmm13\n"
18083           "movapd %%xmm6, %%xmm14\n"
18084           "movapd %%xmm6, %%xmm15\n"
18085           "addpd %%xmm7, %%xmm14\n"
18086           "subpd %%xmm7, %%xmm15\n"
18087           "movapd %%xmm8, %%xmm0\n"
18088           "movapd %%xmm8, %%xmm2\n"
18089           "addpd %%xmm10, %%xmm0\n"
18090           "subpd %%xmm10, %%xmm2\n"
18091           "movapd %%xmm9, %%xmm1\n"
18092           "movapd %%xmm9, %%xmm3\n"
18093           "addpd %%xmm11, %%xmm1\n"
18094           "subpd %%xmm11, %%xmm3\n"
18095           "movapd %%xmm12, %%xmm4\n"
18096           "movapd %%xmm12, %%xmm6\n"
18097           "addpd %%xmm14, %%xmm4\n"
18098           "subpd %%xmm14, %%xmm6\n"
18099           "movapd %%xmm13, %%xmm5\n"
18100           "movapd %%xmm13, %%xmm7\n"
18101           "addpd %%xmm15, %%xmm5\n"
18102           "subpd %%xmm15, %%xmm7\n"
18103           "movapd %%xmm0, %%xmm8\n"
18104           "movapd %%xmm0, %%xmm12\n"
18105           "addpd %%xmm4, %%xmm8\n"
18106           "subpd %%xmm4, %%xmm12\n"
18107           "movapd %%xmm1, %%xmm9\n"
18108           "movapd %%xmm1, %%xmm13\n"
18109           "addpd %%xmm5, %%xmm9\n"
18110           "subpd %%xmm5, %%xmm13\n"
18111           "movapd %%xmm2, %%xmm10\n"
18112           "movapd %%xmm2, %%xmm14\n"
18113           "addpd %%xmm6, %%xmm10\n"
18114           "subpd %%xmm6, %%xmm14\n"
18115           "movapd %%xmm3, %%xmm11\n"
18116           "movapd %%xmm3, %%xmm15\n"
18117           "addpd %%xmm7, %%xmm11\n"
18118           "subpd %%xmm7, %%xmm15\n"
18119           "movupd %%xmm8, (%0)\n"
18120           "movupd %%xmm9, (%1)\n"
18121           "movupd %%xmm10, (%2)\n"
18122           "movupd %%xmm11, (%3)\n"
18123           "movupd %%xmm12, (%4)\n"
18124           "movupd %%xmm13, (%5)\n"
18125           "movupd %%xmm14, (%6)\n"
18126           "movupd %%xmm15, (%7)\n"
18127           :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
18128         );
18129       }
18130     }
18131     for (int j = 0; j < 4096; j += 4096) {
18132       for (int k = 0; k < 1024; k += 2) {
18133         __asm__ volatile (
18134           "movupd (%0), %%xmm0\n"
18135           "movupd (%1), %%xmm1\n"
18136           "movupd (%2), %%xmm2\n"
18137           "movupd (%3), %%xmm3\n"
18138           "movapd %%xmm0, %%xmm8\n"
18139           "movapd %%xmm0, %%xmm9\n"
18140           "addpd %%xmm1, %%xmm8\n"
18141           "subpd %%xmm1, %%xmm9\n"
18142           "movapd %%xmm2, %%xmm10\n"
18143           "movapd %%xmm2, %%xmm11\n"
18144           "addpd %%xmm3, %%xmm10\n"
18145           "subpd %%xmm3, %%xmm11\n"
18146           "movapd %%xmm8, %%xmm0\n"
18147           "movapd %%xmm8, %%xmm2\n"
18148           "addpd %%xmm10, %%xmm0\n"
18149           "subpd %%xmm10, %%xmm2\n"
18150           "movapd %%xmm9, %%xmm1\n"
18151           "movapd %%xmm9, %%xmm3\n"
18152           "addpd %%xmm11, %%xmm1\n"
18153           "subpd %%xmm11, %%xmm3\n"
18154           "movupd %%xmm0, (%0)\n"
18155           "movupd %%xmm1, (%1)\n"
18156           "movupd %%xmm2, (%2)\n"
18157           "movupd %%xmm3, (%3)\n"
18158           :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
18159         );
18160       }
18161     }
18162     return;
18163   }
18164   if (depth == 15) {
18165     helper_double_18_recursive(buf + 0, 12);
18166     helper_double_18_recursive(buf + 4096, 12);
18167     helper_double_18_recursive(buf + 8192, 12);
18168     helper_double_18_recursive(buf + 12288, 12);
18169     helper_double_18_recursive(buf + 16384, 12);
18170     helper_double_18_recursive(buf + 20480, 12);
18171     helper_double_18_recursive(buf + 24576, 12);
18172     helper_double_18_recursive(buf + 28672, 12);
18173     for (int j = 0; j < 32768; j += 32768) {
18174       for (int k = 0; k < 4096; k += 2) {
18175         __asm__ volatile (
18176           "movupd (%0), %%xmm0\n"
18177           "movupd (%1), %%xmm1\n"
18178           "movupd (%2), %%xmm2\n"
18179           "movupd (%3), %%xmm3\n"
18180           "movupd (%4), %%xmm4\n"
18181           "movupd (%5), %%xmm5\n"
18182           "movupd (%6), %%xmm6\n"
18183           "movupd (%7), %%xmm7\n"
18184           "movapd %%xmm0, %%xmm8\n"
18185           "movapd %%xmm0, %%xmm9\n"
18186           "addpd %%xmm1, %%xmm8\n"
18187           "subpd %%xmm1, %%xmm9\n"
18188           "movapd %%xmm2, %%xmm10\n"
18189           "movapd %%xmm2, %%xmm11\n"
18190           "addpd %%xmm3, %%xmm10\n"
18191           "subpd %%xmm3, %%xmm11\n"
18192           "movapd %%xmm4, %%xmm12\n"
18193           "movapd %%xmm4, %%xmm13\n"
18194           "addpd %%xmm5, %%xmm12\n"
18195           "subpd %%xmm5, %%xmm13\n"
18196           "movapd %%xmm6, %%xmm14\n"
18197           "movapd %%xmm6, %%xmm15\n"
18198           "addpd %%xmm7, %%xmm14\n"
18199           "subpd %%xmm7, %%xmm15\n"
18200           "movapd %%xmm8, %%xmm0\n"
18201           "movapd %%xmm8, %%xmm2\n"
18202           "addpd %%xmm10, %%xmm0\n"
18203           "subpd %%xmm10, %%xmm2\n"
18204           "movapd %%xmm9, %%xmm1\n"
18205           "movapd %%xmm9, %%xmm3\n"
18206           "addpd %%xmm11, %%xmm1\n"
18207           "subpd %%xmm11, %%xmm3\n"
18208           "movapd %%xmm12, %%xmm4\n"
18209           "movapd %%xmm12, %%xmm6\n"
18210           "addpd %%xmm14, %%xmm4\n"
18211           "subpd %%xmm14, %%xmm6\n"
18212           "movapd %%xmm13, %%xmm5\n"
18213           "movapd %%xmm13, %%xmm7\n"
18214           "addpd %%xmm15, %%xmm5\n"
18215           "subpd %%xmm15, %%xmm7\n"
18216           "movapd %%xmm0, %%xmm8\n"
18217           "movapd %%xmm0, %%xmm12\n"
18218           "addpd %%xmm4, %%xmm8\n"
18219           "subpd %%xmm4, %%xmm12\n"
18220           "movapd %%xmm1, %%xmm9\n"
18221           "movapd %%xmm1, %%xmm13\n"
18222           "addpd %%xmm5, %%xmm9\n"
18223           "subpd %%xmm5, %%xmm13\n"
18224           "movapd %%xmm2, %%xmm10\n"
18225           "movapd %%xmm2, %%xmm14\n"
18226           "addpd %%xmm6, %%xmm10\n"
18227           "subpd %%xmm6, %%xmm14\n"
18228           "movapd %%xmm3, %%xmm11\n"
18229           "movapd %%xmm3, %%xmm15\n"
18230           "addpd %%xmm7, %%xmm11\n"
18231           "subpd %%xmm7, %%xmm15\n"
18232           "movupd %%xmm8, (%0)\n"
18233           "movupd %%xmm9, (%1)\n"
18234           "movupd %%xmm10, (%2)\n"
18235           "movupd %%xmm11, (%3)\n"
18236           "movupd %%xmm12, (%4)\n"
18237           "movupd %%xmm13, (%5)\n"
18238           "movupd %%xmm14, (%6)\n"
18239           "movupd %%xmm15, (%7)\n"
18240           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
18241         );
18242       }
18243     }
18244     return;
18245   }
18246   if (depth == 18) {
18247     helper_double_18_recursive(buf + 0, 15);
18248     helper_double_18_recursive(buf + 32768, 15);
18249     helper_double_18_recursive(buf + 65536, 15);
18250     helper_double_18_recursive(buf + 98304, 15);
18251     helper_double_18_recursive(buf + 131072, 15);
18252     helper_double_18_recursive(buf + 163840, 15);
18253     helper_double_18_recursive(buf + 196608, 15);
18254     helper_double_18_recursive(buf + 229376, 15);
18255     for (int j = 0; j < 262144; j += 262144) {
18256       for (int k = 0; k < 32768; k += 2) {
18257         __asm__ volatile (
18258           "movupd (%0), %%xmm0\n"
18259           "movupd (%1), %%xmm1\n"
18260           "movupd (%2), %%xmm2\n"
18261           "movupd (%3), %%xmm3\n"
18262           "movupd (%4), %%xmm4\n"
18263           "movupd (%5), %%xmm5\n"
18264           "movupd (%6), %%xmm6\n"
18265           "movupd (%7), %%xmm7\n"
18266           "movapd %%xmm0, %%xmm8\n"
18267           "movapd %%xmm0, %%xmm9\n"
18268           "addpd %%xmm1, %%xmm8\n"
18269           "subpd %%xmm1, %%xmm9\n"
18270           "movapd %%xmm2, %%xmm10\n"
18271           "movapd %%xmm2, %%xmm11\n"
18272           "addpd %%xmm3, %%xmm10\n"
18273           "subpd %%xmm3, %%xmm11\n"
18274           "movapd %%xmm4, %%xmm12\n"
18275           "movapd %%xmm4, %%xmm13\n"
18276           "addpd %%xmm5, %%xmm12\n"
18277           "subpd %%xmm5, %%xmm13\n"
18278           "movapd %%xmm6, %%xmm14\n"
18279           "movapd %%xmm6, %%xmm15\n"
18280           "addpd %%xmm7, %%xmm14\n"
18281           "subpd %%xmm7, %%xmm15\n"
18282           "movapd %%xmm8, %%xmm0\n"
18283           "movapd %%xmm8, %%xmm2\n"
18284           "addpd %%xmm10, %%xmm0\n"
18285           "subpd %%xmm10, %%xmm2\n"
18286           "movapd %%xmm9, %%xmm1\n"
18287           "movapd %%xmm9, %%xmm3\n"
18288           "addpd %%xmm11, %%xmm1\n"
18289           "subpd %%xmm11, %%xmm3\n"
18290           "movapd %%xmm12, %%xmm4\n"
18291           "movapd %%xmm12, %%xmm6\n"
18292           "addpd %%xmm14, %%xmm4\n"
18293           "subpd %%xmm14, %%xmm6\n"
18294           "movapd %%xmm13, %%xmm5\n"
18295           "movapd %%xmm13, %%xmm7\n"
18296           "addpd %%xmm15, %%xmm5\n"
18297           "subpd %%xmm15, %%xmm7\n"
18298           "movapd %%xmm0, %%xmm8\n"
18299           "movapd %%xmm0, %%xmm12\n"
18300           "addpd %%xmm4, %%xmm8\n"
18301           "subpd %%xmm4, %%xmm12\n"
18302           "movapd %%xmm1, %%xmm9\n"
18303           "movapd %%xmm1, %%xmm13\n"
18304           "addpd %%xmm5, %%xmm9\n"
18305           "subpd %%xmm5, %%xmm13\n"
18306           "movapd %%xmm2, %%xmm10\n"
18307           "movapd %%xmm2, %%xmm14\n"
18308           "addpd %%xmm6, %%xmm10\n"
18309           "subpd %%xmm6, %%xmm14\n"
18310           "movapd %%xmm3, %%xmm11\n"
18311           "movapd %%xmm3, %%xmm15\n"
18312           "addpd %%xmm7, %%xmm11\n"
18313           "subpd %%xmm7, %%xmm15\n"
18314           "movupd %%xmm8, (%0)\n"
18315           "movupd %%xmm9, (%1)\n"
18316           "movupd %%xmm10, (%2)\n"
18317           "movupd %%xmm11, (%3)\n"
18318           "movupd %%xmm12, (%4)\n"
18319           "movupd %%xmm13, (%5)\n"
18320           "movupd %%xmm14, (%6)\n"
18321           "movupd %%xmm15, (%7)\n"
18322           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
18323         );
18324       }
18325     }
18326     return;
18327   }
18328 }
void helper_double_18(double *buf);
/* Entry point for the size-2^18 (262144-element) double-precision transform
 * (FHT per fht.h — presumably an unnormalized Walsh-Hadamard transform;
 * confirm against the library's public docs). Transforms buf in place by
 * starting the recursive decomposition at full depth 18. buf must hold
 * 262144 doubles. */
void helper_double_18(double *buf) {
  helper_double_18_recursive(buf, 18);
}
void helper_double_19_recursive(double *buf, int depth);
/* Recursive in-place kernel for the size-2^19 double transform.
 *
 * `depth` selects which pre-generated stage to run on the 2^depth doubles at
 * `buf`; only the depths produced by the generator (2, 5, 8, 11, 14, 17, 19)
 * are handled — any other value falls through and does nothing.
 *
 * Each non-base stage first recurses into 8 (or, at the top, 4) contiguous
 * sub-blocks three levels shallower, then runs a combine pass whose inline
 * asm loads one 2-double SSE vector from each sub-block, applies add/sub
 * butterflies (addpd/subpd) across the sub-blocks, and stores the results
 * back to the same addresses. The single-iteration `j` loops are generator
 * artifacts; only the `k` loop (stepping 2 doubles = one xmm register)
 * actually iterates. Unaligned loads/stores (movupd) are used, so `buf`
 * needs no particular alignment. All xmm registers and memory are listed as
 * clobbers, so the asm blocks are safe to reorder only as whole statements. */
void helper_double_19_recursive(double *buf, int depth) {
  /* Base case: 4 doubles. haddpd/hsubpd + blendpd perform the butterfly on
   * the two adjacent lanes inside each register (SSE3/SSE4.1), then a final
   * addpd/subpd combines the two registers. */
  if (depth == 2) {
    for (int j = 0; j < 4; j += 4) {
      for (int k = 0; k < 2; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movapd %%xmm0, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm0\n"
          "movapd %%xmm1, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm1, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  /* 32 doubles: 8 sub-blocks of 4, then a radix-8 combine (three butterfly
   * levels fused into one asm block). */
  if (depth == 5) {
    helper_double_19_recursive(buf + 0, 2);
    helper_double_19_recursive(buf + 4, 2);
    helper_double_19_recursive(buf + 8, 2);
    helper_double_19_recursive(buf + 12, 2);
    helper_double_19_recursive(buf + 16, 2);
    helper_double_19_recursive(buf + 20, 2);
    helper_double_19_recursive(buf + 24, 2);
    helper_double_19_recursive(buf + 28, 2);
    for (int j = 0; j < 32; j += 32) {
      for (int k = 0; k < 4; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  /* 256 doubles: 8 sub-blocks of 32, radix-8 combine with stride 32. */
  if (depth == 8) {
    helper_double_19_recursive(buf + 0, 5);
    helper_double_19_recursive(buf + 32, 5);
    helper_double_19_recursive(buf + 64, 5);
    helper_double_19_recursive(buf + 96, 5);
    helper_double_19_recursive(buf + 128, 5);
    helper_double_19_recursive(buf + 160, 5);
    helper_double_19_recursive(buf + 192, 5);
    helper_double_19_recursive(buf + 224, 5);
    for (int j = 0; j < 256; j += 256) {
      for (int k = 0; k < 32; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  /* 2048 doubles: 8 sub-blocks of 256, radix-8 combine with stride 256. */
  if (depth == 11) {
    helper_double_19_recursive(buf + 0, 8);
    helper_double_19_recursive(buf + 256, 8);
    helper_double_19_recursive(buf + 512, 8);
    helper_double_19_recursive(buf + 768, 8);
    helper_double_19_recursive(buf + 1024, 8);
    helper_double_19_recursive(buf + 1280, 8);
    helper_double_19_recursive(buf + 1536, 8);
    helper_double_19_recursive(buf + 1792, 8);
    for (int j = 0; j < 2048; j += 2048) {
      for (int k = 0; k < 256; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  /* 16384 doubles: 8 sub-blocks of 2048, radix-8 combine with stride 2048. */
  if (depth == 14) {
    helper_double_19_recursive(buf + 0, 11);
    helper_double_19_recursive(buf + 2048, 11);
    helper_double_19_recursive(buf + 4096, 11);
    helper_double_19_recursive(buf + 6144, 11);
    helper_double_19_recursive(buf + 8192, 11);
    helper_double_19_recursive(buf + 10240, 11);
    helper_double_19_recursive(buf + 12288, 11);
    helper_double_19_recursive(buf + 14336, 11);
    for (int j = 0; j < 16384; j += 16384) {
      for (int k = 0; k < 2048; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  /* 131072 doubles: 8 sub-blocks of 16384, radix-8 combine, stride 16384. */
  if (depth == 17) {
    helper_double_19_recursive(buf + 0, 14);
    helper_double_19_recursive(buf + 16384, 14);
    helper_double_19_recursive(buf + 32768, 14);
    helper_double_19_recursive(buf + 49152, 14);
    helper_double_19_recursive(buf + 65536, 14);
    helper_double_19_recursive(buf + 81920, 14);
    helper_double_19_recursive(buf + 98304, 14);
    helper_double_19_recursive(buf + 114688, 14);
    for (int j = 0; j < 131072; j += 131072) {
      for (int k = 0; k < 16384; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  /* Top level, 524288 doubles: 19 = 17 + 2, so only 4 sub-blocks remain and
   * the combine is radix-4 (two butterfly levels) instead of radix-8. */
  if (depth == 19) {
    helper_double_19_recursive(buf + 0, 17);
    helper_double_19_recursive(buf + 131072, 17);
    helper_double_19_recursive(buf + 262144, 17);
    helper_double_19_recursive(buf + 393216, 17);
    for (int j = 0; j < 524288; j += 524288) {
      for (int k = 0; k < 131072; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movupd %%xmm0, (%0)\n"
          "movupd %%xmm1, (%1)\n"
          "movupd %%xmm2, (%2)\n"
          "movupd %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_double_19(double *buf);
/* Entry point for the size-2^19 (524288-element) double-precision transform
 * (FHT per fht.h — presumably an unnormalized Walsh-Hadamard transform;
 * confirm against the library's public docs). Transforms buf in place by
 * starting the recursive decomposition at full depth 19. buf must hold
 * 524288 doubles. */
void helper_double_19(double *buf) {
  helper_double_19_recursive(buf, 19);
}
18818 void helper_double_20_recursive(double *buf, int depth);
helper_double_20_recursive(double * buf,int depth)18819 void helper_double_20_recursive(double *buf, int depth) {
18820   if (depth == 12) {
18821     for (int j = 0; j < 4096; j += 16) {
18822       for (int k = 0; k < 2; k += 2) {
18823         __asm__ volatile (
18824           "movupd (%0), %%xmm0\n"
18825           "movupd (%1), %%xmm1\n"
18826           "movupd (%2), %%xmm2\n"
18827           "movupd (%3), %%xmm3\n"
18828           "movupd (%4), %%xmm4\n"
18829           "movupd (%5), %%xmm5\n"
18830           "movupd (%6), %%xmm6\n"
18831           "movupd (%7), %%xmm7\n"
18832           "movapd %%xmm0, %%xmm8\n"
18833           "haddpd %%xmm8, %%xmm8\n"
18834           "movapd %%xmm0, %%xmm9\n"
18835           "hsubpd %%xmm9, %%xmm9\n"
18836           "blendpd $1, %%xmm8, %%xmm9\n"
18837           "movapd %%xmm9, %%xmm0\n"
18838           "movapd %%xmm1, %%xmm8\n"
18839           "haddpd %%xmm8, %%xmm8\n"
18840           "movapd %%xmm1, %%xmm9\n"
18841           "hsubpd %%xmm9, %%xmm9\n"
18842           "blendpd $1, %%xmm8, %%xmm9\n"
18843           "movapd %%xmm9, %%xmm1\n"
18844           "movapd %%xmm2, %%xmm8\n"
18845           "haddpd %%xmm8, %%xmm8\n"
18846           "movapd %%xmm2, %%xmm9\n"
18847           "hsubpd %%xmm9, %%xmm9\n"
18848           "blendpd $1, %%xmm8, %%xmm9\n"
18849           "movapd %%xmm9, %%xmm2\n"
18850           "movapd %%xmm3, %%xmm8\n"
18851           "haddpd %%xmm8, %%xmm8\n"
18852           "movapd %%xmm3, %%xmm9\n"
18853           "hsubpd %%xmm9, %%xmm9\n"
18854           "blendpd $1, %%xmm8, %%xmm9\n"
18855           "movapd %%xmm9, %%xmm3\n"
18856           "movapd %%xmm4, %%xmm8\n"
18857           "haddpd %%xmm8, %%xmm8\n"
18858           "movapd %%xmm4, %%xmm9\n"
18859           "hsubpd %%xmm9, %%xmm9\n"
18860           "blendpd $1, %%xmm8, %%xmm9\n"
18861           "movapd %%xmm9, %%xmm4\n"
18862           "movapd %%xmm5, %%xmm8\n"
18863           "haddpd %%xmm8, %%xmm8\n"
18864           "movapd %%xmm5, %%xmm9\n"
18865           "hsubpd %%xmm9, %%xmm9\n"
18866           "blendpd $1, %%xmm8, %%xmm9\n"
18867           "movapd %%xmm9, %%xmm5\n"
18868           "movapd %%xmm6, %%xmm8\n"
18869           "haddpd %%xmm8, %%xmm8\n"
18870           "movapd %%xmm6, %%xmm9\n"
18871           "hsubpd %%xmm9, %%xmm9\n"
18872           "blendpd $1, %%xmm8, %%xmm9\n"
18873           "movapd %%xmm9, %%xmm6\n"
18874           "movapd %%xmm7, %%xmm8\n"
18875           "haddpd %%xmm8, %%xmm8\n"
18876           "movapd %%xmm7, %%xmm9\n"
18877           "hsubpd %%xmm9, %%xmm9\n"
18878           "blendpd $1, %%xmm8, %%xmm9\n"
18879           "movapd %%xmm9, %%xmm7\n"
18880           "movapd %%xmm0, %%xmm8\n"
18881           "movapd %%xmm0, %%xmm9\n"
18882           "addpd %%xmm1, %%xmm8\n"
18883           "subpd %%xmm1, %%xmm9\n"
18884           "movapd %%xmm2, %%xmm10\n"
18885           "movapd %%xmm2, %%xmm11\n"
18886           "addpd %%xmm3, %%xmm10\n"
18887           "subpd %%xmm3, %%xmm11\n"
18888           "movapd %%xmm4, %%xmm12\n"
18889           "movapd %%xmm4, %%xmm13\n"
18890           "addpd %%xmm5, %%xmm12\n"
18891           "subpd %%xmm5, %%xmm13\n"
18892           "movapd %%xmm6, %%xmm14\n"
18893           "movapd %%xmm6, %%xmm15\n"
18894           "addpd %%xmm7, %%xmm14\n"
18895           "subpd %%xmm7, %%xmm15\n"
18896           "movapd %%xmm8, %%xmm0\n"
18897           "movapd %%xmm8, %%xmm2\n"
18898           "addpd %%xmm10, %%xmm0\n"
18899           "subpd %%xmm10, %%xmm2\n"
18900           "movapd %%xmm9, %%xmm1\n"
18901           "movapd %%xmm9, %%xmm3\n"
18902           "addpd %%xmm11, %%xmm1\n"
18903           "subpd %%xmm11, %%xmm3\n"
18904           "movapd %%xmm12, %%xmm4\n"
18905           "movapd %%xmm12, %%xmm6\n"
18906           "addpd %%xmm14, %%xmm4\n"
18907           "subpd %%xmm14, %%xmm6\n"
18908           "movapd %%xmm13, %%xmm5\n"
18909           "movapd %%xmm13, %%xmm7\n"
18910           "addpd %%xmm15, %%xmm5\n"
18911           "subpd %%xmm15, %%xmm7\n"
18912           "movapd %%xmm0, %%xmm8\n"
18913           "movapd %%xmm0, %%xmm12\n"
18914           "addpd %%xmm4, %%xmm8\n"
18915           "subpd %%xmm4, %%xmm12\n"
18916           "movapd %%xmm1, %%xmm9\n"
18917           "movapd %%xmm1, %%xmm13\n"
18918           "addpd %%xmm5, %%xmm9\n"
18919           "subpd %%xmm5, %%xmm13\n"
18920           "movapd %%xmm2, %%xmm10\n"
18921           "movapd %%xmm2, %%xmm14\n"
18922           "addpd %%xmm6, %%xmm10\n"
18923           "subpd %%xmm6, %%xmm14\n"
18924           "movapd %%xmm3, %%xmm11\n"
18925           "movapd %%xmm3, %%xmm15\n"
18926           "addpd %%xmm7, %%xmm11\n"
18927           "subpd %%xmm7, %%xmm15\n"
18928           "movupd %%xmm8, (%0)\n"
18929           "movupd %%xmm9, (%1)\n"
18930           "movupd %%xmm10, (%2)\n"
18931           "movupd %%xmm11, (%3)\n"
18932           "movupd %%xmm12, (%4)\n"
18933           "movupd %%xmm13, (%5)\n"
18934           "movupd %%xmm14, (%6)\n"
18935           "movupd %%xmm15, (%7)\n"
18936           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
18937         );
18938       }
18939     }
18940     for (int j = 0; j < 4096; j += 128) {
18941       for (int k = 0; k < 16; k += 2) {
18942         __asm__ volatile (
18943           "movupd (%0), %%xmm0\n"
18944           "movupd (%1), %%xmm1\n"
18945           "movupd (%2), %%xmm2\n"
18946           "movupd (%3), %%xmm3\n"
18947           "movupd (%4), %%xmm4\n"
18948           "movupd (%5), %%xmm5\n"
18949           "movupd (%6), %%xmm6\n"
18950           "movupd (%7), %%xmm7\n"
18951           "movapd %%xmm0, %%xmm8\n"
18952           "movapd %%xmm0, %%xmm9\n"
18953           "addpd %%xmm1, %%xmm8\n"
18954           "subpd %%xmm1, %%xmm9\n"
18955           "movapd %%xmm2, %%xmm10\n"
18956           "movapd %%xmm2, %%xmm11\n"
18957           "addpd %%xmm3, %%xmm10\n"
18958           "subpd %%xmm3, %%xmm11\n"
18959           "movapd %%xmm4, %%xmm12\n"
18960           "movapd %%xmm4, %%xmm13\n"
18961           "addpd %%xmm5, %%xmm12\n"
18962           "subpd %%xmm5, %%xmm13\n"
18963           "movapd %%xmm6, %%xmm14\n"
18964           "movapd %%xmm6, %%xmm15\n"
18965           "addpd %%xmm7, %%xmm14\n"
18966           "subpd %%xmm7, %%xmm15\n"
18967           "movapd %%xmm8, %%xmm0\n"
18968           "movapd %%xmm8, %%xmm2\n"
18969           "addpd %%xmm10, %%xmm0\n"
18970           "subpd %%xmm10, %%xmm2\n"
18971           "movapd %%xmm9, %%xmm1\n"
18972           "movapd %%xmm9, %%xmm3\n"
18973           "addpd %%xmm11, %%xmm1\n"
18974           "subpd %%xmm11, %%xmm3\n"
18975           "movapd %%xmm12, %%xmm4\n"
18976           "movapd %%xmm12, %%xmm6\n"
18977           "addpd %%xmm14, %%xmm4\n"
18978           "subpd %%xmm14, %%xmm6\n"
18979           "movapd %%xmm13, %%xmm5\n"
18980           "movapd %%xmm13, %%xmm7\n"
18981           "addpd %%xmm15, %%xmm5\n"
18982           "subpd %%xmm15, %%xmm7\n"
18983           "movapd %%xmm0, %%xmm8\n"
18984           "movapd %%xmm0, %%xmm12\n"
18985           "addpd %%xmm4, %%xmm8\n"
18986           "subpd %%xmm4, %%xmm12\n"
18987           "movapd %%xmm1, %%xmm9\n"
18988           "movapd %%xmm1, %%xmm13\n"
18989           "addpd %%xmm5, %%xmm9\n"
18990           "subpd %%xmm5, %%xmm13\n"
18991           "movapd %%xmm2, %%xmm10\n"
18992           "movapd %%xmm2, %%xmm14\n"
18993           "addpd %%xmm6, %%xmm10\n"
18994           "subpd %%xmm6, %%xmm14\n"
18995           "movapd %%xmm3, %%xmm11\n"
18996           "movapd %%xmm3, %%xmm15\n"
18997           "addpd %%xmm7, %%xmm11\n"
18998           "subpd %%xmm7, %%xmm15\n"
18999           "movupd %%xmm8, (%0)\n"
19000           "movupd %%xmm9, (%1)\n"
19001           "movupd %%xmm10, (%2)\n"
19002           "movupd %%xmm11, (%3)\n"
19003           "movupd %%xmm12, (%4)\n"
19004           "movupd %%xmm13, (%5)\n"
19005           "movupd %%xmm14, (%6)\n"
19006           "movupd %%xmm15, (%7)\n"
19007           :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
19008         );
19009       }
19010     }
19011     for (int j = 0; j < 4096; j += 1024) {
19012       for (int k = 0; k < 128; k += 2) {
19013         __asm__ volatile (
19014           "movupd (%0), %%xmm0\n"
19015           "movupd (%1), %%xmm1\n"
19016           "movupd (%2), %%xmm2\n"
19017           "movupd (%3), %%xmm3\n"
19018           "movupd (%4), %%xmm4\n"
19019           "movupd (%5), %%xmm5\n"
19020           "movupd (%6), %%xmm6\n"
19021           "movupd (%7), %%xmm7\n"
19022           "movapd %%xmm0, %%xmm8\n"
19023           "movapd %%xmm0, %%xmm9\n"
19024           "addpd %%xmm1, %%xmm8\n"
19025           "subpd %%xmm1, %%xmm9\n"
19026           "movapd %%xmm2, %%xmm10\n"
19027           "movapd %%xmm2, %%xmm11\n"
19028           "addpd %%xmm3, %%xmm10\n"
19029           "subpd %%xmm3, %%xmm11\n"
19030           "movapd %%xmm4, %%xmm12\n"
19031           "movapd %%xmm4, %%xmm13\n"
19032           "addpd %%xmm5, %%xmm12\n"
19033           "subpd %%xmm5, %%xmm13\n"
19034           "movapd %%xmm6, %%xmm14\n"
19035           "movapd %%xmm6, %%xmm15\n"
19036           "addpd %%xmm7, %%xmm14\n"
19037           "subpd %%xmm7, %%xmm15\n"
19038           "movapd %%xmm8, %%xmm0\n"
19039           "movapd %%xmm8, %%xmm2\n"
19040           "addpd %%xmm10, %%xmm0\n"
19041           "subpd %%xmm10, %%xmm2\n"
19042           "movapd %%xmm9, %%xmm1\n"
19043           "movapd %%xmm9, %%xmm3\n"
19044           "addpd %%xmm11, %%xmm1\n"
19045           "subpd %%xmm11, %%xmm3\n"
19046           "movapd %%xmm12, %%xmm4\n"
19047           "movapd %%xmm12, %%xmm6\n"
19048           "addpd %%xmm14, %%xmm4\n"
19049           "subpd %%xmm14, %%xmm6\n"
19050           "movapd %%xmm13, %%xmm5\n"
19051           "movapd %%xmm13, %%xmm7\n"
19052           "addpd %%xmm15, %%xmm5\n"
19053           "subpd %%xmm15, %%xmm7\n"
19054           "movapd %%xmm0, %%xmm8\n"
19055           "movapd %%xmm0, %%xmm12\n"
19056           "addpd %%xmm4, %%xmm8\n"
19057           "subpd %%xmm4, %%xmm12\n"
19058           "movapd %%xmm1, %%xmm9\n"
19059           "movapd %%xmm1, %%xmm13\n"
19060           "addpd %%xmm5, %%xmm9\n"
19061           "subpd %%xmm5, %%xmm13\n"
19062           "movapd %%xmm2, %%xmm10\n"
19063           "movapd %%xmm2, %%xmm14\n"
19064           "addpd %%xmm6, %%xmm10\n"
19065           "subpd %%xmm6, %%xmm14\n"
19066           "movapd %%xmm3, %%xmm11\n"
19067           "movapd %%xmm3, %%xmm15\n"
19068           "addpd %%xmm7, %%xmm11\n"
19069           "subpd %%xmm7, %%xmm15\n"
19070           "movupd %%xmm8, (%0)\n"
19071           "movupd %%xmm9, (%1)\n"
19072           "movupd %%xmm10, (%2)\n"
19073           "movupd %%xmm11, (%3)\n"
19074           "movupd %%xmm12, (%4)\n"
19075           "movupd %%xmm13, (%5)\n"
19076           "movupd %%xmm14, (%6)\n"
19077           "movupd %%xmm15, (%7)\n"
19078           :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
19079         );
19080       }
19081     }
19082     for (int j = 0; j < 4096; j += 4096) {
19083       for (int k = 0; k < 1024; k += 2) {
19084         __asm__ volatile (
19085           "movupd (%0), %%xmm0\n"
19086           "movupd (%1), %%xmm1\n"
19087           "movupd (%2), %%xmm2\n"
19088           "movupd (%3), %%xmm3\n"
19089           "movapd %%xmm0, %%xmm8\n"
19090           "movapd %%xmm0, %%xmm9\n"
19091           "addpd %%xmm1, %%xmm8\n"
19092           "subpd %%xmm1, %%xmm9\n"
19093           "movapd %%xmm2, %%xmm10\n"
19094           "movapd %%xmm2, %%xmm11\n"
19095           "addpd %%xmm3, %%xmm10\n"
19096           "subpd %%xmm3, %%xmm11\n"
19097           "movapd %%xmm8, %%xmm0\n"
19098           "movapd %%xmm8, %%xmm2\n"
19099           "addpd %%xmm10, %%xmm0\n"
19100           "subpd %%xmm10, %%xmm2\n"
19101           "movapd %%xmm9, %%xmm1\n"
19102           "movapd %%xmm9, %%xmm3\n"
19103           "addpd %%xmm11, %%xmm1\n"
19104           "subpd %%xmm11, %%xmm3\n"
19105           "movupd %%xmm0, (%0)\n"
19106           "movupd %%xmm1, (%1)\n"
19107           "movupd %%xmm2, (%2)\n"
19108           "movupd %%xmm3, (%3)\n"
19109           :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
19110         );
19111       }
19112     }
19113     return;
19114   }
19115   if (depth == 15) {
19116     helper_double_20_recursive(buf + 0, 12);
19117     helper_double_20_recursive(buf + 4096, 12);
19118     helper_double_20_recursive(buf + 8192, 12);
19119     helper_double_20_recursive(buf + 12288, 12);
19120     helper_double_20_recursive(buf + 16384, 12);
19121     helper_double_20_recursive(buf + 20480, 12);
19122     helper_double_20_recursive(buf + 24576, 12);
19123     helper_double_20_recursive(buf + 28672, 12);
19124     for (int j = 0; j < 32768; j += 32768) {
19125       for (int k = 0; k < 4096; k += 2) {
19126         __asm__ volatile (
19127           "movupd (%0), %%xmm0\n"
19128           "movupd (%1), %%xmm1\n"
19129           "movupd (%2), %%xmm2\n"
19130           "movupd (%3), %%xmm3\n"
19131           "movupd (%4), %%xmm4\n"
19132           "movupd (%5), %%xmm5\n"
19133           "movupd (%6), %%xmm6\n"
19134           "movupd (%7), %%xmm7\n"
19135           "movapd %%xmm0, %%xmm8\n"
19136           "movapd %%xmm0, %%xmm9\n"
19137           "addpd %%xmm1, %%xmm8\n"
19138           "subpd %%xmm1, %%xmm9\n"
19139           "movapd %%xmm2, %%xmm10\n"
19140           "movapd %%xmm2, %%xmm11\n"
19141           "addpd %%xmm3, %%xmm10\n"
19142           "subpd %%xmm3, %%xmm11\n"
19143           "movapd %%xmm4, %%xmm12\n"
19144           "movapd %%xmm4, %%xmm13\n"
19145           "addpd %%xmm5, %%xmm12\n"
19146           "subpd %%xmm5, %%xmm13\n"
19147           "movapd %%xmm6, %%xmm14\n"
19148           "movapd %%xmm6, %%xmm15\n"
19149           "addpd %%xmm7, %%xmm14\n"
19150           "subpd %%xmm7, %%xmm15\n"
19151           "movapd %%xmm8, %%xmm0\n"
19152           "movapd %%xmm8, %%xmm2\n"
19153           "addpd %%xmm10, %%xmm0\n"
19154           "subpd %%xmm10, %%xmm2\n"
19155           "movapd %%xmm9, %%xmm1\n"
19156           "movapd %%xmm9, %%xmm3\n"
19157           "addpd %%xmm11, %%xmm1\n"
19158           "subpd %%xmm11, %%xmm3\n"
19159           "movapd %%xmm12, %%xmm4\n"
19160           "movapd %%xmm12, %%xmm6\n"
19161           "addpd %%xmm14, %%xmm4\n"
19162           "subpd %%xmm14, %%xmm6\n"
19163           "movapd %%xmm13, %%xmm5\n"
19164           "movapd %%xmm13, %%xmm7\n"
19165           "addpd %%xmm15, %%xmm5\n"
19166           "subpd %%xmm15, %%xmm7\n"
19167           "movapd %%xmm0, %%xmm8\n"
19168           "movapd %%xmm0, %%xmm12\n"
19169           "addpd %%xmm4, %%xmm8\n"
19170           "subpd %%xmm4, %%xmm12\n"
19171           "movapd %%xmm1, %%xmm9\n"
19172           "movapd %%xmm1, %%xmm13\n"
19173           "addpd %%xmm5, %%xmm9\n"
19174           "subpd %%xmm5, %%xmm13\n"
19175           "movapd %%xmm2, %%xmm10\n"
19176           "movapd %%xmm2, %%xmm14\n"
19177           "addpd %%xmm6, %%xmm10\n"
19178           "subpd %%xmm6, %%xmm14\n"
19179           "movapd %%xmm3, %%xmm11\n"
19180           "movapd %%xmm3, %%xmm15\n"
19181           "addpd %%xmm7, %%xmm11\n"
19182           "subpd %%xmm7, %%xmm15\n"
19183           "movupd %%xmm8, (%0)\n"
19184           "movupd %%xmm9, (%1)\n"
19185           "movupd %%xmm10, (%2)\n"
19186           "movupd %%xmm11, (%3)\n"
19187           "movupd %%xmm12, (%4)\n"
19188           "movupd %%xmm13, (%5)\n"
19189           "movupd %%xmm14, (%6)\n"
19190           "movupd %%xmm15, (%7)\n"
19191           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
19192         );
19193       }
19194     }
19195     return;
19196   }
19197   if (depth == 18) {
19198     helper_double_20_recursive(buf + 0, 15);
19199     helper_double_20_recursive(buf + 32768, 15);
19200     helper_double_20_recursive(buf + 65536, 15);
19201     helper_double_20_recursive(buf + 98304, 15);
19202     helper_double_20_recursive(buf + 131072, 15);
19203     helper_double_20_recursive(buf + 163840, 15);
19204     helper_double_20_recursive(buf + 196608, 15);
19205     helper_double_20_recursive(buf + 229376, 15);
19206     for (int j = 0; j < 262144; j += 262144) {
19207       for (int k = 0; k < 32768; k += 2) {
19208         __asm__ volatile (
19209           "movupd (%0), %%xmm0\n"
19210           "movupd (%1), %%xmm1\n"
19211           "movupd (%2), %%xmm2\n"
19212           "movupd (%3), %%xmm3\n"
19213           "movupd (%4), %%xmm4\n"
19214           "movupd (%5), %%xmm5\n"
19215           "movupd (%6), %%xmm6\n"
19216           "movupd (%7), %%xmm7\n"
19217           "movapd %%xmm0, %%xmm8\n"
19218           "movapd %%xmm0, %%xmm9\n"
19219           "addpd %%xmm1, %%xmm8\n"
19220           "subpd %%xmm1, %%xmm9\n"
19221           "movapd %%xmm2, %%xmm10\n"
19222           "movapd %%xmm2, %%xmm11\n"
19223           "addpd %%xmm3, %%xmm10\n"
19224           "subpd %%xmm3, %%xmm11\n"
19225           "movapd %%xmm4, %%xmm12\n"
19226           "movapd %%xmm4, %%xmm13\n"
19227           "addpd %%xmm5, %%xmm12\n"
19228           "subpd %%xmm5, %%xmm13\n"
19229           "movapd %%xmm6, %%xmm14\n"
19230           "movapd %%xmm6, %%xmm15\n"
19231           "addpd %%xmm7, %%xmm14\n"
19232           "subpd %%xmm7, %%xmm15\n"
19233           "movapd %%xmm8, %%xmm0\n"
19234           "movapd %%xmm8, %%xmm2\n"
19235           "addpd %%xmm10, %%xmm0\n"
19236           "subpd %%xmm10, %%xmm2\n"
19237           "movapd %%xmm9, %%xmm1\n"
19238           "movapd %%xmm9, %%xmm3\n"
19239           "addpd %%xmm11, %%xmm1\n"
19240           "subpd %%xmm11, %%xmm3\n"
19241           "movapd %%xmm12, %%xmm4\n"
19242           "movapd %%xmm12, %%xmm6\n"
19243           "addpd %%xmm14, %%xmm4\n"
19244           "subpd %%xmm14, %%xmm6\n"
19245           "movapd %%xmm13, %%xmm5\n"
19246           "movapd %%xmm13, %%xmm7\n"
19247           "addpd %%xmm15, %%xmm5\n"
19248           "subpd %%xmm15, %%xmm7\n"
19249           "movapd %%xmm0, %%xmm8\n"
19250           "movapd %%xmm0, %%xmm12\n"
19251           "addpd %%xmm4, %%xmm8\n"
19252           "subpd %%xmm4, %%xmm12\n"
19253           "movapd %%xmm1, %%xmm9\n"
19254           "movapd %%xmm1, %%xmm13\n"
19255           "addpd %%xmm5, %%xmm9\n"
19256           "subpd %%xmm5, %%xmm13\n"
19257           "movapd %%xmm2, %%xmm10\n"
19258           "movapd %%xmm2, %%xmm14\n"
19259           "addpd %%xmm6, %%xmm10\n"
19260           "subpd %%xmm6, %%xmm14\n"
19261           "movapd %%xmm3, %%xmm11\n"
19262           "movapd %%xmm3, %%xmm15\n"
19263           "addpd %%xmm7, %%xmm11\n"
19264           "subpd %%xmm7, %%xmm15\n"
19265           "movupd %%xmm8, (%0)\n"
19266           "movupd %%xmm9, (%1)\n"
19267           "movupd %%xmm10, (%2)\n"
19268           "movupd %%xmm11, (%3)\n"
19269           "movupd %%xmm12, (%4)\n"
19270           "movupd %%xmm13, (%5)\n"
19271           "movupd %%xmm14, (%6)\n"
19272           "movupd %%xmm15, (%7)\n"
19273           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
19274         );
19275       }
19276     }
19277     return;
19278   }
19279   if (depth == 20) {
19280     helper_double_20_recursive(buf + 0, 18);
19281     helper_double_20_recursive(buf + 262144, 18);
19282     helper_double_20_recursive(buf + 524288, 18);
19283     helper_double_20_recursive(buf + 786432, 18);
19284     for (int j = 0; j < 1048576; j += 1048576) {
19285       for (int k = 0; k < 262144; k += 2) {
19286         __asm__ volatile (
19287           "movupd (%0), %%xmm0\n"
19288           "movupd (%1), %%xmm1\n"
19289           "movupd (%2), %%xmm2\n"
19290           "movupd (%3), %%xmm3\n"
19291           "movapd %%xmm0, %%xmm8\n"
19292           "movapd %%xmm0, %%xmm9\n"
19293           "addpd %%xmm1, %%xmm8\n"
19294           "subpd %%xmm1, %%xmm9\n"
19295           "movapd %%xmm2, %%xmm10\n"
19296           "movapd %%xmm2, %%xmm11\n"
19297           "addpd %%xmm3, %%xmm10\n"
19298           "subpd %%xmm3, %%xmm11\n"
19299           "movapd %%xmm8, %%xmm0\n"
19300           "movapd %%xmm8, %%xmm2\n"
19301           "addpd %%xmm10, %%xmm0\n"
19302           "subpd %%xmm10, %%xmm2\n"
19303           "movapd %%xmm9, %%xmm1\n"
19304           "movapd %%xmm9, %%xmm3\n"
19305           "addpd %%xmm11, %%xmm1\n"
19306           "subpd %%xmm11, %%xmm3\n"
19307           "movupd %%xmm0, (%0)\n"
19308           "movupd %%xmm1, (%1)\n"
19309           "movupd %%xmm2, (%2)\n"
19310           "movupd %%xmm3, (%3)\n"
19311           :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
19312         );
19313       }
19314     }
19315     return;
19316   }
19317 }
/*
 * Public entry point for the size-2^20 double-precision transform kernel.
 *
 * Delegates to helper_double_20_recursive at its top-level depth of 20,
 * which (per the depth == 20 branch) processes 1048576 doubles in place
 * using the SSE2 add/sub butterfly passes above. `buf` must therefore
 * point to at least 1048576 doubles; the transform is performed in place.
 *
 * NOTE(review): alignment is not required — the asm kernels use movupd
 * (unaligned loads/stores) throughout.
 */
void helper_double_20(double *buf);
void helper_double_20(double *buf) {
  helper_double_20_recursive(buf, 20);
}
19322 void helper_double_21_recursive(double *buf, int depth);
helper_double_21_recursive(double * buf,int depth)19323 void helper_double_21_recursive(double *buf, int depth) {
19324   if (depth == 13) {
19325     for (int j = 0; j < 8192; j += 16) {
19326       for (int k = 0; k < 2; k += 2) {
19327         __asm__ volatile (
19328           "movupd (%0), %%xmm0\n"
19329           "movupd (%1), %%xmm1\n"
19330           "movupd (%2), %%xmm2\n"
19331           "movupd (%3), %%xmm3\n"
19332           "movupd (%4), %%xmm4\n"
19333           "movupd (%5), %%xmm5\n"
19334           "movupd (%6), %%xmm6\n"
19335           "movupd (%7), %%xmm7\n"
19336           "movapd %%xmm0, %%xmm8\n"
19337           "haddpd %%xmm8, %%xmm8\n"
19338           "movapd %%xmm0, %%xmm9\n"
19339           "hsubpd %%xmm9, %%xmm9\n"
19340           "blendpd $1, %%xmm8, %%xmm9\n"
19341           "movapd %%xmm9, %%xmm0\n"
19342           "movapd %%xmm1, %%xmm8\n"
19343           "haddpd %%xmm8, %%xmm8\n"
19344           "movapd %%xmm1, %%xmm9\n"
19345           "hsubpd %%xmm9, %%xmm9\n"
19346           "blendpd $1, %%xmm8, %%xmm9\n"
19347           "movapd %%xmm9, %%xmm1\n"
19348           "movapd %%xmm2, %%xmm8\n"
19349           "haddpd %%xmm8, %%xmm8\n"
19350           "movapd %%xmm2, %%xmm9\n"
19351           "hsubpd %%xmm9, %%xmm9\n"
19352           "blendpd $1, %%xmm8, %%xmm9\n"
19353           "movapd %%xmm9, %%xmm2\n"
19354           "movapd %%xmm3, %%xmm8\n"
19355           "haddpd %%xmm8, %%xmm8\n"
19356           "movapd %%xmm3, %%xmm9\n"
19357           "hsubpd %%xmm9, %%xmm9\n"
19358           "blendpd $1, %%xmm8, %%xmm9\n"
19359           "movapd %%xmm9, %%xmm3\n"
19360           "movapd %%xmm4, %%xmm8\n"
19361           "haddpd %%xmm8, %%xmm8\n"
19362           "movapd %%xmm4, %%xmm9\n"
19363           "hsubpd %%xmm9, %%xmm9\n"
19364           "blendpd $1, %%xmm8, %%xmm9\n"
19365           "movapd %%xmm9, %%xmm4\n"
19366           "movapd %%xmm5, %%xmm8\n"
19367           "haddpd %%xmm8, %%xmm8\n"
19368           "movapd %%xmm5, %%xmm9\n"
19369           "hsubpd %%xmm9, %%xmm9\n"
19370           "blendpd $1, %%xmm8, %%xmm9\n"
19371           "movapd %%xmm9, %%xmm5\n"
19372           "movapd %%xmm6, %%xmm8\n"
19373           "haddpd %%xmm8, %%xmm8\n"
19374           "movapd %%xmm6, %%xmm9\n"
19375           "hsubpd %%xmm9, %%xmm9\n"
19376           "blendpd $1, %%xmm8, %%xmm9\n"
19377           "movapd %%xmm9, %%xmm6\n"
19378           "movapd %%xmm7, %%xmm8\n"
19379           "haddpd %%xmm8, %%xmm8\n"
19380           "movapd %%xmm7, %%xmm9\n"
19381           "hsubpd %%xmm9, %%xmm9\n"
19382           "blendpd $1, %%xmm8, %%xmm9\n"
19383           "movapd %%xmm9, %%xmm7\n"
19384           "movapd %%xmm0, %%xmm8\n"
19385           "movapd %%xmm0, %%xmm9\n"
19386           "addpd %%xmm1, %%xmm8\n"
19387           "subpd %%xmm1, %%xmm9\n"
19388           "movapd %%xmm2, %%xmm10\n"
19389           "movapd %%xmm2, %%xmm11\n"
19390           "addpd %%xmm3, %%xmm10\n"
19391           "subpd %%xmm3, %%xmm11\n"
19392           "movapd %%xmm4, %%xmm12\n"
19393           "movapd %%xmm4, %%xmm13\n"
19394           "addpd %%xmm5, %%xmm12\n"
19395           "subpd %%xmm5, %%xmm13\n"
19396           "movapd %%xmm6, %%xmm14\n"
19397           "movapd %%xmm6, %%xmm15\n"
19398           "addpd %%xmm7, %%xmm14\n"
19399           "subpd %%xmm7, %%xmm15\n"
19400           "movapd %%xmm8, %%xmm0\n"
19401           "movapd %%xmm8, %%xmm2\n"
19402           "addpd %%xmm10, %%xmm0\n"
19403           "subpd %%xmm10, %%xmm2\n"
19404           "movapd %%xmm9, %%xmm1\n"
19405           "movapd %%xmm9, %%xmm3\n"
19406           "addpd %%xmm11, %%xmm1\n"
19407           "subpd %%xmm11, %%xmm3\n"
19408           "movapd %%xmm12, %%xmm4\n"
19409           "movapd %%xmm12, %%xmm6\n"
19410           "addpd %%xmm14, %%xmm4\n"
19411           "subpd %%xmm14, %%xmm6\n"
19412           "movapd %%xmm13, %%xmm5\n"
19413           "movapd %%xmm13, %%xmm7\n"
19414           "addpd %%xmm15, %%xmm5\n"
19415           "subpd %%xmm15, %%xmm7\n"
19416           "movapd %%xmm0, %%xmm8\n"
19417           "movapd %%xmm0, %%xmm12\n"
19418           "addpd %%xmm4, %%xmm8\n"
19419           "subpd %%xmm4, %%xmm12\n"
19420           "movapd %%xmm1, %%xmm9\n"
19421           "movapd %%xmm1, %%xmm13\n"
19422           "addpd %%xmm5, %%xmm9\n"
19423           "subpd %%xmm5, %%xmm13\n"
19424           "movapd %%xmm2, %%xmm10\n"
19425           "movapd %%xmm2, %%xmm14\n"
19426           "addpd %%xmm6, %%xmm10\n"
19427           "subpd %%xmm6, %%xmm14\n"
19428           "movapd %%xmm3, %%xmm11\n"
19429           "movapd %%xmm3, %%xmm15\n"
19430           "addpd %%xmm7, %%xmm11\n"
19431           "subpd %%xmm7, %%xmm15\n"
19432           "movupd %%xmm8, (%0)\n"
19433           "movupd %%xmm9, (%1)\n"
19434           "movupd %%xmm10, (%2)\n"
19435           "movupd %%xmm11, (%3)\n"
19436           "movupd %%xmm12, (%4)\n"
19437           "movupd %%xmm13, (%5)\n"
19438           "movupd %%xmm14, (%6)\n"
19439           "movupd %%xmm15, (%7)\n"
19440           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
19441         );
19442       }
19443     }
19444     for (int j = 0; j < 8192; j += 128) {
19445       for (int k = 0; k < 16; k += 2) {
19446         __asm__ volatile (
19447           "movupd (%0), %%xmm0\n"
19448           "movupd (%1), %%xmm1\n"
19449           "movupd (%2), %%xmm2\n"
19450           "movupd (%3), %%xmm3\n"
19451           "movupd (%4), %%xmm4\n"
19452           "movupd (%5), %%xmm5\n"
19453           "movupd (%6), %%xmm6\n"
19454           "movupd (%7), %%xmm7\n"
19455           "movapd %%xmm0, %%xmm8\n"
19456           "movapd %%xmm0, %%xmm9\n"
19457           "addpd %%xmm1, %%xmm8\n"
19458           "subpd %%xmm1, %%xmm9\n"
19459           "movapd %%xmm2, %%xmm10\n"
19460           "movapd %%xmm2, %%xmm11\n"
19461           "addpd %%xmm3, %%xmm10\n"
19462           "subpd %%xmm3, %%xmm11\n"
19463           "movapd %%xmm4, %%xmm12\n"
19464           "movapd %%xmm4, %%xmm13\n"
19465           "addpd %%xmm5, %%xmm12\n"
19466           "subpd %%xmm5, %%xmm13\n"
19467           "movapd %%xmm6, %%xmm14\n"
19468           "movapd %%xmm6, %%xmm15\n"
19469           "addpd %%xmm7, %%xmm14\n"
19470           "subpd %%xmm7, %%xmm15\n"
19471           "movapd %%xmm8, %%xmm0\n"
19472           "movapd %%xmm8, %%xmm2\n"
19473           "addpd %%xmm10, %%xmm0\n"
19474           "subpd %%xmm10, %%xmm2\n"
19475           "movapd %%xmm9, %%xmm1\n"
19476           "movapd %%xmm9, %%xmm3\n"
19477           "addpd %%xmm11, %%xmm1\n"
19478           "subpd %%xmm11, %%xmm3\n"
19479           "movapd %%xmm12, %%xmm4\n"
19480           "movapd %%xmm12, %%xmm6\n"
19481           "addpd %%xmm14, %%xmm4\n"
19482           "subpd %%xmm14, %%xmm6\n"
19483           "movapd %%xmm13, %%xmm5\n"
19484           "movapd %%xmm13, %%xmm7\n"
19485           "addpd %%xmm15, %%xmm5\n"
19486           "subpd %%xmm15, %%xmm7\n"
19487           "movapd %%xmm0, %%xmm8\n"
19488           "movapd %%xmm0, %%xmm12\n"
19489           "addpd %%xmm4, %%xmm8\n"
19490           "subpd %%xmm4, %%xmm12\n"
19491           "movapd %%xmm1, %%xmm9\n"
19492           "movapd %%xmm1, %%xmm13\n"
19493           "addpd %%xmm5, %%xmm9\n"
19494           "subpd %%xmm5, %%xmm13\n"
19495           "movapd %%xmm2, %%xmm10\n"
19496           "movapd %%xmm2, %%xmm14\n"
19497           "addpd %%xmm6, %%xmm10\n"
19498           "subpd %%xmm6, %%xmm14\n"
19499           "movapd %%xmm3, %%xmm11\n"
19500           "movapd %%xmm3, %%xmm15\n"
19501           "addpd %%xmm7, %%xmm11\n"
19502           "subpd %%xmm7, %%xmm15\n"
19503           "movupd %%xmm8, (%0)\n"
19504           "movupd %%xmm9, (%1)\n"
19505           "movupd %%xmm10, (%2)\n"
19506           "movupd %%xmm11, (%3)\n"
19507           "movupd %%xmm12, (%4)\n"
19508           "movupd %%xmm13, (%5)\n"
19509           "movupd %%xmm14, (%6)\n"
19510           "movupd %%xmm15, (%7)\n"
19511           :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
19512         );
19513       }
19514     }
19515     for (int j = 0; j < 8192; j += 1024) {
19516       for (int k = 0; k < 128; k += 2) {
19517         __asm__ volatile (
19518           "movupd (%0), %%xmm0\n"
19519           "movupd (%1), %%xmm1\n"
19520           "movupd (%2), %%xmm2\n"
19521           "movupd (%3), %%xmm3\n"
19522           "movupd (%4), %%xmm4\n"
19523           "movupd (%5), %%xmm5\n"
19524           "movupd (%6), %%xmm6\n"
19525           "movupd (%7), %%xmm7\n"
19526           "movapd %%xmm0, %%xmm8\n"
19527           "movapd %%xmm0, %%xmm9\n"
19528           "addpd %%xmm1, %%xmm8\n"
19529           "subpd %%xmm1, %%xmm9\n"
19530           "movapd %%xmm2, %%xmm10\n"
19531           "movapd %%xmm2, %%xmm11\n"
19532           "addpd %%xmm3, %%xmm10\n"
19533           "subpd %%xmm3, %%xmm11\n"
19534           "movapd %%xmm4, %%xmm12\n"
19535           "movapd %%xmm4, %%xmm13\n"
19536           "addpd %%xmm5, %%xmm12\n"
19537           "subpd %%xmm5, %%xmm13\n"
19538           "movapd %%xmm6, %%xmm14\n"
19539           "movapd %%xmm6, %%xmm15\n"
19540           "addpd %%xmm7, %%xmm14\n"
19541           "subpd %%xmm7, %%xmm15\n"
19542           "movapd %%xmm8, %%xmm0\n"
19543           "movapd %%xmm8, %%xmm2\n"
19544           "addpd %%xmm10, %%xmm0\n"
19545           "subpd %%xmm10, %%xmm2\n"
19546           "movapd %%xmm9, %%xmm1\n"
19547           "movapd %%xmm9, %%xmm3\n"
19548           "addpd %%xmm11, %%xmm1\n"
19549           "subpd %%xmm11, %%xmm3\n"
19550           "movapd %%xmm12, %%xmm4\n"
19551           "movapd %%xmm12, %%xmm6\n"
19552           "addpd %%xmm14, %%xmm4\n"
19553           "subpd %%xmm14, %%xmm6\n"
19554           "movapd %%xmm13, %%xmm5\n"
19555           "movapd %%xmm13, %%xmm7\n"
19556           "addpd %%xmm15, %%xmm5\n"
19557           "subpd %%xmm15, %%xmm7\n"
19558           "movapd %%xmm0, %%xmm8\n"
19559           "movapd %%xmm0, %%xmm12\n"
19560           "addpd %%xmm4, %%xmm8\n"
19561           "subpd %%xmm4, %%xmm12\n"
19562           "movapd %%xmm1, %%xmm9\n"
19563           "movapd %%xmm1, %%xmm13\n"
19564           "addpd %%xmm5, %%xmm9\n"
19565           "subpd %%xmm5, %%xmm13\n"
19566           "movapd %%xmm2, %%xmm10\n"
19567           "movapd %%xmm2, %%xmm14\n"
19568           "addpd %%xmm6, %%xmm10\n"
19569           "subpd %%xmm6, %%xmm14\n"
19570           "movapd %%xmm3, %%xmm11\n"
19571           "movapd %%xmm3, %%xmm15\n"
19572           "addpd %%xmm7, %%xmm11\n"
19573           "subpd %%xmm7, %%xmm15\n"
19574           "movupd %%xmm8, (%0)\n"
19575           "movupd %%xmm9, (%1)\n"
19576           "movupd %%xmm10, (%2)\n"
19577           "movupd %%xmm11, (%3)\n"
19578           "movupd %%xmm12, (%4)\n"
19579           "movupd %%xmm13, (%5)\n"
19580           "movupd %%xmm14, (%6)\n"
19581           "movupd %%xmm15, (%7)\n"
19582           :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
19583         );
19584       }
19585     }
19586     for (int j = 0; j < 8192; j += 8192) {
19587       for (int k = 0; k < 1024; k += 2) {
19588         __asm__ volatile (
19589           "movupd (%0), %%xmm0\n"
19590           "movupd (%1), %%xmm1\n"
19591           "movupd (%2), %%xmm2\n"
19592           "movupd (%3), %%xmm3\n"
19593           "movupd (%4), %%xmm4\n"
19594           "movupd (%5), %%xmm5\n"
19595           "movupd (%6), %%xmm6\n"
19596           "movupd (%7), %%xmm7\n"
19597           "movapd %%xmm0, %%xmm8\n"
19598           "movapd %%xmm0, %%xmm9\n"
19599           "addpd %%xmm1, %%xmm8\n"
19600           "subpd %%xmm1, %%xmm9\n"
19601           "movapd %%xmm2, %%xmm10\n"
19602           "movapd %%xmm2, %%xmm11\n"
19603           "addpd %%xmm3, %%xmm10\n"
19604           "subpd %%xmm3, %%xmm11\n"
19605           "movapd %%xmm4, %%xmm12\n"
19606           "movapd %%xmm4, %%xmm13\n"
19607           "addpd %%xmm5, %%xmm12\n"
19608           "subpd %%xmm5, %%xmm13\n"
19609           "movapd %%xmm6, %%xmm14\n"
19610           "movapd %%xmm6, %%xmm15\n"
19611           "addpd %%xmm7, %%xmm14\n"
19612           "subpd %%xmm7, %%xmm15\n"
19613           "movapd %%xmm8, %%xmm0\n"
19614           "movapd %%xmm8, %%xmm2\n"
19615           "addpd %%xmm10, %%xmm0\n"
19616           "subpd %%xmm10, %%xmm2\n"
19617           "movapd %%xmm9, %%xmm1\n"
19618           "movapd %%xmm9, %%xmm3\n"
19619           "addpd %%xmm11, %%xmm1\n"
19620           "subpd %%xmm11, %%xmm3\n"
19621           "movapd %%xmm12, %%xmm4\n"
19622           "movapd %%xmm12, %%xmm6\n"
19623           "addpd %%xmm14, %%xmm4\n"
19624           "subpd %%xmm14, %%xmm6\n"
19625           "movapd %%xmm13, %%xmm5\n"
19626           "movapd %%xmm13, %%xmm7\n"
19627           "addpd %%xmm15, %%xmm5\n"
19628           "subpd %%xmm15, %%xmm7\n"
19629           "movapd %%xmm0, %%xmm8\n"
19630           "movapd %%xmm0, %%xmm12\n"
19631           "addpd %%xmm4, %%xmm8\n"
19632           "subpd %%xmm4, %%xmm12\n"
19633           "movapd %%xmm1, %%xmm9\n"
19634           "movapd %%xmm1, %%xmm13\n"
19635           "addpd %%xmm5, %%xmm9\n"
19636           "subpd %%xmm5, %%xmm13\n"
19637           "movapd %%xmm2, %%xmm10\n"
19638           "movapd %%xmm2, %%xmm14\n"
19639           "addpd %%xmm6, %%xmm10\n"
19640           "subpd %%xmm6, %%xmm14\n"
19641           "movapd %%xmm3, %%xmm11\n"
19642           "movapd %%xmm3, %%xmm15\n"
19643           "addpd %%xmm7, %%xmm11\n"
19644           "subpd %%xmm7, %%xmm15\n"
19645           "movupd %%xmm8, (%0)\n"
19646           "movupd %%xmm9, (%1)\n"
19647           "movupd %%xmm10, (%2)\n"
19648           "movupd %%xmm11, (%3)\n"
19649           "movupd %%xmm12, (%4)\n"
19650           "movupd %%xmm13, (%5)\n"
19651           "movupd %%xmm14, (%6)\n"
19652           "movupd %%xmm15, (%7)\n"
19653           :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
19654         );
19655       }
19656     }
19657     return;
19658   }
19659   if (depth == 16) {
19660     helper_double_21_recursive(buf + 0, 13);
19661     helper_double_21_recursive(buf + 8192, 13);
19662     helper_double_21_recursive(buf + 16384, 13);
19663     helper_double_21_recursive(buf + 24576, 13);
19664     helper_double_21_recursive(buf + 32768, 13);
19665     helper_double_21_recursive(buf + 40960, 13);
19666     helper_double_21_recursive(buf + 49152, 13);
19667     helper_double_21_recursive(buf + 57344, 13);
19668     for (int j = 0; j < 65536; j += 65536) {
19669       for (int k = 0; k < 8192; k += 2) {
19670         __asm__ volatile (
19671           "movupd (%0), %%xmm0\n"
19672           "movupd (%1), %%xmm1\n"
19673           "movupd (%2), %%xmm2\n"
19674           "movupd (%3), %%xmm3\n"
19675           "movupd (%4), %%xmm4\n"
19676           "movupd (%5), %%xmm5\n"
19677           "movupd (%6), %%xmm6\n"
19678           "movupd (%7), %%xmm7\n"
19679           "movapd %%xmm0, %%xmm8\n"
19680           "movapd %%xmm0, %%xmm9\n"
19681           "addpd %%xmm1, %%xmm8\n"
19682           "subpd %%xmm1, %%xmm9\n"
19683           "movapd %%xmm2, %%xmm10\n"
19684           "movapd %%xmm2, %%xmm11\n"
19685           "addpd %%xmm3, %%xmm10\n"
19686           "subpd %%xmm3, %%xmm11\n"
19687           "movapd %%xmm4, %%xmm12\n"
19688           "movapd %%xmm4, %%xmm13\n"
19689           "addpd %%xmm5, %%xmm12\n"
19690           "subpd %%xmm5, %%xmm13\n"
19691           "movapd %%xmm6, %%xmm14\n"
19692           "movapd %%xmm6, %%xmm15\n"
19693           "addpd %%xmm7, %%xmm14\n"
19694           "subpd %%xmm7, %%xmm15\n"
19695           "movapd %%xmm8, %%xmm0\n"
19696           "movapd %%xmm8, %%xmm2\n"
19697           "addpd %%xmm10, %%xmm0\n"
19698           "subpd %%xmm10, %%xmm2\n"
19699           "movapd %%xmm9, %%xmm1\n"
19700           "movapd %%xmm9, %%xmm3\n"
19701           "addpd %%xmm11, %%xmm1\n"
19702           "subpd %%xmm11, %%xmm3\n"
19703           "movapd %%xmm12, %%xmm4\n"
19704           "movapd %%xmm12, %%xmm6\n"
19705           "addpd %%xmm14, %%xmm4\n"
19706           "subpd %%xmm14, %%xmm6\n"
19707           "movapd %%xmm13, %%xmm5\n"
19708           "movapd %%xmm13, %%xmm7\n"
19709           "addpd %%xmm15, %%xmm5\n"
19710           "subpd %%xmm15, %%xmm7\n"
19711           "movapd %%xmm0, %%xmm8\n"
19712           "movapd %%xmm0, %%xmm12\n"
19713           "addpd %%xmm4, %%xmm8\n"
19714           "subpd %%xmm4, %%xmm12\n"
19715           "movapd %%xmm1, %%xmm9\n"
19716           "movapd %%xmm1, %%xmm13\n"
19717           "addpd %%xmm5, %%xmm9\n"
19718           "subpd %%xmm5, %%xmm13\n"
19719           "movapd %%xmm2, %%xmm10\n"
19720           "movapd %%xmm2, %%xmm14\n"
19721           "addpd %%xmm6, %%xmm10\n"
19722           "subpd %%xmm6, %%xmm14\n"
19723           "movapd %%xmm3, %%xmm11\n"
19724           "movapd %%xmm3, %%xmm15\n"
19725           "addpd %%xmm7, %%xmm11\n"
19726           "subpd %%xmm7, %%xmm15\n"
19727           "movupd %%xmm8, (%0)\n"
19728           "movupd %%xmm9, (%1)\n"
19729           "movupd %%xmm10, (%2)\n"
19730           "movupd %%xmm11, (%3)\n"
19731           "movupd %%xmm12, (%4)\n"
19732           "movupd %%xmm13, (%5)\n"
19733           "movupd %%xmm14, (%6)\n"
19734           "movupd %%xmm15, (%7)\n"
19735           :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
19736         );
19737       }
19738     }
19739     return;
19740   }
19741   if (depth == 19) {
19742     helper_double_21_recursive(buf + 0, 16);
19743     helper_double_21_recursive(buf + 65536, 16);
19744     helper_double_21_recursive(buf + 131072, 16);
19745     helper_double_21_recursive(buf + 196608, 16);
19746     helper_double_21_recursive(buf + 262144, 16);
19747     helper_double_21_recursive(buf + 327680, 16);
19748     helper_double_21_recursive(buf + 393216, 16);
19749     helper_double_21_recursive(buf + 458752, 16);
19750     for (int j = 0; j < 524288; j += 524288) {
19751       for (int k = 0; k < 65536; k += 2) {
19752         __asm__ volatile (
19753           "movupd (%0), %%xmm0\n"
19754           "movupd (%1), %%xmm1\n"
19755           "movupd (%2), %%xmm2\n"
19756           "movupd (%3), %%xmm3\n"
19757           "movupd (%4), %%xmm4\n"
19758           "movupd (%5), %%xmm5\n"
19759           "movupd (%6), %%xmm6\n"
19760           "movupd (%7), %%xmm7\n"
19761           "movapd %%xmm0, %%xmm8\n"
19762           "movapd %%xmm0, %%xmm9\n"
19763           "addpd %%xmm1, %%xmm8\n"
19764           "subpd %%xmm1, %%xmm9\n"
19765           "movapd %%xmm2, %%xmm10\n"
19766           "movapd %%xmm2, %%xmm11\n"
19767           "addpd %%xmm3, %%xmm10\n"
19768           "subpd %%xmm3, %%xmm11\n"
19769           "movapd %%xmm4, %%xmm12\n"
19770           "movapd %%xmm4, %%xmm13\n"
19771           "addpd %%xmm5, %%xmm12\n"
19772           "subpd %%xmm5, %%xmm13\n"
19773           "movapd %%xmm6, %%xmm14\n"
19774           "movapd %%xmm6, %%xmm15\n"
19775           "addpd %%xmm7, %%xmm14\n"
19776           "subpd %%xmm7, %%xmm15\n"
19777           "movapd %%xmm8, %%xmm0\n"
19778           "movapd %%xmm8, %%xmm2\n"
19779           "addpd %%xmm10, %%xmm0\n"
19780           "subpd %%xmm10, %%xmm2\n"
19781           "movapd %%xmm9, %%xmm1\n"
19782           "movapd %%xmm9, %%xmm3\n"
19783           "addpd %%xmm11, %%xmm1\n"
19784           "subpd %%xmm11, %%xmm3\n"
19785           "movapd %%xmm12, %%xmm4\n"
19786           "movapd %%xmm12, %%xmm6\n"
19787           "addpd %%xmm14, %%xmm4\n"
19788           "subpd %%xmm14, %%xmm6\n"
19789           "movapd %%xmm13, %%xmm5\n"
19790           "movapd %%xmm13, %%xmm7\n"
19791           "addpd %%xmm15, %%xmm5\n"
19792           "subpd %%xmm15, %%xmm7\n"
19793           "movapd %%xmm0, %%xmm8\n"
19794           "movapd %%xmm0, %%xmm12\n"
19795           "addpd %%xmm4, %%xmm8\n"
19796           "subpd %%xmm4, %%xmm12\n"
19797           "movapd %%xmm1, %%xmm9\n"
19798           "movapd %%xmm1, %%xmm13\n"
19799           "addpd %%xmm5, %%xmm9\n"
19800           "subpd %%xmm5, %%xmm13\n"
19801           "movapd %%xmm2, %%xmm10\n"
19802           "movapd %%xmm2, %%xmm14\n"
19803           "addpd %%xmm6, %%xmm10\n"
19804           "subpd %%xmm6, %%xmm14\n"
19805           "movapd %%xmm3, %%xmm11\n"
19806           "movapd %%xmm3, %%xmm15\n"
19807           "addpd %%xmm7, %%xmm11\n"
19808           "subpd %%xmm7, %%xmm15\n"
19809           "movupd %%xmm8, (%0)\n"
19810           "movupd %%xmm9, (%1)\n"
19811           "movupd %%xmm10, (%2)\n"
19812           "movupd %%xmm11, (%3)\n"
19813           "movupd %%xmm12, (%4)\n"
19814           "movupd %%xmm13, (%5)\n"
19815           "movupd %%xmm14, (%6)\n"
19816           "movupd %%xmm15, (%7)\n"
19817           :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
19818         );
19819       }
19820     }
19821     return;
19822   }
19823   if (depth == 21) {
19824     helper_double_21_recursive(buf + 0, 19);
19825     helper_double_21_recursive(buf + 524288, 19);
19826     helper_double_21_recursive(buf + 1048576, 19);
19827     helper_double_21_recursive(buf + 1572864, 19);
19828     for (int j = 0; j < 2097152; j += 2097152) {
19829       for (int k = 0; k < 524288; k += 2) {
19830         __asm__ volatile (
19831           "movupd (%0), %%xmm0\n"
19832           "movupd (%1), %%xmm1\n"
19833           "movupd (%2), %%xmm2\n"
19834           "movupd (%3), %%xmm3\n"
19835           "movapd %%xmm0, %%xmm8\n"
19836           "movapd %%xmm0, %%xmm9\n"
19837           "addpd %%xmm1, %%xmm8\n"
19838           "subpd %%xmm1, %%xmm9\n"
19839           "movapd %%xmm2, %%xmm10\n"
19840           "movapd %%xmm2, %%xmm11\n"
19841           "addpd %%xmm3, %%xmm10\n"
19842           "subpd %%xmm3, %%xmm11\n"
19843           "movapd %%xmm8, %%xmm0\n"
19844           "movapd %%xmm8, %%xmm2\n"
19845           "addpd %%xmm10, %%xmm0\n"
19846           "subpd %%xmm10, %%xmm2\n"
19847           "movapd %%xmm9, %%xmm1\n"
19848           "movapd %%xmm9, %%xmm3\n"
19849           "addpd %%xmm11, %%xmm1\n"
19850           "subpd %%xmm11, %%xmm3\n"
19851           "movupd %%xmm0, (%0)\n"
19852           "movupd %%xmm1, (%1)\n"
19853           "movupd %%xmm2, (%2)\n"
19854           "movupd %%xmm3, (%3)\n"
19855           :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
19856         );
19857       }
19858     }
19859     return;
19860   }
19861 }
void helper_double_21(double *buf);
/*
 * In-place unnormalized Fast Walsh-Hadamard transform of 2^21 doubles.
 * buf must point to (at least) 2097152 doubles; the work is done by the
 * recursive helper with the full depth of 21.
 */
void helper_double_21(double *buf) {
  helper_double_21_recursive(buf, 21);
}
void helper_double_22_recursive(double *buf, int depth);
/*
 * Unnormalized in-place Fast Walsh-Hadamard transform over 2^depth doubles
 * stored at buf. This function is machine-generated (FFHT code generator),
 * which explains the degenerate single-iteration `j` loops and the repeated
 * identical asm bodies: they come from a uniform code template.
 *
 * Recursion scheme: for each handled depth > 2 the array is split into 8
 * equal sub-arrays (4 at the top level, depth == 22), each sub-array is
 * transformed recursively, and the partial results are then merged with a
 * radix-8 (radix-4 at the top) add/subtract butterfly pass that processes
 * two doubles (one SSE register) per sub-array per iteration.
 *
 * Only depths 2, 5, 8, 11, 14, 17, 20 and 22 are ever reached when called
 * from helper_double_22(); any other depth falls through and does nothing.
 * All asm blocks clobber xmm0-xmm15 and memory; loads/stores use unaligned
 * movupd, so buf needs no particular alignment.
 */
void helper_double_22_recursive(double *buf, int depth) {
  if (depth == 2) {
    /* Base case: length-4 transform. Both loops execute exactly once. */
    for (int j = 0; j < 4; j += 4) {
      for (int k = 0; k < 2; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"   /* xmm0 = { buf[0], buf[1] } */
          "movupd (%1), %%xmm1\n"   /* xmm1 = { buf[2], buf[3] } */
          /* Stride-1 stage on xmm0: haddpd gives the pair sum in both lanes,
           * hsubpd the pair difference; blendpd $1 keeps the sum in the low
           * lane and the difference in the high lane. */
          "movapd %%xmm0, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm0\n"
          /* Same stride-1 stage on xmm1. */
          "movapd %%xmm1, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm1, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm1\n"
          /* Stride-2 stage: lane-wise sum/difference of the two halves. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 5) {
    /* 32 = 8 x 4: transform the eight length-4 sub-arrays, then merge them
     * with a radix-8 butterfly (three stride-doubling stages fused into one
     * asm block: %0..%7 are the eight sub-array cursors, stride 4). */
    helper_double_22_recursive(buf + 0, 2);
    helper_double_22_recursive(buf + 4, 2);
    helper_double_22_recursive(buf + 8, 2);
    helper_double_22_recursive(buf + 12, 2);
    helper_double_22_recursive(buf + 16, 2);
    helper_double_22_recursive(buf + 20, 2);
    helper_double_22_recursive(buf + 24, 2);
    helper_double_22_recursive(buf + 28, 2);
    for (int j = 0; j < 32; j += 32) {
      for (int k = 0; k < 4; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          /* Stage 1: (0,1) (2,3) (4,5) (6,7) pairwise sum/difference. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          /* Stage 2: combine quarters two apart. */
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          /* Stage 3: combine halves four apart, results land in xmm8-xmm15. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 8) {
    /* 256 = 8 x 32; same radix-8 butterfly as the depth == 5 branch, with
     * sub-array stride 32. */
    helper_double_22_recursive(buf + 0, 5);
    helper_double_22_recursive(buf + 32, 5);
    helper_double_22_recursive(buf + 64, 5);
    helper_double_22_recursive(buf + 96, 5);
    helper_double_22_recursive(buf + 128, 5);
    helper_double_22_recursive(buf + 160, 5);
    helper_double_22_recursive(buf + 192, 5);
    helper_double_22_recursive(buf + 224, 5);
    for (int j = 0; j < 256; j += 256) {
      for (int k = 0; k < 32; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 11) {
    /* 2048 = 8 x 256; radix-8 merge with sub-array stride 256. */
    helper_double_22_recursive(buf + 0, 8);
    helper_double_22_recursive(buf + 256, 8);
    helper_double_22_recursive(buf + 512, 8);
    helper_double_22_recursive(buf + 768, 8);
    helper_double_22_recursive(buf + 1024, 8);
    helper_double_22_recursive(buf + 1280, 8);
    helper_double_22_recursive(buf + 1536, 8);
    helper_double_22_recursive(buf + 1792, 8);
    for (int j = 0; j < 2048; j += 2048) {
      for (int k = 0; k < 256; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 14) {
    /* 16384 = 8 x 2048; radix-8 merge with sub-array stride 2048. */
    helper_double_22_recursive(buf + 0, 11);
    helper_double_22_recursive(buf + 2048, 11);
    helper_double_22_recursive(buf + 4096, 11);
    helper_double_22_recursive(buf + 6144, 11);
    helper_double_22_recursive(buf + 8192, 11);
    helper_double_22_recursive(buf + 10240, 11);
    helper_double_22_recursive(buf + 12288, 11);
    helper_double_22_recursive(buf + 14336, 11);
    for (int j = 0; j < 16384; j += 16384) {
      for (int k = 0; k < 2048; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 17) {
    /* 131072 = 8 x 16384; radix-8 merge with sub-array stride 16384. */
    helper_double_22_recursive(buf + 0, 14);
    helper_double_22_recursive(buf + 16384, 14);
    helper_double_22_recursive(buf + 32768, 14);
    helper_double_22_recursive(buf + 49152, 14);
    helper_double_22_recursive(buf + 65536, 14);
    helper_double_22_recursive(buf + 81920, 14);
    helper_double_22_recursive(buf + 98304, 14);
    helper_double_22_recursive(buf + 114688, 14);
    for (int j = 0; j < 131072; j += 131072) {
      for (int k = 0; k < 16384; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 20) {
    /* 1048576 = 8 x 131072; radix-8 merge with sub-array stride 131072. */
    helper_double_22_recursive(buf + 0, 17);
    helper_double_22_recursive(buf + 131072, 17);
    helper_double_22_recursive(buf + 262144, 17);
    helper_double_22_recursive(buf + 393216, 17);
    helper_double_22_recursive(buf + 524288, 17);
    helper_double_22_recursive(buf + 655360, 17);
    helper_double_22_recursive(buf + 786432, 17);
    helper_double_22_recursive(buf + 917504, 17);
    for (int j = 0; j < 1048576; j += 1048576) {
      for (int k = 0; k < 131072; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 22) {
    /* Top level: 4194304 = 4 x 1048576. Only a radix-4 merge remains
     * (22 = 20 + 2 levels), so four quarters and a two-stage butterfly. */
    helper_double_22_recursive(buf + 0, 20);
    helper_double_22_recursive(buf + 1048576, 20);
    helper_double_22_recursive(buf + 2097152, 20);
    helper_double_22_recursive(buf + 3145728, 20);
    for (int j = 0; j < 4194304; j += 4194304) {
      for (int k = 0; k < 1048576; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          /* Stage 1: sum/difference of adjacent quarters. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          /* Stage 2: sum/difference of the two halves. */
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movupd %%xmm0, (%0)\n"
          "movupd %%xmm1, (%1)\n"
          "movupd %%xmm2, (%2)\n"
          "movupd %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_double_22(double *buf);
/* In-place (unnormalized) Hadamard transform over 2^22 doubles.
 * Thin public entry point: hands the whole buffer to the recursive
 * worker, starting at the top-level depth of 22. */
void helper_double_22(double *buf) {
  helper_double_22_recursive(buf, 22);
}
/*
 * Recursive worker for the (unnormalized) Fast Hadamard Transform over
 * 2^23 doubles, specialized by the code generator for SSE xmm registers
 * (2 doubles per vector).
 *
 * `depth` selects the sub-problem: the transform is applied in place to
 * the 2^depth contiguous doubles starting at `buf`.  Only the depths the
 * generator emits (2, 5, 8, 11, 14, 17, 20, 23) are handled; any other
 * value falls through all the `if` cases and the call is a no-op.  The
 * public entry point is helper_double_23(), which calls with depth == 23.
 *
 * Every non-base case first transforms eight sub-blocks of 2^(depth-3)
 * doubles recursively, then merges them with a radix-8 butterfly pass in
 * inline asm: for each pair of doubles it loads one xmm vector from the
 * matching position of each of the eight sub-blocks (operands %0..%7),
 * runs three levels of addpd/subpd butterflies across registers, and
 * stores the eight results back to the same addresses.  The outer `j`
 * loops run exactly once; the generator keeps them for uniformity.
 */
void helper_double_23_recursive(double *buf, int depth);
void helper_double_23_recursive(double *buf, int depth) {
  if (depth == 2) {
    /* Base case: 4-point Hadamard transform entirely in registers.
       For each loaded vector [a0, a1], haddpd/hsubpd plus blendpd $1
       build the within-vector butterfly [a0+a1, a0-a1]; the trailing
       addpd/subpd then combine the two 2-element vectors. */
    for (int j = 0; j < 4; j += 4) {
      for (int k = 0; k < 2; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movapd %%xmm0, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm0\n"
          "movapd %%xmm1, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm1, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 5) {
    /* 2^5 = 32 doubles: eight size-4 sub-transforms, then radix-8 merge
       at stride 4. */
    helper_double_23_recursive(buf + 0, 2);
    helper_double_23_recursive(buf + 4, 2);
    helper_double_23_recursive(buf + 8, 2);
    helper_double_23_recursive(buf + 12, 2);
    helper_double_23_recursive(buf + 16, 2);
    helper_double_23_recursive(buf + 20, 2);
    helper_double_23_recursive(buf + 24, 2);
    helper_double_23_recursive(buf + 28, 2);
    for (int j = 0; j < 32; j += 32) {
      for (int k = 0; k < 4; k += 2) {
        /* Radix-8 merge: load 8 vectors, 3 butterfly levels, store back. */
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 8) {
    /* 2^8 = 256 doubles: eight size-32 sub-transforms, radix-8 merge at
       stride 32.  Same asm pattern as depth 5, only the offsets differ. */
    helper_double_23_recursive(buf + 0, 5);
    helper_double_23_recursive(buf + 32, 5);
    helper_double_23_recursive(buf + 64, 5);
    helper_double_23_recursive(buf + 96, 5);
    helper_double_23_recursive(buf + 128, 5);
    helper_double_23_recursive(buf + 160, 5);
    helper_double_23_recursive(buf + 192, 5);
    helper_double_23_recursive(buf + 224, 5);
    for (int j = 0; j < 256; j += 256) {
      for (int k = 0; k < 32; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 11) {
    /* 2^11 = 2048 doubles: eight size-256 sub-transforms, radix-8 merge
       at stride 256. */
    helper_double_23_recursive(buf + 0, 8);
    helper_double_23_recursive(buf + 256, 8);
    helper_double_23_recursive(buf + 512, 8);
    helper_double_23_recursive(buf + 768, 8);
    helper_double_23_recursive(buf + 1024, 8);
    helper_double_23_recursive(buf + 1280, 8);
    helper_double_23_recursive(buf + 1536, 8);
    helper_double_23_recursive(buf + 1792, 8);
    for (int j = 0; j < 2048; j += 2048) {
      for (int k = 0; k < 256; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 14) {
    /* 2^14 = 16384 doubles: eight size-2048 sub-transforms, radix-8
       merge at stride 2048. */
    helper_double_23_recursive(buf + 0, 11);
    helper_double_23_recursive(buf + 2048, 11);
    helper_double_23_recursive(buf + 4096, 11);
    helper_double_23_recursive(buf + 6144, 11);
    helper_double_23_recursive(buf + 8192, 11);
    helper_double_23_recursive(buf + 10240, 11);
    helper_double_23_recursive(buf + 12288, 11);
    helper_double_23_recursive(buf + 14336, 11);
    for (int j = 0; j < 16384; j += 16384) {
      for (int k = 0; k < 2048; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 17) {
    /* 2^17 = 131072 doubles: eight size-16384 sub-transforms, radix-8
       merge at stride 16384. */
    helper_double_23_recursive(buf + 0, 14);
    helper_double_23_recursive(buf + 16384, 14);
    helper_double_23_recursive(buf + 32768, 14);
    helper_double_23_recursive(buf + 49152, 14);
    helper_double_23_recursive(buf + 65536, 14);
    helper_double_23_recursive(buf + 81920, 14);
    helper_double_23_recursive(buf + 98304, 14);
    helper_double_23_recursive(buf + 114688, 14);
    for (int j = 0; j < 131072; j += 131072) {
      for (int k = 0; k < 16384; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 20) {
    /* 2^20 = 1048576 doubles: eight size-131072 sub-transforms, radix-8
       merge at stride 131072. */
    helper_double_23_recursive(buf + 0, 17);
    helper_double_23_recursive(buf + 131072, 17);
    helper_double_23_recursive(buf + 262144, 17);
    helper_double_23_recursive(buf + 393216, 17);
    helper_double_23_recursive(buf + 524288, 17);
    helper_double_23_recursive(buf + 655360, 17);
    helper_double_23_recursive(buf + 786432, 17);
    helper_double_23_recursive(buf + 917504, 17);
    for (int j = 0; j < 1048576; j += 1048576) {
      for (int k = 0; k < 131072; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 23) {
    /* Top level, 2^23 = 8388608 doubles: eight size-1048576
       sub-transforms, radix-8 merge at stride 1048576. */
    helper_double_23_recursive(buf + 0, 20);
    helper_double_23_recursive(buf + 1048576, 20);
    helper_double_23_recursive(buf + 2097152, 20);
    helper_double_23_recursive(buf + 3145728, 20);
    helper_double_23_recursive(buf + 4194304, 20);
    helper_double_23_recursive(buf + 5242880, 20);
    helper_double_23_recursive(buf + 6291456, 20);
    helper_double_23_recursive(buf + 7340032, 20);
    for (int j = 0; j < 8388608; j += 8388608) {
      for (int k = 0; k < 1048576; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_double_23(double *buf);
/* In-place (unnormalized) Hadamard transform over 2^23 doubles.
 * Thin public entry point: hands the whole buffer to the recursive
 * worker, starting at the top-level depth of 23. */
void helper_double_23(double *buf) {
  helper_double_23_recursive(buf, 23);
}
21044 void helper_double_24_recursive(double *buf, int depth);
helper_double_24_recursive(double * buf,int depth)21045 void helper_double_24_recursive(double *buf, int depth) {
21046   if (depth == 13) {
21047     for (int j = 0; j < 8192; j += 16) {
21048       for (int k = 0; k < 2; k += 2) {
21049         __asm__ volatile (
21050           "movupd (%0), %%xmm0\n"
21051           "movupd (%1), %%xmm1\n"
21052           "movupd (%2), %%xmm2\n"
21053           "movupd (%3), %%xmm3\n"
21054           "movupd (%4), %%xmm4\n"
21055           "movupd (%5), %%xmm5\n"
21056           "movupd (%6), %%xmm6\n"
21057           "movupd (%7), %%xmm7\n"
21058           "movapd %%xmm0, %%xmm8\n"
21059           "haddpd %%xmm8, %%xmm8\n"
21060           "movapd %%xmm0, %%xmm9\n"
21061           "hsubpd %%xmm9, %%xmm9\n"
21062           "blendpd $1, %%xmm8, %%xmm9\n"
21063           "movapd %%xmm9, %%xmm0\n"
21064           "movapd %%xmm1, %%xmm8\n"
21065           "haddpd %%xmm8, %%xmm8\n"
21066           "movapd %%xmm1, %%xmm9\n"
21067           "hsubpd %%xmm9, %%xmm9\n"
21068           "blendpd $1, %%xmm8, %%xmm9\n"
21069           "movapd %%xmm9, %%xmm1\n"
21070           "movapd %%xmm2, %%xmm8\n"
21071           "haddpd %%xmm8, %%xmm8\n"
21072           "movapd %%xmm2, %%xmm9\n"
21073           "hsubpd %%xmm9, %%xmm9\n"
21074           "blendpd $1, %%xmm8, %%xmm9\n"
21075           "movapd %%xmm9, %%xmm2\n"
21076           "movapd %%xmm3, %%xmm8\n"
21077           "haddpd %%xmm8, %%xmm8\n"
21078           "movapd %%xmm3, %%xmm9\n"
21079           "hsubpd %%xmm9, %%xmm9\n"
21080           "blendpd $1, %%xmm8, %%xmm9\n"
21081           "movapd %%xmm9, %%xmm3\n"
21082           "movapd %%xmm4, %%xmm8\n"
21083           "haddpd %%xmm8, %%xmm8\n"
21084           "movapd %%xmm4, %%xmm9\n"
21085           "hsubpd %%xmm9, %%xmm9\n"
21086           "blendpd $1, %%xmm8, %%xmm9\n"
21087           "movapd %%xmm9, %%xmm4\n"
21088           "movapd %%xmm5, %%xmm8\n"
21089           "haddpd %%xmm8, %%xmm8\n"
21090           "movapd %%xmm5, %%xmm9\n"
21091           "hsubpd %%xmm9, %%xmm9\n"
21092           "blendpd $1, %%xmm8, %%xmm9\n"
21093           "movapd %%xmm9, %%xmm5\n"
21094           "movapd %%xmm6, %%xmm8\n"
21095           "haddpd %%xmm8, %%xmm8\n"
21096           "movapd %%xmm6, %%xmm9\n"
21097           "hsubpd %%xmm9, %%xmm9\n"
21098           "blendpd $1, %%xmm8, %%xmm9\n"
21099           "movapd %%xmm9, %%xmm6\n"
21100           "movapd %%xmm7, %%xmm8\n"
21101           "haddpd %%xmm8, %%xmm8\n"
21102           "movapd %%xmm7, %%xmm9\n"
21103           "hsubpd %%xmm9, %%xmm9\n"
21104           "blendpd $1, %%xmm8, %%xmm9\n"
21105           "movapd %%xmm9, %%xmm7\n"
21106           "movapd %%xmm0, %%xmm8\n"
21107           "movapd %%xmm0, %%xmm9\n"
21108           "addpd %%xmm1, %%xmm8\n"
21109           "subpd %%xmm1, %%xmm9\n"
21110           "movapd %%xmm2, %%xmm10\n"
21111           "movapd %%xmm2, %%xmm11\n"
21112           "addpd %%xmm3, %%xmm10\n"
21113           "subpd %%xmm3, %%xmm11\n"
21114           "movapd %%xmm4, %%xmm12\n"
21115           "movapd %%xmm4, %%xmm13\n"
21116           "addpd %%xmm5, %%xmm12\n"
21117           "subpd %%xmm5, %%xmm13\n"
21118           "movapd %%xmm6, %%xmm14\n"
21119           "movapd %%xmm6, %%xmm15\n"
21120           "addpd %%xmm7, %%xmm14\n"
21121           "subpd %%xmm7, %%xmm15\n"
21122           "movapd %%xmm8, %%xmm0\n"
21123           "movapd %%xmm8, %%xmm2\n"
21124           "addpd %%xmm10, %%xmm0\n"
21125           "subpd %%xmm10, %%xmm2\n"
21126           "movapd %%xmm9, %%xmm1\n"
21127           "movapd %%xmm9, %%xmm3\n"
21128           "addpd %%xmm11, %%xmm1\n"
21129           "subpd %%xmm11, %%xmm3\n"
21130           "movapd %%xmm12, %%xmm4\n"
21131           "movapd %%xmm12, %%xmm6\n"
21132           "addpd %%xmm14, %%xmm4\n"
21133           "subpd %%xmm14, %%xmm6\n"
21134           "movapd %%xmm13, %%xmm5\n"
21135           "movapd %%xmm13, %%xmm7\n"
21136           "addpd %%xmm15, %%xmm5\n"
21137           "subpd %%xmm15, %%xmm7\n"
21138           "movapd %%xmm0, %%xmm8\n"
21139           "movapd %%xmm0, %%xmm12\n"
21140           "addpd %%xmm4, %%xmm8\n"
21141           "subpd %%xmm4, %%xmm12\n"
21142           "movapd %%xmm1, %%xmm9\n"
21143           "movapd %%xmm1, %%xmm13\n"
21144           "addpd %%xmm5, %%xmm9\n"
21145           "subpd %%xmm5, %%xmm13\n"
21146           "movapd %%xmm2, %%xmm10\n"
21147           "movapd %%xmm2, %%xmm14\n"
21148           "addpd %%xmm6, %%xmm10\n"
21149           "subpd %%xmm6, %%xmm14\n"
21150           "movapd %%xmm3, %%xmm11\n"
21151           "movapd %%xmm3, %%xmm15\n"
21152           "addpd %%xmm7, %%xmm11\n"
21153           "subpd %%xmm7, %%xmm15\n"
21154           "movupd %%xmm8, (%0)\n"
21155           "movupd %%xmm9, (%1)\n"
21156           "movupd %%xmm10, (%2)\n"
21157           "movupd %%xmm11, (%3)\n"
21158           "movupd %%xmm12, (%4)\n"
21159           "movupd %%xmm13, (%5)\n"
21160           "movupd %%xmm14, (%6)\n"
21161           "movupd %%xmm15, (%7)\n"
21162           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
21163         );
21164       }
21165     }
21166     for (int j = 0; j < 8192; j += 128) {
21167       for (int k = 0; k < 16; k += 2) {
21168         __asm__ volatile (
21169           "movupd (%0), %%xmm0\n"
21170           "movupd (%1), %%xmm1\n"
21171           "movupd (%2), %%xmm2\n"
21172           "movupd (%3), %%xmm3\n"
21173           "movupd (%4), %%xmm4\n"
21174           "movupd (%5), %%xmm5\n"
21175           "movupd (%6), %%xmm6\n"
21176           "movupd (%7), %%xmm7\n"
21177           "movapd %%xmm0, %%xmm8\n"
21178           "movapd %%xmm0, %%xmm9\n"
21179           "addpd %%xmm1, %%xmm8\n"
21180           "subpd %%xmm1, %%xmm9\n"
21181           "movapd %%xmm2, %%xmm10\n"
21182           "movapd %%xmm2, %%xmm11\n"
21183           "addpd %%xmm3, %%xmm10\n"
21184           "subpd %%xmm3, %%xmm11\n"
21185           "movapd %%xmm4, %%xmm12\n"
21186           "movapd %%xmm4, %%xmm13\n"
21187           "addpd %%xmm5, %%xmm12\n"
21188           "subpd %%xmm5, %%xmm13\n"
21189           "movapd %%xmm6, %%xmm14\n"
21190           "movapd %%xmm6, %%xmm15\n"
21191           "addpd %%xmm7, %%xmm14\n"
21192           "subpd %%xmm7, %%xmm15\n"
21193           "movapd %%xmm8, %%xmm0\n"
21194           "movapd %%xmm8, %%xmm2\n"
21195           "addpd %%xmm10, %%xmm0\n"
21196           "subpd %%xmm10, %%xmm2\n"
21197           "movapd %%xmm9, %%xmm1\n"
21198           "movapd %%xmm9, %%xmm3\n"
21199           "addpd %%xmm11, %%xmm1\n"
21200           "subpd %%xmm11, %%xmm3\n"
21201           "movapd %%xmm12, %%xmm4\n"
21202           "movapd %%xmm12, %%xmm6\n"
21203           "addpd %%xmm14, %%xmm4\n"
21204           "subpd %%xmm14, %%xmm6\n"
21205           "movapd %%xmm13, %%xmm5\n"
21206           "movapd %%xmm13, %%xmm7\n"
21207           "addpd %%xmm15, %%xmm5\n"
21208           "subpd %%xmm15, %%xmm7\n"
21209           "movapd %%xmm0, %%xmm8\n"
21210           "movapd %%xmm0, %%xmm12\n"
21211           "addpd %%xmm4, %%xmm8\n"
21212           "subpd %%xmm4, %%xmm12\n"
21213           "movapd %%xmm1, %%xmm9\n"
21214           "movapd %%xmm1, %%xmm13\n"
21215           "addpd %%xmm5, %%xmm9\n"
21216           "subpd %%xmm5, %%xmm13\n"
21217           "movapd %%xmm2, %%xmm10\n"
21218           "movapd %%xmm2, %%xmm14\n"
21219           "addpd %%xmm6, %%xmm10\n"
21220           "subpd %%xmm6, %%xmm14\n"
21221           "movapd %%xmm3, %%xmm11\n"
21222           "movapd %%xmm3, %%xmm15\n"
21223           "addpd %%xmm7, %%xmm11\n"
21224           "subpd %%xmm7, %%xmm15\n"
21225           "movupd %%xmm8, (%0)\n"
21226           "movupd %%xmm9, (%1)\n"
21227           "movupd %%xmm10, (%2)\n"
21228           "movupd %%xmm11, (%3)\n"
21229           "movupd %%xmm12, (%4)\n"
21230           "movupd %%xmm13, (%5)\n"
21231           "movupd %%xmm14, (%6)\n"
21232           "movupd %%xmm15, (%7)\n"
21233           :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
21234         );
21235       }
21236     }
21237     for (int j = 0; j < 8192; j += 1024) {
21238       for (int k = 0; k < 128; k += 2) {
21239         __asm__ volatile (
21240           "movupd (%0), %%xmm0\n"
21241           "movupd (%1), %%xmm1\n"
21242           "movupd (%2), %%xmm2\n"
21243           "movupd (%3), %%xmm3\n"
21244           "movupd (%4), %%xmm4\n"
21245           "movupd (%5), %%xmm5\n"
21246           "movupd (%6), %%xmm6\n"
21247           "movupd (%7), %%xmm7\n"
21248           "movapd %%xmm0, %%xmm8\n"
21249           "movapd %%xmm0, %%xmm9\n"
21250           "addpd %%xmm1, %%xmm8\n"
21251           "subpd %%xmm1, %%xmm9\n"
21252           "movapd %%xmm2, %%xmm10\n"
21253           "movapd %%xmm2, %%xmm11\n"
21254           "addpd %%xmm3, %%xmm10\n"
21255           "subpd %%xmm3, %%xmm11\n"
21256           "movapd %%xmm4, %%xmm12\n"
21257           "movapd %%xmm4, %%xmm13\n"
21258           "addpd %%xmm5, %%xmm12\n"
21259           "subpd %%xmm5, %%xmm13\n"
21260           "movapd %%xmm6, %%xmm14\n"
21261           "movapd %%xmm6, %%xmm15\n"
21262           "addpd %%xmm7, %%xmm14\n"
21263           "subpd %%xmm7, %%xmm15\n"
21264           "movapd %%xmm8, %%xmm0\n"
21265           "movapd %%xmm8, %%xmm2\n"
21266           "addpd %%xmm10, %%xmm0\n"
21267           "subpd %%xmm10, %%xmm2\n"
21268           "movapd %%xmm9, %%xmm1\n"
21269           "movapd %%xmm9, %%xmm3\n"
21270           "addpd %%xmm11, %%xmm1\n"
21271           "subpd %%xmm11, %%xmm3\n"
21272           "movapd %%xmm12, %%xmm4\n"
21273           "movapd %%xmm12, %%xmm6\n"
21274           "addpd %%xmm14, %%xmm4\n"
21275           "subpd %%xmm14, %%xmm6\n"
21276           "movapd %%xmm13, %%xmm5\n"
21277           "movapd %%xmm13, %%xmm7\n"
21278           "addpd %%xmm15, %%xmm5\n"
21279           "subpd %%xmm15, %%xmm7\n"
21280           "movapd %%xmm0, %%xmm8\n"
21281           "movapd %%xmm0, %%xmm12\n"
21282           "addpd %%xmm4, %%xmm8\n"
21283           "subpd %%xmm4, %%xmm12\n"
21284           "movapd %%xmm1, %%xmm9\n"
21285           "movapd %%xmm1, %%xmm13\n"
21286           "addpd %%xmm5, %%xmm9\n"
21287           "subpd %%xmm5, %%xmm13\n"
21288           "movapd %%xmm2, %%xmm10\n"
21289           "movapd %%xmm2, %%xmm14\n"
21290           "addpd %%xmm6, %%xmm10\n"
21291           "subpd %%xmm6, %%xmm14\n"
21292           "movapd %%xmm3, %%xmm11\n"
21293           "movapd %%xmm3, %%xmm15\n"
21294           "addpd %%xmm7, %%xmm11\n"
21295           "subpd %%xmm7, %%xmm15\n"
21296           "movupd %%xmm8, (%0)\n"
21297           "movupd %%xmm9, (%1)\n"
21298           "movupd %%xmm10, (%2)\n"
21299           "movupd %%xmm11, (%3)\n"
21300           "movupd %%xmm12, (%4)\n"
21301           "movupd %%xmm13, (%5)\n"
21302           "movupd %%xmm14, (%6)\n"
21303           "movupd %%xmm15, (%7)\n"
21304           :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
21305         );
21306       }
21307     }
21308     for (int j = 0; j < 8192; j += 8192) {
21309       for (int k = 0; k < 1024; k += 2) {
21310         __asm__ volatile (
21311           "movupd (%0), %%xmm0\n"
21312           "movupd (%1), %%xmm1\n"
21313           "movupd (%2), %%xmm2\n"
21314           "movupd (%3), %%xmm3\n"
21315           "movupd (%4), %%xmm4\n"
21316           "movupd (%5), %%xmm5\n"
21317           "movupd (%6), %%xmm6\n"
21318           "movupd (%7), %%xmm7\n"
21319           "movapd %%xmm0, %%xmm8\n"
21320           "movapd %%xmm0, %%xmm9\n"
21321           "addpd %%xmm1, %%xmm8\n"
21322           "subpd %%xmm1, %%xmm9\n"
21323           "movapd %%xmm2, %%xmm10\n"
21324           "movapd %%xmm2, %%xmm11\n"
21325           "addpd %%xmm3, %%xmm10\n"
21326           "subpd %%xmm3, %%xmm11\n"
21327           "movapd %%xmm4, %%xmm12\n"
21328           "movapd %%xmm4, %%xmm13\n"
21329           "addpd %%xmm5, %%xmm12\n"
21330           "subpd %%xmm5, %%xmm13\n"
21331           "movapd %%xmm6, %%xmm14\n"
21332           "movapd %%xmm6, %%xmm15\n"
21333           "addpd %%xmm7, %%xmm14\n"
21334           "subpd %%xmm7, %%xmm15\n"
21335           "movapd %%xmm8, %%xmm0\n"
21336           "movapd %%xmm8, %%xmm2\n"
21337           "addpd %%xmm10, %%xmm0\n"
21338           "subpd %%xmm10, %%xmm2\n"
21339           "movapd %%xmm9, %%xmm1\n"
21340           "movapd %%xmm9, %%xmm3\n"
21341           "addpd %%xmm11, %%xmm1\n"
21342           "subpd %%xmm11, %%xmm3\n"
21343           "movapd %%xmm12, %%xmm4\n"
21344           "movapd %%xmm12, %%xmm6\n"
21345           "addpd %%xmm14, %%xmm4\n"
21346           "subpd %%xmm14, %%xmm6\n"
21347           "movapd %%xmm13, %%xmm5\n"
21348           "movapd %%xmm13, %%xmm7\n"
21349           "addpd %%xmm15, %%xmm5\n"
21350           "subpd %%xmm15, %%xmm7\n"
21351           "movapd %%xmm0, %%xmm8\n"
21352           "movapd %%xmm0, %%xmm12\n"
21353           "addpd %%xmm4, %%xmm8\n"
21354           "subpd %%xmm4, %%xmm12\n"
21355           "movapd %%xmm1, %%xmm9\n"
21356           "movapd %%xmm1, %%xmm13\n"
21357           "addpd %%xmm5, %%xmm9\n"
21358           "subpd %%xmm5, %%xmm13\n"
21359           "movapd %%xmm2, %%xmm10\n"
21360           "movapd %%xmm2, %%xmm14\n"
21361           "addpd %%xmm6, %%xmm10\n"
21362           "subpd %%xmm6, %%xmm14\n"
21363           "movapd %%xmm3, %%xmm11\n"
21364           "movapd %%xmm3, %%xmm15\n"
21365           "addpd %%xmm7, %%xmm11\n"
21366           "subpd %%xmm7, %%xmm15\n"
21367           "movupd %%xmm8, (%0)\n"
21368           "movupd %%xmm9, (%1)\n"
21369           "movupd %%xmm10, (%2)\n"
21370           "movupd %%xmm11, (%3)\n"
21371           "movupd %%xmm12, (%4)\n"
21372           "movupd %%xmm13, (%5)\n"
21373           "movupd %%xmm14, (%6)\n"
21374           "movupd %%xmm15, (%7)\n"
21375           :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
21376         );
21377       }
21378     }
21379     return;
21380   }
21381   if (depth == 16) {
21382     helper_double_24_recursive(buf + 0, 13);
21383     helper_double_24_recursive(buf + 8192, 13);
21384     helper_double_24_recursive(buf + 16384, 13);
21385     helper_double_24_recursive(buf + 24576, 13);
21386     helper_double_24_recursive(buf + 32768, 13);
21387     helper_double_24_recursive(buf + 40960, 13);
21388     helper_double_24_recursive(buf + 49152, 13);
21389     helper_double_24_recursive(buf + 57344, 13);
21390     for (int j = 0; j < 65536; j += 65536) {
21391       for (int k = 0; k < 8192; k += 2) {
21392         __asm__ volatile (
21393           "movupd (%0), %%xmm0\n"
21394           "movupd (%1), %%xmm1\n"
21395           "movupd (%2), %%xmm2\n"
21396           "movupd (%3), %%xmm3\n"
21397           "movupd (%4), %%xmm4\n"
21398           "movupd (%5), %%xmm5\n"
21399           "movupd (%6), %%xmm6\n"
21400           "movupd (%7), %%xmm7\n"
21401           "movapd %%xmm0, %%xmm8\n"
21402           "movapd %%xmm0, %%xmm9\n"
21403           "addpd %%xmm1, %%xmm8\n"
21404           "subpd %%xmm1, %%xmm9\n"
21405           "movapd %%xmm2, %%xmm10\n"
21406           "movapd %%xmm2, %%xmm11\n"
21407           "addpd %%xmm3, %%xmm10\n"
21408           "subpd %%xmm3, %%xmm11\n"
21409           "movapd %%xmm4, %%xmm12\n"
21410           "movapd %%xmm4, %%xmm13\n"
21411           "addpd %%xmm5, %%xmm12\n"
21412           "subpd %%xmm5, %%xmm13\n"
21413           "movapd %%xmm6, %%xmm14\n"
21414           "movapd %%xmm6, %%xmm15\n"
21415           "addpd %%xmm7, %%xmm14\n"
21416           "subpd %%xmm7, %%xmm15\n"
21417           "movapd %%xmm8, %%xmm0\n"
21418           "movapd %%xmm8, %%xmm2\n"
21419           "addpd %%xmm10, %%xmm0\n"
21420           "subpd %%xmm10, %%xmm2\n"
21421           "movapd %%xmm9, %%xmm1\n"
21422           "movapd %%xmm9, %%xmm3\n"
21423           "addpd %%xmm11, %%xmm1\n"
21424           "subpd %%xmm11, %%xmm3\n"
21425           "movapd %%xmm12, %%xmm4\n"
21426           "movapd %%xmm12, %%xmm6\n"
21427           "addpd %%xmm14, %%xmm4\n"
21428           "subpd %%xmm14, %%xmm6\n"
21429           "movapd %%xmm13, %%xmm5\n"
21430           "movapd %%xmm13, %%xmm7\n"
21431           "addpd %%xmm15, %%xmm5\n"
21432           "subpd %%xmm15, %%xmm7\n"
21433           "movapd %%xmm0, %%xmm8\n"
21434           "movapd %%xmm0, %%xmm12\n"
21435           "addpd %%xmm4, %%xmm8\n"
21436           "subpd %%xmm4, %%xmm12\n"
21437           "movapd %%xmm1, %%xmm9\n"
21438           "movapd %%xmm1, %%xmm13\n"
21439           "addpd %%xmm5, %%xmm9\n"
21440           "subpd %%xmm5, %%xmm13\n"
21441           "movapd %%xmm2, %%xmm10\n"
21442           "movapd %%xmm2, %%xmm14\n"
21443           "addpd %%xmm6, %%xmm10\n"
21444           "subpd %%xmm6, %%xmm14\n"
21445           "movapd %%xmm3, %%xmm11\n"
21446           "movapd %%xmm3, %%xmm15\n"
21447           "addpd %%xmm7, %%xmm11\n"
21448           "subpd %%xmm7, %%xmm15\n"
21449           "movupd %%xmm8, (%0)\n"
21450           "movupd %%xmm9, (%1)\n"
21451           "movupd %%xmm10, (%2)\n"
21452           "movupd %%xmm11, (%3)\n"
21453           "movupd %%xmm12, (%4)\n"
21454           "movupd %%xmm13, (%5)\n"
21455           "movupd %%xmm14, (%6)\n"
21456           "movupd %%xmm15, (%7)\n"
21457           :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
21458         );
21459       }
21460     }
21461     return;
21462   }
21463   if (depth == 19) {
21464     helper_double_24_recursive(buf + 0, 16);
21465     helper_double_24_recursive(buf + 65536, 16);
21466     helper_double_24_recursive(buf + 131072, 16);
21467     helper_double_24_recursive(buf + 196608, 16);
21468     helper_double_24_recursive(buf + 262144, 16);
21469     helper_double_24_recursive(buf + 327680, 16);
21470     helper_double_24_recursive(buf + 393216, 16);
21471     helper_double_24_recursive(buf + 458752, 16);
21472     for (int j = 0; j < 524288; j += 524288) {
21473       for (int k = 0; k < 65536; k += 2) {
21474         __asm__ volatile (
21475           "movupd (%0), %%xmm0\n"
21476           "movupd (%1), %%xmm1\n"
21477           "movupd (%2), %%xmm2\n"
21478           "movupd (%3), %%xmm3\n"
21479           "movupd (%4), %%xmm4\n"
21480           "movupd (%5), %%xmm5\n"
21481           "movupd (%6), %%xmm6\n"
21482           "movupd (%7), %%xmm7\n"
21483           "movapd %%xmm0, %%xmm8\n"
21484           "movapd %%xmm0, %%xmm9\n"
21485           "addpd %%xmm1, %%xmm8\n"
21486           "subpd %%xmm1, %%xmm9\n"
21487           "movapd %%xmm2, %%xmm10\n"
21488           "movapd %%xmm2, %%xmm11\n"
21489           "addpd %%xmm3, %%xmm10\n"
21490           "subpd %%xmm3, %%xmm11\n"
21491           "movapd %%xmm4, %%xmm12\n"
21492           "movapd %%xmm4, %%xmm13\n"
21493           "addpd %%xmm5, %%xmm12\n"
21494           "subpd %%xmm5, %%xmm13\n"
21495           "movapd %%xmm6, %%xmm14\n"
21496           "movapd %%xmm6, %%xmm15\n"
21497           "addpd %%xmm7, %%xmm14\n"
21498           "subpd %%xmm7, %%xmm15\n"
21499           "movapd %%xmm8, %%xmm0\n"
21500           "movapd %%xmm8, %%xmm2\n"
21501           "addpd %%xmm10, %%xmm0\n"
21502           "subpd %%xmm10, %%xmm2\n"
21503           "movapd %%xmm9, %%xmm1\n"
21504           "movapd %%xmm9, %%xmm3\n"
21505           "addpd %%xmm11, %%xmm1\n"
21506           "subpd %%xmm11, %%xmm3\n"
21507           "movapd %%xmm12, %%xmm4\n"
21508           "movapd %%xmm12, %%xmm6\n"
21509           "addpd %%xmm14, %%xmm4\n"
21510           "subpd %%xmm14, %%xmm6\n"
21511           "movapd %%xmm13, %%xmm5\n"
21512           "movapd %%xmm13, %%xmm7\n"
21513           "addpd %%xmm15, %%xmm5\n"
21514           "subpd %%xmm15, %%xmm7\n"
21515           "movapd %%xmm0, %%xmm8\n"
21516           "movapd %%xmm0, %%xmm12\n"
21517           "addpd %%xmm4, %%xmm8\n"
21518           "subpd %%xmm4, %%xmm12\n"
21519           "movapd %%xmm1, %%xmm9\n"
21520           "movapd %%xmm1, %%xmm13\n"
21521           "addpd %%xmm5, %%xmm9\n"
21522           "subpd %%xmm5, %%xmm13\n"
21523           "movapd %%xmm2, %%xmm10\n"
21524           "movapd %%xmm2, %%xmm14\n"
21525           "addpd %%xmm6, %%xmm10\n"
21526           "subpd %%xmm6, %%xmm14\n"
21527           "movapd %%xmm3, %%xmm11\n"
21528           "movapd %%xmm3, %%xmm15\n"
21529           "addpd %%xmm7, %%xmm11\n"
21530           "subpd %%xmm7, %%xmm15\n"
21531           "movupd %%xmm8, (%0)\n"
21532           "movupd %%xmm9, (%1)\n"
21533           "movupd %%xmm10, (%2)\n"
21534           "movupd %%xmm11, (%3)\n"
21535           "movupd %%xmm12, (%4)\n"
21536           "movupd %%xmm13, (%5)\n"
21537           "movupd %%xmm14, (%6)\n"
21538           "movupd %%xmm15, (%7)\n"
21539           :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
21540         );
21541       }
21542     }
21543     return;
21544   }
21545   if (depth == 22) {
21546     helper_double_24_recursive(buf + 0, 19);
21547     helper_double_24_recursive(buf + 524288, 19);
21548     helper_double_24_recursive(buf + 1048576, 19);
21549     helper_double_24_recursive(buf + 1572864, 19);
21550     helper_double_24_recursive(buf + 2097152, 19);
21551     helper_double_24_recursive(buf + 2621440, 19);
21552     helper_double_24_recursive(buf + 3145728, 19);
21553     helper_double_24_recursive(buf + 3670016, 19);
21554     for (int j = 0; j < 4194304; j += 4194304) {
21555       for (int k = 0; k < 524288; k += 2) {
21556         __asm__ volatile (
21557           "movupd (%0), %%xmm0\n"
21558           "movupd (%1), %%xmm1\n"
21559           "movupd (%2), %%xmm2\n"
21560           "movupd (%3), %%xmm3\n"
21561           "movupd (%4), %%xmm4\n"
21562           "movupd (%5), %%xmm5\n"
21563           "movupd (%6), %%xmm6\n"
21564           "movupd (%7), %%xmm7\n"
21565           "movapd %%xmm0, %%xmm8\n"
21566           "movapd %%xmm0, %%xmm9\n"
21567           "addpd %%xmm1, %%xmm8\n"
21568           "subpd %%xmm1, %%xmm9\n"
21569           "movapd %%xmm2, %%xmm10\n"
21570           "movapd %%xmm2, %%xmm11\n"
21571           "addpd %%xmm3, %%xmm10\n"
21572           "subpd %%xmm3, %%xmm11\n"
21573           "movapd %%xmm4, %%xmm12\n"
21574           "movapd %%xmm4, %%xmm13\n"
21575           "addpd %%xmm5, %%xmm12\n"
21576           "subpd %%xmm5, %%xmm13\n"
21577           "movapd %%xmm6, %%xmm14\n"
21578           "movapd %%xmm6, %%xmm15\n"
21579           "addpd %%xmm7, %%xmm14\n"
21580           "subpd %%xmm7, %%xmm15\n"
21581           "movapd %%xmm8, %%xmm0\n"
21582           "movapd %%xmm8, %%xmm2\n"
21583           "addpd %%xmm10, %%xmm0\n"
21584           "subpd %%xmm10, %%xmm2\n"
21585           "movapd %%xmm9, %%xmm1\n"
21586           "movapd %%xmm9, %%xmm3\n"
21587           "addpd %%xmm11, %%xmm1\n"
21588           "subpd %%xmm11, %%xmm3\n"
21589           "movapd %%xmm12, %%xmm4\n"
21590           "movapd %%xmm12, %%xmm6\n"
21591           "addpd %%xmm14, %%xmm4\n"
21592           "subpd %%xmm14, %%xmm6\n"
21593           "movapd %%xmm13, %%xmm5\n"
21594           "movapd %%xmm13, %%xmm7\n"
21595           "addpd %%xmm15, %%xmm5\n"
21596           "subpd %%xmm15, %%xmm7\n"
21597           "movapd %%xmm0, %%xmm8\n"
21598           "movapd %%xmm0, %%xmm12\n"
21599           "addpd %%xmm4, %%xmm8\n"
21600           "subpd %%xmm4, %%xmm12\n"
21601           "movapd %%xmm1, %%xmm9\n"
21602           "movapd %%xmm1, %%xmm13\n"
21603           "addpd %%xmm5, %%xmm9\n"
21604           "subpd %%xmm5, %%xmm13\n"
21605           "movapd %%xmm2, %%xmm10\n"
21606           "movapd %%xmm2, %%xmm14\n"
21607           "addpd %%xmm6, %%xmm10\n"
21608           "subpd %%xmm6, %%xmm14\n"
21609           "movapd %%xmm3, %%xmm11\n"
21610           "movapd %%xmm3, %%xmm15\n"
21611           "addpd %%xmm7, %%xmm11\n"
21612           "subpd %%xmm7, %%xmm15\n"
21613           "movupd %%xmm8, (%0)\n"
21614           "movupd %%xmm9, (%1)\n"
21615           "movupd %%xmm10, (%2)\n"
21616           "movupd %%xmm11, (%3)\n"
21617           "movupd %%xmm12, (%4)\n"
21618           "movupd %%xmm13, (%5)\n"
21619           "movupd %%xmm14, (%6)\n"
21620           "movupd %%xmm15, (%7)\n"
21621           :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864), "r"(buf + j + k + 2097152), "r"(buf + j + k + 2621440), "r"(buf + j + k + 3145728), "r"(buf + j + k + 3670016) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
21622         );
21623       }
21624     }
21625     return;
21626   }
21627   if (depth == 24) {
21628     helper_double_24_recursive(buf + 0, 22);
21629     helper_double_24_recursive(buf + 4194304, 22);
21630     helper_double_24_recursive(buf + 8388608, 22);
21631     helper_double_24_recursive(buf + 12582912, 22);
21632     for (int j = 0; j < 16777216; j += 16777216) {
21633       for (int k = 0; k < 4194304; k += 2) {
21634         __asm__ volatile (
21635           "movupd (%0), %%xmm0\n"
21636           "movupd (%1), %%xmm1\n"
21637           "movupd (%2), %%xmm2\n"
21638           "movupd (%3), %%xmm3\n"
21639           "movapd %%xmm0, %%xmm8\n"
21640           "movapd %%xmm0, %%xmm9\n"
21641           "addpd %%xmm1, %%xmm8\n"
21642           "subpd %%xmm1, %%xmm9\n"
21643           "movapd %%xmm2, %%xmm10\n"
21644           "movapd %%xmm2, %%xmm11\n"
21645           "addpd %%xmm3, %%xmm10\n"
21646           "subpd %%xmm3, %%xmm11\n"
21647           "movapd %%xmm8, %%xmm0\n"
21648           "movapd %%xmm8, %%xmm2\n"
21649           "addpd %%xmm10, %%xmm0\n"
21650           "subpd %%xmm10, %%xmm2\n"
21651           "movapd %%xmm9, %%xmm1\n"
21652           "movapd %%xmm9, %%xmm3\n"
21653           "addpd %%xmm11, %%xmm1\n"
21654           "subpd %%xmm11, %%xmm3\n"
21655           "movupd %%xmm0, (%0)\n"
21656           "movupd %%xmm1, (%1)\n"
21657           "movupd %%xmm2, (%2)\n"
21658           "movupd %%xmm3, (%3)\n"
21659           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4194304), "r"(buf + j + k + 8388608), "r"(buf + j + k + 12582912) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
21660         );
21661       }
21662     }
21663     return;
21664   }
21665 }
void helper_double_24(double *buf);
/* Entry point for the in-place Hadamard transform over a buffer of
 * 2^24 doubles: simply hands the buffer to the recursive SSE kernel
 * at its maximum supported depth. */
void helper_double_24(double *buf) {
  /* log2 of the transform length this helper is specialized for. */
  const int full_depth = 24;
  helper_double_24_recursive(buf, full_depth);
}
21670 void helper_double_25_recursive(double *buf, int depth);
helper_double_25_recursive(double * buf,int depth)21671 void helper_double_25_recursive(double *buf, int depth) {
21672   if (depth == 10) {
21673     for (int j = 0; j < 1024; j += 16) {
21674       for (int k = 0; k < 2; k += 2) {
21675         __asm__ volatile (
21676           "movupd (%0), %%xmm0\n"
21677           "movupd (%1), %%xmm1\n"
21678           "movupd (%2), %%xmm2\n"
21679           "movupd (%3), %%xmm3\n"
21680           "movupd (%4), %%xmm4\n"
21681           "movupd (%5), %%xmm5\n"
21682           "movupd (%6), %%xmm6\n"
21683           "movupd (%7), %%xmm7\n"
21684           "movapd %%xmm0, %%xmm8\n"
21685           "haddpd %%xmm8, %%xmm8\n"
21686           "movapd %%xmm0, %%xmm9\n"
21687           "hsubpd %%xmm9, %%xmm9\n"
21688           "blendpd $1, %%xmm8, %%xmm9\n"
21689           "movapd %%xmm9, %%xmm0\n"
21690           "movapd %%xmm1, %%xmm8\n"
21691           "haddpd %%xmm8, %%xmm8\n"
21692           "movapd %%xmm1, %%xmm9\n"
21693           "hsubpd %%xmm9, %%xmm9\n"
21694           "blendpd $1, %%xmm8, %%xmm9\n"
21695           "movapd %%xmm9, %%xmm1\n"
21696           "movapd %%xmm2, %%xmm8\n"
21697           "haddpd %%xmm8, %%xmm8\n"
21698           "movapd %%xmm2, %%xmm9\n"
21699           "hsubpd %%xmm9, %%xmm9\n"
21700           "blendpd $1, %%xmm8, %%xmm9\n"
21701           "movapd %%xmm9, %%xmm2\n"
21702           "movapd %%xmm3, %%xmm8\n"
21703           "haddpd %%xmm8, %%xmm8\n"
21704           "movapd %%xmm3, %%xmm9\n"
21705           "hsubpd %%xmm9, %%xmm9\n"
21706           "blendpd $1, %%xmm8, %%xmm9\n"
21707           "movapd %%xmm9, %%xmm3\n"
21708           "movapd %%xmm4, %%xmm8\n"
21709           "haddpd %%xmm8, %%xmm8\n"
21710           "movapd %%xmm4, %%xmm9\n"
21711           "hsubpd %%xmm9, %%xmm9\n"
21712           "blendpd $1, %%xmm8, %%xmm9\n"
21713           "movapd %%xmm9, %%xmm4\n"
21714           "movapd %%xmm5, %%xmm8\n"
21715           "haddpd %%xmm8, %%xmm8\n"
21716           "movapd %%xmm5, %%xmm9\n"
21717           "hsubpd %%xmm9, %%xmm9\n"
21718           "blendpd $1, %%xmm8, %%xmm9\n"
21719           "movapd %%xmm9, %%xmm5\n"
21720           "movapd %%xmm6, %%xmm8\n"
21721           "haddpd %%xmm8, %%xmm8\n"
21722           "movapd %%xmm6, %%xmm9\n"
21723           "hsubpd %%xmm9, %%xmm9\n"
21724           "blendpd $1, %%xmm8, %%xmm9\n"
21725           "movapd %%xmm9, %%xmm6\n"
21726           "movapd %%xmm7, %%xmm8\n"
21727           "haddpd %%xmm8, %%xmm8\n"
21728           "movapd %%xmm7, %%xmm9\n"
21729           "hsubpd %%xmm9, %%xmm9\n"
21730           "blendpd $1, %%xmm8, %%xmm9\n"
21731           "movapd %%xmm9, %%xmm7\n"
21732           "movapd %%xmm0, %%xmm8\n"
21733           "movapd %%xmm0, %%xmm9\n"
21734           "addpd %%xmm1, %%xmm8\n"
21735           "subpd %%xmm1, %%xmm9\n"
21736           "movapd %%xmm2, %%xmm10\n"
21737           "movapd %%xmm2, %%xmm11\n"
21738           "addpd %%xmm3, %%xmm10\n"
21739           "subpd %%xmm3, %%xmm11\n"
21740           "movapd %%xmm4, %%xmm12\n"
21741           "movapd %%xmm4, %%xmm13\n"
21742           "addpd %%xmm5, %%xmm12\n"
21743           "subpd %%xmm5, %%xmm13\n"
21744           "movapd %%xmm6, %%xmm14\n"
21745           "movapd %%xmm6, %%xmm15\n"
21746           "addpd %%xmm7, %%xmm14\n"
21747           "subpd %%xmm7, %%xmm15\n"
21748           "movapd %%xmm8, %%xmm0\n"
21749           "movapd %%xmm8, %%xmm2\n"
21750           "addpd %%xmm10, %%xmm0\n"
21751           "subpd %%xmm10, %%xmm2\n"
21752           "movapd %%xmm9, %%xmm1\n"
21753           "movapd %%xmm9, %%xmm3\n"
21754           "addpd %%xmm11, %%xmm1\n"
21755           "subpd %%xmm11, %%xmm3\n"
21756           "movapd %%xmm12, %%xmm4\n"
21757           "movapd %%xmm12, %%xmm6\n"
21758           "addpd %%xmm14, %%xmm4\n"
21759           "subpd %%xmm14, %%xmm6\n"
21760           "movapd %%xmm13, %%xmm5\n"
21761           "movapd %%xmm13, %%xmm7\n"
21762           "addpd %%xmm15, %%xmm5\n"
21763           "subpd %%xmm15, %%xmm7\n"
21764           "movapd %%xmm0, %%xmm8\n"
21765           "movapd %%xmm0, %%xmm12\n"
21766           "addpd %%xmm4, %%xmm8\n"
21767           "subpd %%xmm4, %%xmm12\n"
21768           "movapd %%xmm1, %%xmm9\n"
21769           "movapd %%xmm1, %%xmm13\n"
21770           "addpd %%xmm5, %%xmm9\n"
21771           "subpd %%xmm5, %%xmm13\n"
21772           "movapd %%xmm2, %%xmm10\n"
21773           "movapd %%xmm2, %%xmm14\n"
21774           "addpd %%xmm6, %%xmm10\n"
21775           "subpd %%xmm6, %%xmm14\n"
21776           "movapd %%xmm3, %%xmm11\n"
21777           "movapd %%xmm3, %%xmm15\n"
21778           "addpd %%xmm7, %%xmm11\n"
21779           "subpd %%xmm7, %%xmm15\n"
21780           "movupd %%xmm8, (%0)\n"
21781           "movupd %%xmm9, (%1)\n"
21782           "movupd %%xmm10, (%2)\n"
21783           "movupd %%xmm11, (%3)\n"
21784           "movupd %%xmm12, (%4)\n"
21785           "movupd %%xmm13, (%5)\n"
21786           "movupd %%xmm14, (%6)\n"
21787           "movupd %%xmm15, (%7)\n"
21788           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
21789         );
21790       }
21791     }
21792     for (int j = 0; j < 1024; j += 128) {
21793       for (int k = 0; k < 16; k += 2) {
21794         __asm__ volatile (
21795           "movupd (%0), %%xmm0\n"
21796           "movupd (%1), %%xmm1\n"
21797           "movupd (%2), %%xmm2\n"
21798           "movupd (%3), %%xmm3\n"
21799           "movupd (%4), %%xmm4\n"
21800           "movupd (%5), %%xmm5\n"
21801           "movupd (%6), %%xmm6\n"
21802           "movupd (%7), %%xmm7\n"
21803           "movapd %%xmm0, %%xmm8\n"
21804           "movapd %%xmm0, %%xmm9\n"
21805           "addpd %%xmm1, %%xmm8\n"
21806           "subpd %%xmm1, %%xmm9\n"
21807           "movapd %%xmm2, %%xmm10\n"
21808           "movapd %%xmm2, %%xmm11\n"
21809           "addpd %%xmm3, %%xmm10\n"
21810           "subpd %%xmm3, %%xmm11\n"
21811           "movapd %%xmm4, %%xmm12\n"
21812           "movapd %%xmm4, %%xmm13\n"
21813           "addpd %%xmm5, %%xmm12\n"
21814           "subpd %%xmm5, %%xmm13\n"
21815           "movapd %%xmm6, %%xmm14\n"
21816           "movapd %%xmm6, %%xmm15\n"
21817           "addpd %%xmm7, %%xmm14\n"
21818           "subpd %%xmm7, %%xmm15\n"
21819           "movapd %%xmm8, %%xmm0\n"
21820           "movapd %%xmm8, %%xmm2\n"
21821           "addpd %%xmm10, %%xmm0\n"
21822           "subpd %%xmm10, %%xmm2\n"
21823           "movapd %%xmm9, %%xmm1\n"
21824           "movapd %%xmm9, %%xmm3\n"
21825           "addpd %%xmm11, %%xmm1\n"
21826           "subpd %%xmm11, %%xmm3\n"
21827           "movapd %%xmm12, %%xmm4\n"
21828           "movapd %%xmm12, %%xmm6\n"
21829           "addpd %%xmm14, %%xmm4\n"
21830           "subpd %%xmm14, %%xmm6\n"
21831           "movapd %%xmm13, %%xmm5\n"
21832           "movapd %%xmm13, %%xmm7\n"
21833           "addpd %%xmm15, %%xmm5\n"
21834           "subpd %%xmm15, %%xmm7\n"
21835           "movapd %%xmm0, %%xmm8\n"
21836           "movapd %%xmm0, %%xmm12\n"
21837           "addpd %%xmm4, %%xmm8\n"
21838           "subpd %%xmm4, %%xmm12\n"
21839           "movapd %%xmm1, %%xmm9\n"
21840           "movapd %%xmm1, %%xmm13\n"
21841           "addpd %%xmm5, %%xmm9\n"
21842           "subpd %%xmm5, %%xmm13\n"
21843           "movapd %%xmm2, %%xmm10\n"
21844           "movapd %%xmm2, %%xmm14\n"
21845           "addpd %%xmm6, %%xmm10\n"
21846           "subpd %%xmm6, %%xmm14\n"
21847           "movapd %%xmm3, %%xmm11\n"
21848           "movapd %%xmm3, %%xmm15\n"
21849           "addpd %%xmm7, %%xmm11\n"
21850           "subpd %%xmm7, %%xmm15\n"
21851           "movupd %%xmm8, (%0)\n"
21852           "movupd %%xmm9, (%1)\n"
21853           "movupd %%xmm10, (%2)\n"
21854           "movupd %%xmm11, (%3)\n"
21855           "movupd %%xmm12, (%4)\n"
21856           "movupd %%xmm13, (%5)\n"
21857           "movupd %%xmm14, (%6)\n"
21858           "movupd %%xmm15, (%7)\n"
21859           :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
21860         );
21861       }
21862     }
21863     for (int j = 0; j < 1024; j += 1024) {
21864       for (int k = 0; k < 128; k += 2) {
21865         __asm__ volatile (
21866           "movupd (%0), %%xmm0\n"
21867           "movupd (%1), %%xmm1\n"
21868           "movupd (%2), %%xmm2\n"
21869           "movupd (%3), %%xmm3\n"
21870           "movupd (%4), %%xmm4\n"
21871           "movupd (%5), %%xmm5\n"
21872           "movupd (%6), %%xmm6\n"
21873           "movupd (%7), %%xmm7\n"
21874           "movapd %%xmm0, %%xmm8\n"
21875           "movapd %%xmm0, %%xmm9\n"
21876           "addpd %%xmm1, %%xmm8\n"
21877           "subpd %%xmm1, %%xmm9\n"
21878           "movapd %%xmm2, %%xmm10\n"
21879           "movapd %%xmm2, %%xmm11\n"
21880           "addpd %%xmm3, %%xmm10\n"
21881           "subpd %%xmm3, %%xmm11\n"
21882           "movapd %%xmm4, %%xmm12\n"
21883           "movapd %%xmm4, %%xmm13\n"
21884           "addpd %%xmm5, %%xmm12\n"
21885           "subpd %%xmm5, %%xmm13\n"
21886           "movapd %%xmm6, %%xmm14\n"
21887           "movapd %%xmm6, %%xmm15\n"
21888           "addpd %%xmm7, %%xmm14\n"
21889           "subpd %%xmm7, %%xmm15\n"
21890           "movapd %%xmm8, %%xmm0\n"
21891           "movapd %%xmm8, %%xmm2\n"
21892           "addpd %%xmm10, %%xmm0\n"
21893           "subpd %%xmm10, %%xmm2\n"
21894           "movapd %%xmm9, %%xmm1\n"
21895           "movapd %%xmm9, %%xmm3\n"
21896           "addpd %%xmm11, %%xmm1\n"
21897           "subpd %%xmm11, %%xmm3\n"
21898           "movapd %%xmm12, %%xmm4\n"
21899           "movapd %%xmm12, %%xmm6\n"
21900           "addpd %%xmm14, %%xmm4\n"
21901           "subpd %%xmm14, %%xmm6\n"
21902           "movapd %%xmm13, %%xmm5\n"
21903           "movapd %%xmm13, %%xmm7\n"
21904           "addpd %%xmm15, %%xmm5\n"
21905           "subpd %%xmm15, %%xmm7\n"
21906           "movapd %%xmm0, %%xmm8\n"
21907           "movapd %%xmm0, %%xmm12\n"
21908           "addpd %%xmm4, %%xmm8\n"
21909           "subpd %%xmm4, %%xmm12\n"
21910           "movapd %%xmm1, %%xmm9\n"
21911           "movapd %%xmm1, %%xmm13\n"
21912           "addpd %%xmm5, %%xmm9\n"
21913           "subpd %%xmm5, %%xmm13\n"
21914           "movapd %%xmm2, %%xmm10\n"
21915           "movapd %%xmm2, %%xmm14\n"
21916           "addpd %%xmm6, %%xmm10\n"
21917           "subpd %%xmm6, %%xmm14\n"
21918           "movapd %%xmm3, %%xmm11\n"
21919           "movapd %%xmm3, %%xmm15\n"
21920           "addpd %%xmm7, %%xmm11\n"
21921           "subpd %%xmm7, %%xmm15\n"
21922           "movupd %%xmm8, (%0)\n"
21923           "movupd %%xmm9, (%1)\n"
21924           "movupd %%xmm10, (%2)\n"
21925           "movupd %%xmm11, (%3)\n"
21926           "movupd %%xmm12, (%4)\n"
21927           "movupd %%xmm13, (%5)\n"
21928           "movupd %%xmm14, (%6)\n"
21929           "movupd %%xmm15, (%7)\n"
21930           :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
21931         );
21932       }
21933     }
21934     return;
21935   }
21936   if (depth == 13) {
21937     helper_double_25_recursive(buf + 0, 10);
21938     helper_double_25_recursive(buf + 1024, 10);
21939     helper_double_25_recursive(buf + 2048, 10);
21940     helper_double_25_recursive(buf + 3072, 10);
21941     helper_double_25_recursive(buf + 4096, 10);
21942     helper_double_25_recursive(buf + 5120, 10);
21943     helper_double_25_recursive(buf + 6144, 10);
21944     helper_double_25_recursive(buf + 7168, 10);
21945     for (int j = 0; j < 8192; j += 8192) {
21946       for (int k = 0; k < 1024; k += 2) {
21947         __asm__ volatile (
21948           "movupd (%0), %%xmm0\n"
21949           "movupd (%1), %%xmm1\n"
21950           "movupd (%2), %%xmm2\n"
21951           "movupd (%3), %%xmm3\n"
21952           "movupd (%4), %%xmm4\n"
21953           "movupd (%5), %%xmm5\n"
21954           "movupd (%6), %%xmm6\n"
21955           "movupd (%7), %%xmm7\n"
21956           "movapd %%xmm0, %%xmm8\n"
21957           "movapd %%xmm0, %%xmm9\n"
21958           "addpd %%xmm1, %%xmm8\n"
21959           "subpd %%xmm1, %%xmm9\n"
21960           "movapd %%xmm2, %%xmm10\n"
21961           "movapd %%xmm2, %%xmm11\n"
21962           "addpd %%xmm3, %%xmm10\n"
21963           "subpd %%xmm3, %%xmm11\n"
21964           "movapd %%xmm4, %%xmm12\n"
21965           "movapd %%xmm4, %%xmm13\n"
21966           "addpd %%xmm5, %%xmm12\n"
21967           "subpd %%xmm5, %%xmm13\n"
21968           "movapd %%xmm6, %%xmm14\n"
21969           "movapd %%xmm6, %%xmm15\n"
21970           "addpd %%xmm7, %%xmm14\n"
21971           "subpd %%xmm7, %%xmm15\n"
21972           "movapd %%xmm8, %%xmm0\n"
21973           "movapd %%xmm8, %%xmm2\n"
21974           "addpd %%xmm10, %%xmm0\n"
21975           "subpd %%xmm10, %%xmm2\n"
21976           "movapd %%xmm9, %%xmm1\n"
21977           "movapd %%xmm9, %%xmm3\n"
21978           "addpd %%xmm11, %%xmm1\n"
21979           "subpd %%xmm11, %%xmm3\n"
21980           "movapd %%xmm12, %%xmm4\n"
21981           "movapd %%xmm12, %%xmm6\n"
21982           "addpd %%xmm14, %%xmm4\n"
21983           "subpd %%xmm14, %%xmm6\n"
21984           "movapd %%xmm13, %%xmm5\n"
21985           "movapd %%xmm13, %%xmm7\n"
21986           "addpd %%xmm15, %%xmm5\n"
21987           "subpd %%xmm15, %%xmm7\n"
21988           "movapd %%xmm0, %%xmm8\n"
21989           "movapd %%xmm0, %%xmm12\n"
21990           "addpd %%xmm4, %%xmm8\n"
21991           "subpd %%xmm4, %%xmm12\n"
21992           "movapd %%xmm1, %%xmm9\n"
21993           "movapd %%xmm1, %%xmm13\n"
21994           "addpd %%xmm5, %%xmm9\n"
21995           "subpd %%xmm5, %%xmm13\n"
21996           "movapd %%xmm2, %%xmm10\n"
21997           "movapd %%xmm2, %%xmm14\n"
21998           "addpd %%xmm6, %%xmm10\n"
21999           "subpd %%xmm6, %%xmm14\n"
22000           "movapd %%xmm3, %%xmm11\n"
22001           "movapd %%xmm3, %%xmm15\n"
22002           "addpd %%xmm7, %%xmm11\n"
22003           "subpd %%xmm7, %%xmm15\n"
22004           "movupd %%xmm8, (%0)\n"
22005           "movupd %%xmm9, (%1)\n"
22006           "movupd %%xmm10, (%2)\n"
22007           "movupd %%xmm11, (%3)\n"
22008           "movupd %%xmm12, (%4)\n"
22009           "movupd %%xmm13, (%5)\n"
22010           "movupd %%xmm14, (%6)\n"
22011           "movupd %%xmm15, (%7)\n"
22012           :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
22013         );
22014       }
22015     }
22016     return;
22017   }
22018   if (depth == 16) {
22019     helper_double_25_recursive(buf + 0, 13);
22020     helper_double_25_recursive(buf + 8192, 13);
22021     helper_double_25_recursive(buf + 16384, 13);
22022     helper_double_25_recursive(buf + 24576, 13);
22023     helper_double_25_recursive(buf + 32768, 13);
22024     helper_double_25_recursive(buf + 40960, 13);
22025     helper_double_25_recursive(buf + 49152, 13);
22026     helper_double_25_recursive(buf + 57344, 13);
22027     for (int j = 0; j < 65536; j += 65536) {
22028       for (int k = 0; k < 8192; k += 2) {
22029         __asm__ volatile (
22030           "movupd (%0), %%xmm0\n"
22031           "movupd (%1), %%xmm1\n"
22032           "movupd (%2), %%xmm2\n"
22033           "movupd (%3), %%xmm3\n"
22034           "movupd (%4), %%xmm4\n"
22035           "movupd (%5), %%xmm5\n"
22036           "movupd (%6), %%xmm6\n"
22037           "movupd (%7), %%xmm7\n"
22038           "movapd %%xmm0, %%xmm8\n"
22039           "movapd %%xmm0, %%xmm9\n"
22040           "addpd %%xmm1, %%xmm8\n"
22041           "subpd %%xmm1, %%xmm9\n"
22042           "movapd %%xmm2, %%xmm10\n"
22043           "movapd %%xmm2, %%xmm11\n"
22044           "addpd %%xmm3, %%xmm10\n"
22045           "subpd %%xmm3, %%xmm11\n"
22046           "movapd %%xmm4, %%xmm12\n"
22047           "movapd %%xmm4, %%xmm13\n"
22048           "addpd %%xmm5, %%xmm12\n"
22049           "subpd %%xmm5, %%xmm13\n"
22050           "movapd %%xmm6, %%xmm14\n"
22051           "movapd %%xmm6, %%xmm15\n"
22052           "addpd %%xmm7, %%xmm14\n"
22053           "subpd %%xmm7, %%xmm15\n"
22054           "movapd %%xmm8, %%xmm0\n"
22055           "movapd %%xmm8, %%xmm2\n"
22056           "addpd %%xmm10, %%xmm0\n"
22057           "subpd %%xmm10, %%xmm2\n"
22058           "movapd %%xmm9, %%xmm1\n"
22059           "movapd %%xmm9, %%xmm3\n"
22060           "addpd %%xmm11, %%xmm1\n"
22061           "subpd %%xmm11, %%xmm3\n"
22062           "movapd %%xmm12, %%xmm4\n"
22063           "movapd %%xmm12, %%xmm6\n"
22064           "addpd %%xmm14, %%xmm4\n"
22065           "subpd %%xmm14, %%xmm6\n"
22066           "movapd %%xmm13, %%xmm5\n"
22067           "movapd %%xmm13, %%xmm7\n"
22068           "addpd %%xmm15, %%xmm5\n"
22069           "subpd %%xmm15, %%xmm7\n"
22070           "movapd %%xmm0, %%xmm8\n"
22071           "movapd %%xmm0, %%xmm12\n"
22072           "addpd %%xmm4, %%xmm8\n"
22073           "subpd %%xmm4, %%xmm12\n"
22074           "movapd %%xmm1, %%xmm9\n"
22075           "movapd %%xmm1, %%xmm13\n"
22076           "addpd %%xmm5, %%xmm9\n"
22077           "subpd %%xmm5, %%xmm13\n"
22078           "movapd %%xmm2, %%xmm10\n"
22079           "movapd %%xmm2, %%xmm14\n"
22080           "addpd %%xmm6, %%xmm10\n"
22081           "subpd %%xmm6, %%xmm14\n"
22082           "movapd %%xmm3, %%xmm11\n"
22083           "movapd %%xmm3, %%xmm15\n"
22084           "addpd %%xmm7, %%xmm11\n"
22085           "subpd %%xmm7, %%xmm15\n"
22086           "movupd %%xmm8, (%0)\n"
22087           "movupd %%xmm9, (%1)\n"
22088           "movupd %%xmm10, (%2)\n"
22089           "movupd %%xmm11, (%3)\n"
22090           "movupd %%xmm12, (%4)\n"
22091           "movupd %%xmm13, (%5)\n"
22092           "movupd %%xmm14, (%6)\n"
22093           "movupd %%xmm15, (%7)\n"
22094           :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
22095         );
22096       }
22097     }
22098     return;
22099   }
22100   if (depth == 19) {
22101     helper_double_25_recursive(buf + 0, 16);
22102     helper_double_25_recursive(buf + 65536, 16);
22103     helper_double_25_recursive(buf + 131072, 16);
22104     helper_double_25_recursive(buf + 196608, 16);
22105     helper_double_25_recursive(buf + 262144, 16);
22106     helper_double_25_recursive(buf + 327680, 16);
22107     helper_double_25_recursive(buf + 393216, 16);
22108     helper_double_25_recursive(buf + 458752, 16);
22109     for (int j = 0; j < 524288; j += 524288) {
22110       for (int k = 0; k < 65536; k += 2) {
22111         __asm__ volatile (
22112           "movupd (%0), %%xmm0\n"
22113           "movupd (%1), %%xmm1\n"
22114           "movupd (%2), %%xmm2\n"
22115           "movupd (%3), %%xmm3\n"
22116           "movupd (%4), %%xmm4\n"
22117           "movupd (%5), %%xmm5\n"
22118           "movupd (%6), %%xmm6\n"
22119           "movupd (%7), %%xmm7\n"
22120           "movapd %%xmm0, %%xmm8\n"
22121           "movapd %%xmm0, %%xmm9\n"
22122           "addpd %%xmm1, %%xmm8\n"
22123           "subpd %%xmm1, %%xmm9\n"
22124           "movapd %%xmm2, %%xmm10\n"
22125           "movapd %%xmm2, %%xmm11\n"
22126           "addpd %%xmm3, %%xmm10\n"
22127           "subpd %%xmm3, %%xmm11\n"
22128           "movapd %%xmm4, %%xmm12\n"
22129           "movapd %%xmm4, %%xmm13\n"
22130           "addpd %%xmm5, %%xmm12\n"
22131           "subpd %%xmm5, %%xmm13\n"
22132           "movapd %%xmm6, %%xmm14\n"
22133           "movapd %%xmm6, %%xmm15\n"
22134           "addpd %%xmm7, %%xmm14\n"
22135           "subpd %%xmm7, %%xmm15\n"
22136           "movapd %%xmm8, %%xmm0\n"
22137           "movapd %%xmm8, %%xmm2\n"
22138           "addpd %%xmm10, %%xmm0\n"
22139           "subpd %%xmm10, %%xmm2\n"
22140           "movapd %%xmm9, %%xmm1\n"
22141           "movapd %%xmm9, %%xmm3\n"
22142           "addpd %%xmm11, %%xmm1\n"
22143           "subpd %%xmm11, %%xmm3\n"
22144           "movapd %%xmm12, %%xmm4\n"
22145           "movapd %%xmm12, %%xmm6\n"
22146           "addpd %%xmm14, %%xmm4\n"
22147           "subpd %%xmm14, %%xmm6\n"
22148           "movapd %%xmm13, %%xmm5\n"
22149           "movapd %%xmm13, %%xmm7\n"
22150           "addpd %%xmm15, %%xmm5\n"
22151           "subpd %%xmm15, %%xmm7\n"
22152           "movapd %%xmm0, %%xmm8\n"
22153           "movapd %%xmm0, %%xmm12\n"
22154           "addpd %%xmm4, %%xmm8\n"
22155           "subpd %%xmm4, %%xmm12\n"
22156           "movapd %%xmm1, %%xmm9\n"
22157           "movapd %%xmm1, %%xmm13\n"
22158           "addpd %%xmm5, %%xmm9\n"
22159           "subpd %%xmm5, %%xmm13\n"
22160           "movapd %%xmm2, %%xmm10\n"
22161           "movapd %%xmm2, %%xmm14\n"
22162           "addpd %%xmm6, %%xmm10\n"
22163           "subpd %%xmm6, %%xmm14\n"
22164           "movapd %%xmm3, %%xmm11\n"
22165           "movapd %%xmm3, %%xmm15\n"
22166           "addpd %%xmm7, %%xmm11\n"
22167           "subpd %%xmm7, %%xmm15\n"
22168           "movupd %%xmm8, (%0)\n"
22169           "movupd %%xmm9, (%1)\n"
22170           "movupd %%xmm10, (%2)\n"
22171           "movupd %%xmm11, (%3)\n"
22172           "movupd %%xmm12, (%4)\n"
22173           "movupd %%xmm13, (%5)\n"
22174           "movupd %%xmm14, (%6)\n"
22175           "movupd %%xmm15, (%7)\n"
22176           :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
22177         );
22178       }
22179     }
22180     return;
22181   }
22182   if (depth == 22) {
22183     helper_double_25_recursive(buf + 0, 19);
22184     helper_double_25_recursive(buf + 524288, 19);
22185     helper_double_25_recursive(buf + 1048576, 19);
22186     helper_double_25_recursive(buf + 1572864, 19);
22187     helper_double_25_recursive(buf + 2097152, 19);
22188     helper_double_25_recursive(buf + 2621440, 19);
22189     helper_double_25_recursive(buf + 3145728, 19);
22190     helper_double_25_recursive(buf + 3670016, 19);
22191     for (int j = 0; j < 4194304; j += 4194304) {
22192       for (int k = 0; k < 524288; k += 2) {
22193         __asm__ volatile (
22194           "movupd (%0), %%xmm0\n"
22195           "movupd (%1), %%xmm1\n"
22196           "movupd (%2), %%xmm2\n"
22197           "movupd (%3), %%xmm3\n"
22198           "movupd (%4), %%xmm4\n"
22199           "movupd (%5), %%xmm5\n"
22200           "movupd (%6), %%xmm6\n"
22201           "movupd (%7), %%xmm7\n"
22202           "movapd %%xmm0, %%xmm8\n"
22203           "movapd %%xmm0, %%xmm9\n"
22204           "addpd %%xmm1, %%xmm8\n"
22205           "subpd %%xmm1, %%xmm9\n"
22206           "movapd %%xmm2, %%xmm10\n"
22207           "movapd %%xmm2, %%xmm11\n"
22208           "addpd %%xmm3, %%xmm10\n"
22209           "subpd %%xmm3, %%xmm11\n"
22210           "movapd %%xmm4, %%xmm12\n"
22211           "movapd %%xmm4, %%xmm13\n"
22212           "addpd %%xmm5, %%xmm12\n"
22213           "subpd %%xmm5, %%xmm13\n"
22214           "movapd %%xmm6, %%xmm14\n"
22215           "movapd %%xmm6, %%xmm15\n"
22216           "addpd %%xmm7, %%xmm14\n"
22217           "subpd %%xmm7, %%xmm15\n"
22218           "movapd %%xmm8, %%xmm0\n"
22219           "movapd %%xmm8, %%xmm2\n"
22220           "addpd %%xmm10, %%xmm0\n"
22221           "subpd %%xmm10, %%xmm2\n"
22222           "movapd %%xmm9, %%xmm1\n"
22223           "movapd %%xmm9, %%xmm3\n"
22224           "addpd %%xmm11, %%xmm1\n"
22225           "subpd %%xmm11, %%xmm3\n"
22226           "movapd %%xmm12, %%xmm4\n"
22227           "movapd %%xmm12, %%xmm6\n"
22228           "addpd %%xmm14, %%xmm4\n"
22229           "subpd %%xmm14, %%xmm6\n"
22230           "movapd %%xmm13, %%xmm5\n"
22231           "movapd %%xmm13, %%xmm7\n"
22232           "addpd %%xmm15, %%xmm5\n"
22233           "subpd %%xmm15, %%xmm7\n"
22234           "movapd %%xmm0, %%xmm8\n"
22235           "movapd %%xmm0, %%xmm12\n"
22236           "addpd %%xmm4, %%xmm8\n"
22237           "subpd %%xmm4, %%xmm12\n"
22238           "movapd %%xmm1, %%xmm9\n"
22239           "movapd %%xmm1, %%xmm13\n"
22240           "addpd %%xmm5, %%xmm9\n"
22241           "subpd %%xmm5, %%xmm13\n"
22242           "movapd %%xmm2, %%xmm10\n"
22243           "movapd %%xmm2, %%xmm14\n"
22244           "addpd %%xmm6, %%xmm10\n"
22245           "subpd %%xmm6, %%xmm14\n"
22246           "movapd %%xmm3, %%xmm11\n"
22247           "movapd %%xmm3, %%xmm15\n"
22248           "addpd %%xmm7, %%xmm11\n"
22249           "subpd %%xmm7, %%xmm15\n"
22250           "movupd %%xmm8, (%0)\n"
22251           "movupd %%xmm9, (%1)\n"
22252           "movupd %%xmm10, (%2)\n"
22253           "movupd %%xmm11, (%3)\n"
22254           "movupd %%xmm12, (%4)\n"
22255           "movupd %%xmm13, (%5)\n"
22256           "movupd %%xmm14, (%6)\n"
22257           "movupd %%xmm15, (%7)\n"
22258           :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864), "r"(buf + j + k + 2097152), "r"(buf + j + k + 2621440), "r"(buf + j + k + 3145728), "r"(buf + j + k + 3670016) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
22259         );
22260       }
22261     }
22262     return;
22263   }
22264   if (depth == 25) {
22265     helper_double_25_recursive(buf + 0, 22);
22266     helper_double_25_recursive(buf + 4194304, 22);
22267     helper_double_25_recursive(buf + 8388608, 22);
22268     helper_double_25_recursive(buf + 12582912, 22);
22269     helper_double_25_recursive(buf + 16777216, 22);
22270     helper_double_25_recursive(buf + 20971520, 22);
22271     helper_double_25_recursive(buf + 25165824, 22);
22272     helper_double_25_recursive(buf + 29360128, 22);
22273     for (int j = 0; j < 33554432; j += 33554432) {
22274       for (int k = 0; k < 4194304; k += 2) {
22275         __asm__ volatile (
22276           "movupd (%0), %%xmm0\n"
22277           "movupd (%1), %%xmm1\n"
22278           "movupd (%2), %%xmm2\n"
22279           "movupd (%3), %%xmm3\n"
22280           "movupd (%4), %%xmm4\n"
22281           "movupd (%5), %%xmm5\n"
22282           "movupd (%6), %%xmm6\n"
22283           "movupd (%7), %%xmm7\n"
22284           "movapd %%xmm0, %%xmm8\n"
22285           "movapd %%xmm0, %%xmm9\n"
22286           "addpd %%xmm1, %%xmm8\n"
22287           "subpd %%xmm1, %%xmm9\n"
22288           "movapd %%xmm2, %%xmm10\n"
22289           "movapd %%xmm2, %%xmm11\n"
22290           "addpd %%xmm3, %%xmm10\n"
22291           "subpd %%xmm3, %%xmm11\n"
22292           "movapd %%xmm4, %%xmm12\n"
22293           "movapd %%xmm4, %%xmm13\n"
22294           "addpd %%xmm5, %%xmm12\n"
22295           "subpd %%xmm5, %%xmm13\n"
22296           "movapd %%xmm6, %%xmm14\n"
22297           "movapd %%xmm6, %%xmm15\n"
22298           "addpd %%xmm7, %%xmm14\n"
22299           "subpd %%xmm7, %%xmm15\n"
22300           "movapd %%xmm8, %%xmm0\n"
22301           "movapd %%xmm8, %%xmm2\n"
22302           "addpd %%xmm10, %%xmm0\n"
22303           "subpd %%xmm10, %%xmm2\n"
22304           "movapd %%xmm9, %%xmm1\n"
22305           "movapd %%xmm9, %%xmm3\n"
22306           "addpd %%xmm11, %%xmm1\n"
22307           "subpd %%xmm11, %%xmm3\n"
22308           "movapd %%xmm12, %%xmm4\n"
22309           "movapd %%xmm12, %%xmm6\n"
22310           "addpd %%xmm14, %%xmm4\n"
22311           "subpd %%xmm14, %%xmm6\n"
22312           "movapd %%xmm13, %%xmm5\n"
22313           "movapd %%xmm13, %%xmm7\n"
22314           "addpd %%xmm15, %%xmm5\n"
22315           "subpd %%xmm15, %%xmm7\n"
22316           "movapd %%xmm0, %%xmm8\n"
22317           "movapd %%xmm0, %%xmm12\n"
22318           "addpd %%xmm4, %%xmm8\n"
22319           "subpd %%xmm4, %%xmm12\n"
22320           "movapd %%xmm1, %%xmm9\n"
22321           "movapd %%xmm1, %%xmm13\n"
22322           "addpd %%xmm5, %%xmm9\n"
22323           "subpd %%xmm5, %%xmm13\n"
22324           "movapd %%xmm2, %%xmm10\n"
22325           "movapd %%xmm2, %%xmm14\n"
22326           "addpd %%xmm6, %%xmm10\n"
22327           "subpd %%xmm6, %%xmm14\n"
22328           "movapd %%xmm3, %%xmm11\n"
22329           "movapd %%xmm3, %%xmm15\n"
22330           "addpd %%xmm7, %%xmm11\n"
22331           "subpd %%xmm7, %%xmm15\n"
22332           "movupd %%xmm8, (%0)\n"
22333           "movupd %%xmm9, (%1)\n"
22334           "movupd %%xmm10, (%2)\n"
22335           "movupd %%xmm11, (%3)\n"
22336           "movupd %%xmm12, (%4)\n"
22337           "movupd %%xmm13, (%5)\n"
22338           "movupd %%xmm14, (%6)\n"
22339           "movupd %%xmm15, (%7)\n"
22340           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4194304), "r"(buf + j + k + 8388608), "r"(buf + j + k + 12582912), "r"(buf + j + k + 16777216), "r"(buf + j + k + 20971520), "r"(buf + j + k + 25165824), "r"(buf + j + k + 29360128) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
22341         );
22342       }
22343     }
22344     return;
22345   }
22346 }
void helper_double_25(double *buf);
/* Entry point for the 2^25-point double-precision Hadamard transform.
 * buf must hold 2^25 doubles; the work is done in place by the recursive
 * driver, invoked here at its top-level depth. */
void helper_double_25(double *buf) {
  const int top_depth = 25; /* log2 of the transform length */
  helper_double_25_recursive(buf, top_depth);
}
22351 void helper_double_26_recursive(double *buf, int depth);
helper_double_26_recursive(double * buf,int depth)22352 void helper_double_26_recursive(double *buf, int depth) {
22353   if (depth == 5) {
22354     for (int j = 0; j < 32; j += 16) {
22355       for (int k = 0; k < 2; k += 2) {
22356         __asm__ volatile (
22357           "movupd (%0), %%xmm0\n"
22358           "movupd (%1), %%xmm1\n"
22359           "movupd (%2), %%xmm2\n"
22360           "movupd (%3), %%xmm3\n"
22361           "movupd (%4), %%xmm4\n"
22362           "movupd (%5), %%xmm5\n"
22363           "movupd (%6), %%xmm6\n"
22364           "movupd (%7), %%xmm7\n"
22365           "movapd %%xmm0, %%xmm8\n"
22366           "haddpd %%xmm8, %%xmm8\n"
22367           "movapd %%xmm0, %%xmm9\n"
22368           "hsubpd %%xmm9, %%xmm9\n"
22369           "blendpd $1, %%xmm8, %%xmm9\n"
22370           "movapd %%xmm9, %%xmm0\n"
22371           "movapd %%xmm1, %%xmm8\n"
22372           "haddpd %%xmm8, %%xmm8\n"
22373           "movapd %%xmm1, %%xmm9\n"
22374           "hsubpd %%xmm9, %%xmm9\n"
22375           "blendpd $1, %%xmm8, %%xmm9\n"
22376           "movapd %%xmm9, %%xmm1\n"
22377           "movapd %%xmm2, %%xmm8\n"
22378           "haddpd %%xmm8, %%xmm8\n"
22379           "movapd %%xmm2, %%xmm9\n"
22380           "hsubpd %%xmm9, %%xmm9\n"
22381           "blendpd $1, %%xmm8, %%xmm9\n"
22382           "movapd %%xmm9, %%xmm2\n"
22383           "movapd %%xmm3, %%xmm8\n"
22384           "haddpd %%xmm8, %%xmm8\n"
22385           "movapd %%xmm3, %%xmm9\n"
22386           "hsubpd %%xmm9, %%xmm9\n"
22387           "blendpd $1, %%xmm8, %%xmm9\n"
22388           "movapd %%xmm9, %%xmm3\n"
22389           "movapd %%xmm4, %%xmm8\n"
22390           "haddpd %%xmm8, %%xmm8\n"
22391           "movapd %%xmm4, %%xmm9\n"
22392           "hsubpd %%xmm9, %%xmm9\n"
22393           "blendpd $1, %%xmm8, %%xmm9\n"
22394           "movapd %%xmm9, %%xmm4\n"
22395           "movapd %%xmm5, %%xmm8\n"
22396           "haddpd %%xmm8, %%xmm8\n"
22397           "movapd %%xmm5, %%xmm9\n"
22398           "hsubpd %%xmm9, %%xmm9\n"
22399           "blendpd $1, %%xmm8, %%xmm9\n"
22400           "movapd %%xmm9, %%xmm5\n"
22401           "movapd %%xmm6, %%xmm8\n"
22402           "haddpd %%xmm8, %%xmm8\n"
22403           "movapd %%xmm6, %%xmm9\n"
22404           "hsubpd %%xmm9, %%xmm9\n"
22405           "blendpd $1, %%xmm8, %%xmm9\n"
22406           "movapd %%xmm9, %%xmm6\n"
22407           "movapd %%xmm7, %%xmm8\n"
22408           "haddpd %%xmm8, %%xmm8\n"
22409           "movapd %%xmm7, %%xmm9\n"
22410           "hsubpd %%xmm9, %%xmm9\n"
22411           "blendpd $1, %%xmm8, %%xmm9\n"
22412           "movapd %%xmm9, %%xmm7\n"
22413           "movapd %%xmm0, %%xmm8\n"
22414           "movapd %%xmm0, %%xmm9\n"
22415           "addpd %%xmm1, %%xmm8\n"
22416           "subpd %%xmm1, %%xmm9\n"
22417           "movapd %%xmm2, %%xmm10\n"
22418           "movapd %%xmm2, %%xmm11\n"
22419           "addpd %%xmm3, %%xmm10\n"
22420           "subpd %%xmm3, %%xmm11\n"
22421           "movapd %%xmm4, %%xmm12\n"
22422           "movapd %%xmm4, %%xmm13\n"
22423           "addpd %%xmm5, %%xmm12\n"
22424           "subpd %%xmm5, %%xmm13\n"
22425           "movapd %%xmm6, %%xmm14\n"
22426           "movapd %%xmm6, %%xmm15\n"
22427           "addpd %%xmm7, %%xmm14\n"
22428           "subpd %%xmm7, %%xmm15\n"
22429           "movapd %%xmm8, %%xmm0\n"
22430           "movapd %%xmm8, %%xmm2\n"
22431           "addpd %%xmm10, %%xmm0\n"
22432           "subpd %%xmm10, %%xmm2\n"
22433           "movapd %%xmm9, %%xmm1\n"
22434           "movapd %%xmm9, %%xmm3\n"
22435           "addpd %%xmm11, %%xmm1\n"
22436           "subpd %%xmm11, %%xmm3\n"
22437           "movapd %%xmm12, %%xmm4\n"
22438           "movapd %%xmm12, %%xmm6\n"
22439           "addpd %%xmm14, %%xmm4\n"
22440           "subpd %%xmm14, %%xmm6\n"
22441           "movapd %%xmm13, %%xmm5\n"
22442           "movapd %%xmm13, %%xmm7\n"
22443           "addpd %%xmm15, %%xmm5\n"
22444           "subpd %%xmm15, %%xmm7\n"
22445           "movapd %%xmm0, %%xmm8\n"
22446           "movapd %%xmm0, %%xmm12\n"
22447           "addpd %%xmm4, %%xmm8\n"
22448           "subpd %%xmm4, %%xmm12\n"
22449           "movapd %%xmm1, %%xmm9\n"
22450           "movapd %%xmm1, %%xmm13\n"
22451           "addpd %%xmm5, %%xmm9\n"
22452           "subpd %%xmm5, %%xmm13\n"
22453           "movapd %%xmm2, %%xmm10\n"
22454           "movapd %%xmm2, %%xmm14\n"
22455           "addpd %%xmm6, %%xmm10\n"
22456           "subpd %%xmm6, %%xmm14\n"
22457           "movapd %%xmm3, %%xmm11\n"
22458           "movapd %%xmm3, %%xmm15\n"
22459           "addpd %%xmm7, %%xmm11\n"
22460           "subpd %%xmm7, %%xmm15\n"
22461           "movupd %%xmm8, (%0)\n"
22462           "movupd %%xmm9, (%1)\n"
22463           "movupd %%xmm10, (%2)\n"
22464           "movupd %%xmm11, (%3)\n"
22465           "movupd %%xmm12, (%4)\n"
22466           "movupd %%xmm13, (%5)\n"
22467           "movupd %%xmm14, (%6)\n"
22468           "movupd %%xmm15, (%7)\n"
22469           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
22470         );
22471       }
22472     }
22473     for (int j = 0; j < 32; j += 32) {
22474       for (int k = 0; k < 16; k += 2) {
22475         __asm__ volatile (
22476           "movupd (%0), %%xmm0\n"
22477           "movupd (%1), %%xmm1\n"
22478           "movapd %%xmm0, %%xmm8\n"
22479           "movapd %%xmm0, %%xmm9\n"
22480           "addpd %%xmm1, %%xmm8\n"
22481           "subpd %%xmm1, %%xmm9\n"
22482           "movupd %%xmm8, (%0)\n"
22483           "movupd %%xmm9, (%1)\n"
22484           :: "r"(buf + j + k + 0), "r"(buf + j + k + 16) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
22485         );
22486       }
22487     }
22488     return;
22489   }
22490   if (depth == 8) {
22491     helper_double_26_recursive(buf + 0, 5);
22492     helper_double_26_recursive(buf + 32, 5);
22493     helper_double_26_recursive(buf + 64, 5);
22494     helper_double_26_recursive(buf + 96, 5);
22495     helper_double_26_recursive(buf + 128, 5);
22496     helper_double_26_recursive(buf + 160, 5);
22497     helper_double_26_recursive(buf + 192, 5);
22498     helper_double_26_recursive(buf + 224, 5);
22499     for (int j = 0; j < 256; j += 256) {
22500       for (int k = 0; k < 32; k += 2) {
22501         __asm__ volatile (
22502           "movupd (%0), %%xmm0\n"
22503           "movupd (%1), %%xmm1\n"
22504           "movupd (%2), %%xmm2\n"
22505           "movupd (%3), %%xmm3\n"
22506           "movupd (%4), %%xmm4\n"
22507           "movupd (%5), %%xmm5\n"
22508           "movupd (%6), %%xmm6\n"
22509           "movupd (%7), %%xmm7\n"
22510           "movapd %%xmm0, %%xmm8\n"
22511           "movapd %%xmm0, %%xmm9\n"
22512           "addpd %%xmm1, %%xmm8\n"
22513           "subpd %%xmm1, %%xmm9\n"
22514           "movapd %%xmm2, %%xmm10\n"
22515           "movapd %%xmm2, %%xmm11\n"
22516           "addpd %%xmm3, %%xmm10\n"
22517           "subpd %%xmm3, %%xmm11\n"
22518           "movapd %%xmm4, %%xmm12\n"
22519           "movapd %%xmm4, %%xmm13\n"
22520           "addpd %%xmm5, %%xmm12\n"
22521           "subpd %%xmm5, %%xmm13\n"
22522           "movapd %%xmm6, %%xmm14\n"
22523           "movapd %%xmm6, %%xmm15\n"
22524           "addpd %%xmm7, %%xmm14\n"
22525           "subpd %%xmm7, %%xmm15\n"
22526           "movapd %%xmm8, %%xmm0\n"
22527           "movapd %%xmm8, %%xmm2\n"
22528           "addpd %%xmm10, %%xmm0\n"
22529           "subpd %%xmm10, %%xmm2\n"
22530           "movapd %%xmm9, %%xmm1\n"
22531           "movapd %%xmm9, %%xmm3\n"
22532           "addpd %%xmm11, %%xmm1\n"
22533           "subpd %%xmm11, %%xmm3\n"
22534           "movapd %%xmm12, %%xmm4\n"
22535           "movapd %%xmm12, %%xmm6\n"
22536           "addpd %%xmm14, %%xmm4\n"
22537           "subpd %%xmm14, %%xmm6\n"
22538           "movapd %%xmm13, %%xmm5\n"
22539           "movapd %%xmm13, %%xmm7\n"
22540           "addpd %%xmm15, %%xmm5\n"
22541           "subpd %%xmm15, %%xmm7\n"
22542           "movapd %%xmm0, %%xmm8\n"
22543           "movapd %%xmm0, %%xmm12\n"
22544           "addpd %%xmm4, %%xmm8\n"
22545           "subpd %%xmm4, %%xmm12\n"
22546           "movapd %%xmm1, %%xmm9\n"
22547           "movapd %%xmm1, %%xmm13\n"
22548           "addpd %%xmm5, %%xmm9\n"
22549           "subpd %%xmm5, %%xmm13\n"
22550           "movapd %%xmm2, %%xmm10\n"
22551           "movapd %%xmm2, %%xmm14\n"
22552           "addpd %%xmm6, %%xmm10\n"
22553           "subpd %%xmm6, %%xmm14\n"
22554           "movapd %%xmm3, %%xmm11\n"
22555           "movapd %%xmm3, %%xmm15\n"
22556           "addpd %%xmm7, %%xmm11\n"
22557           "subpd %%xmm7, %%xmm15\n"
22558           "movupd %%xmm8, (%0)\n"
22559           "movupd %%xmm9, (%1)\n"
22560           "movupd %%xmm10, (%2)\n"
22561           "movupd %%xmm11, (%3)\n"
22562           "movupd %%xmm12, (%4)\n"
22563           "movupd %%xmm13, (%5)\n"
22564           "movupd %%xmm14, (%6)\n"
22565           "movupd %%xmm15, (%7)\n"
22566           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
22567         );
22568       }
22569     }
22570     return;
22571   }
22572   if (depth == 11) {
22573     helper_double_26_recursive(buf + 0, 8);
22574     helper_double_26_recursive(buf + 256, 8);
22575     helper_double_26_recursive(buf + 512, 8);
22576     helper_double_26_recursive(buf + 768, 8);
22577     helper_double_26_recursive(buf + 1024, 8);
22578     helper_double_26_recursive(buf + 1280, 8);
22579     helper_double_26_recursive(buf + 1536, 8);
22580     helper_double_26_recursive(buf + 1792, 8);
22581     for (int j = 0; j < 2048; j += 2048) {
22582       for (int k = 0; k < 256; k += 2) {
22583         __asm__ volatile (
22584           "movupd (%0), %%xmm0\n"
22585           "movupd (%1), %%xmm1\n"
22586           "movupd (%2), %%xmm2\n"
22587           "movupd (%3), %%xmm3\n"
22588           "movupd (%4), %%xmm4\n"
22589           "movupd (%5), %%xmm5\n"
22590           "movupd (%6), %%xmm6\n"
22591           "movupd (%7), %%xmm7\n"
22592           "movapd %%xmm0, %%xmm8\n"
22593           "movapd %%xmm0, %%xmm9\n"
22594           "addpd %%xmm1, %%xmm8\n"
22595           "subpd %%xmm1, %%xmm9\n"
22596           "movapd %%xmm2, %%xmm10\n"
22597           "movapd %%xmm2, %%xmm11\n"
22598           "addpd %%xmm3, %%xmm10\n"
22599           "subpd %%xmm3, %%xmm11\n"
22600           "movapd %%xmm4, %%xmm12\n"
22601           "movapd %%xmm4, %%xmm13\n"
22602           "addpd %%xmm5, %%xmm12\n"
22603           "subpd %%xmm5, %%xmm13\n"
22604           "movapd %%xmm6, %%xmm14\n"
22605           "movapd %%xmm6, %%xmm15\n"
22606           "addpd %%xmm7, %%xmm14\n"
22607           "subpd %%xmm7, %%xmm15\n"
22608           "movapd %%xmm8, %%xmm0\n"
22609           "movapd %%xmm8, %%xmm2\n"
22610           "addpd %%xmm10, %%xmm0\n"
22611           "subpd %%xmm10, %%xmm2\n"
22612           "movapd %%xmm9, %%xmm1\n"
22613           "movapd %%xmm9, %%xmm3\n"
22614           "addpd %%xmm11, %%xmm1\n"
22615           "subpd %%xmm11, %%xmm3\n"
22616           "movapd %%xmm12, %%xmm4\n"
22617           "movapd %%xmm12, %%xmm6\n"
22618           "addpd %%xmm14, %%xmm4\n"
22619           "subpd %%xmm14, %%xmm6\n"
22620           "movapd %%xmm13, %%xmm5\n"
22621           "movapd %%xmm13, %%xmm7\n"
22622           "addpd %%xmm15, %%xmm5\n"
22623           "subpd %%xmm15, %%xmm7\n"
22624           "movapd %%xmm0, %%xmm8\n"
22625           "movapd %%xmm0, %%xmm12\n"
22626           "addpd %%xmm4, %%xmm8\n"
22627           "subpd %%xmm4, %%xmm12\n"
22628           "movapd %%xmm1, %%xmm9\n"
22629           "movapd %%xmm1, %%xmm13\n"
22630           "addpd %%xmm5, %%xmm9\n"
22631           "subpd %%xmm5, %%xmm13\n"
22632           "movapd %%xmm2, %%xmm10\n"
22633           "movapd %%xmm2, %%xmm14\n"
22634           "addpd %%xmm6, %%xmm10\n"
22635           "subpd %%xmm6, %%xmm14\n"
22636           "movapd %%xmm3, %%xmm11\n"
22637           "movapd %%xmm3, %%xmm15\n"
22638           "addpd %%xmm7, %%xmm11\n"
22639           "subpd %%xmm7, %%xmm15\n"
22640           "movupd %%xmm8, (%0)\n"
22641           "movupd %%xmm9, (%1)\n"
22642           "movupd %%xmm10, (%2)\n"
22643           "movupd %%xmm11, (%3)\n"
22644           "movupd %%xmm12, (%4)\n"
22645           "movupd %%xmm13, (%5)\n"
22646           "movupd %%xmm14, (%6)\n"
22647           "movupd %%xmm15, (%7)\n"
22648           :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
22649         );
22650       }
22651     }
22652     return;
22653   }
22654   if (depth == 14) {
22655     helper_double_26_recursive(buf + 0, 11);
22656     helper_double_26_recursive(buf + 2048, 11);
22657     helper_double_26_recursive(buf + 4096, 11);
22658     helper_double_26_recursive(buf + 6144, 11);
22659     helper_double_26_recursive(buf + 8192, 11);
22660     helper_double_26_recursive(buf + 10240, 11);
22661     helper_double_26_recursive(buf + 12288, 11);
22662     helper_double_26_recursive(buf + 14336, 11);
22663     for (int j = 0; j < 16384; j += 16384) {
22664       for (int k = 0; k < 2048; k += 2) {
22665         __asm__ volatile (
22666           "movupd (%0), %%xmm0\n"
22667           "movupd (%1), %%xmm1\n"
22668           "movupd (%2), %%xmm2\n"
22669           "movupd (%3), %%xmm3\n"
22670           "movupd (%4), %%xmm4\n"
22671           "movupd (%5), %%xmm5\n"
22672           "movupd (%6), %%xmm6\n"
22673           "movupd (%7), %%xmm7\n"
22674           "movapd %%xmm0, %%xmm8\n"
22675           "movapd %%xmm0, %%xmm9\n"
22676           "addpd %%xmm1, %%xmm8\n"
22677           "subpd %%xmm1, %%xmm9\n"
22678           "movapd %%xmm2, %%xmm10\n"
22679           "movapd %%xmm2, %%xmm11\n"
22680           "addpd %%xmm3, %%xmm10\n"
22681           "subpd %%xmm3, %%xmm11\n"
22682           "movapd %%xmm4, %%xmm12\n"
22683           "movapd %%xmm4, %%xmm13\n"
22684           "addpd %%xmm5, %%xmm12\n"
22685           "subpd %%xmm5, %%xmm13\n"
22686           "movapd %%xmm6, %%xmm14\n"
22687           "movapd %%xmm6, %%xmm15\n"
22688           "addpd %%xmm7, %%xmm14\n"
22689           "subpd %%xmm7, %%xmm15\n"
22690           "movapd %%xmm8, %%xmm0\n"
22691           "movapd %%xmm8, %%xmm2\n"
22692           "addpd %%xmm10, %%xmm0\n"
22693           "subpd %%xmm10, %%xmm2\n"
22694           "movapd %%xmm9, %%xmm1\n"
22695           "movapd %%xmm9, %%xmm3\n"
22696           "addpd %%xmm11, %%xmm1\n"
22697           "subpd %%xmm11, %%xmm3\n"
22698           "movapd %%xmm12, %%xmm4\n"
22699           "movapd %%xmm12, %%xmm6\n"
22700           "addpd %%xmm14, %%xmm4\n"
22701           "subpd %%xmm14, %%xmm6\n"
22702           "movapd %%xmm13, %%xmm5\n"
22703           "movapd %%xmm13, %%xmm7\n"
22704           "addpd %%xmm15, %%xmm5\n"
22705           "subpd %%xmm15, %%xmm7\n"
22706           "movapd %%xmm0, %%xmm8\n"
22707           "movapd %%xmm0, %%xmm12\n"
22708           "addpd %%xmm4, %%xmm8\n"
22709           "subpd %%xmm4, %%xmm12\n"
22710           "movapd %%xmm1, %%xmm9\n"
22711           "movapd %%xmm1, %%xmm13\n"
22712           "addpd %%xmm5, %%xmm9\n"
22713           "subpd %%xmm5, %%xmm13\n"
22714           "movapd %%xmm2, %%xmm10\n"
22715           "movapd %%xmm2, %%xmm14\n"
22716           "addpd %%xmm6, %%xmm10\n"
22717           "subpd %%xmm6, %%xmm14\n"
22718           "movapd %%xmm3, %%xmm11\n"
22719           "movapd %%xmm3, %%xmm15\n"
22720           "addpd %%xmm7, %%xmm11\n"
22721           "subpd %%xmm7, %%xmm15\n"
22722           "movupd %%xmm8, (%0)\n"
22723           "movupd %%xmm9, (%1)\n"
22724           "movupd %%xmm10, (%2)\n"
22725           "movupd %%xmm11, (%3)\n"
22726           "movupd %%xmm12, (%4)\n"
22727           "movupd %%xmm13, (%5)\n"
22728           "movupd %%xmm14, (%6)\n"
22729           "movupd %%xmm15, (%7)\n"
22730           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
22731         );
22732       }
22733     }
22734     return;
22735   }
22736   if (depth == 17) {
22737     helper_double_26_recursive(buf + 0, 14);
22738     helper_double_26_recursive(buf + 16384, 14);
22739     helper_double_26_recursive(buf + 32768, 14);
22740     helper_double_26_recursive(buf + 49152, 14);
22741     helper_double_26_recursive(buf + 65536, 14);
22742     helper_double_26_recursive(buf + 81920, 14);
22743     helper_double_26_recursive(buf + 98304, 14);
22744     helper_double_26_recursive(buf + 114688, 14);
22745     for (int j = 0; j < 131072; j += 131072) {
22746       for (int k = 0; k < 16384; k += 2) {
22747         __asm__ volatile (
22748           "movupd (%0), %%xmm0\n"
22749           "movupd (%1), %%xmm1\n"
22750           "movupd (%2), %%xmm2\n"
22751           "movupd (%3), %%xmm3\n"
22752           "movupd (%4), %%xmm4\n"
22753           "movupd (%5), %%xmm5\n"
22754           "movupd (%6), %%xmm6\n"
22755           "movupd (%7), %%xmm7\n"
22756           "movapd %%xmm0, %%xmm8\n"
22757           "movapd %%xmm0, %%xmm9\n"
22758           "addpd %%xmm1, %%xmm8\n"
22759           "subpd %%xmm1, %%xmm9\n"
22760           "movapd %%xmm2, %%xmm10\n"
22761           "movapd %%xmm2, %%xmm11\n"
22762           "addpd %%xmm3, %%xmm10\n"
22763           "subpd %%xmm3, %%xmm11\n"
22764           "movapd %%xmm4, %%xmm12\n"
22765           "movapd %%xmm4, %%xmm13\n"
22766           "addpd %%xmm5, %%xmm12\n"
22767           "subpd %%xmm5, %%xmm13\n"
22768           "movapd %%xmm6, %%xmm14\n"
22769           "movapd %%xmm6, %%xmm15\n"
22770           "addpd %%xmm7, %%xmm14\n"
22771           "subpd %%xmm7, %%xmm15\n"
22772           "movapd %%xmm8, %%xmm0\n"
22773           "movapd %%xmm8, %%xmm2\n"
22774           "addpd %%xmm10, %%xmm0\n"
22775           "subpd %%xmm10, %%xmm2\n"
22776           "movapd %%xmm9, %%xmm1\n"
22777           "movapd %%xmm9, %%xmm3\n"
22778           "addpd %%xmm11, %%xmm1\n"
22779           "subpd %%xmm11, %%xmm3\n"
22780           "movapd %%xmm12, %%xmm4\n"
22781           "movapd %%xmm12, %%xmm6\n"
22782           "addpd %%xmm14, %%xmm4\n"
22783           "subpd %%xmm14, %%xmm6\n"
22784           "movapd %%xmm13, %%xmm5\n"
22785           "movapd %%xmm13, %%xmm7\n"
22786           "addpd %%xmm15, %%xmm5\n"
22787           "subpd %%xmm15, %%xmm7\n"
22788           "movapd %%xmm0, %%xmm8\n"
22789           "movapd %%xmm0, %%xmm12\n"
22790           "addpd %%xmm4, %%xmm8\n"
22791           "subpd %%xmm4, %%xmm12\n"
22792           "movapd %%xmm1, %%xmm9\n"
22793           "movapd %%xmm1, %%xmm13\n"
22794           "addpd %%xmm5, %%xmm9\n"
22795           "subpd %%xmm5, %%xmm13\n"
22796           "movapd %%xmm2, %%xmm10\n"
22797           "movapd %%xmm2, %%xmm14\n"
22798           "addpd %%xmm6, %%xmm10\n"
22799           "subpd %%xmm6, %%xmm14\n"
22800           "movapd %%xmm3, %%xmm11\n"
22801           "movapd %%xmm3, %%xmm15\n"
22802           "addpd %%xmm7, %%xmm11\n"
22803           "subpd %%xmm7, %%xmm15\n"
22804           "movupd %%xmm8, (%0)\n"
22805           "movupd %%xmm9, (%1)\n"
22806           "movupd %%xmm10, (%2)\n"
22807           "movupd %%xmm11, (%3)\n"
22808           "movupd %%xmm12, (%4)\n"
22809           "movupd %%xmm13, (%5)\n"
22810           "movupd %%xmm14, (%6)\n"
22811           "movupd %%xmm15, (%7)\n"
22812           :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
22813         );
22814       }
22815     }
22816     return;
22817   }
22818   if (depth == 20) {
22819     helper_double_26_recursive(buf + 0, 17);
22820     helper_double_26_recursive(buf + 131072, 17);
22821     helper_double_26_recursive(buf + 262144, 17);
22822     helper_double_26_recursive(buf + 393216, 17);
22823     helper_double_26_recursive(buf + 524288, 17);
22824     helper_double_26_recursive(buf + 655360, 17);
22825     helper_double_26_recursive(buf + 786432, 17);
22826     helper_double_26_recursive(buf + 917504, 17);
22827     for (int j = 0; j < 1048576; j += 1048576) {
22828       for (int k = 0; k < 131072; k += 2) {
22829         __asm__ volatile (
22830           "movupd (%0), %%xmm0\n"
22831           "movupd (%1), %%xmm1\n"
22832           "movupd (%2), %%xmm2\n"
22833           "movupd (%3), %%xmm3\n"
22834           "movupd (%4), %%xmm4\n"
22835           "movupd (%5), %%xmm5\n"
22836           "movupd (%6), %%xmm6\n"
22837           "movupd (%7), %%xmm7\n"
22838           "movapd %%xmm0, %%xmm8\n"
22839           "movapd %%xmm0, %%xmm9\n"
22840           "addpd %%xmm1, %%xmm8\n"
22841           "subpd %%xmm1, %%xmm9\n"
22842           "movapd %%xmm2, %%xmm10\n"
22843           "movapd %%xmm2, %%xmm11\n"
22844           "addpd %%xmm3, %%xmm10\n"
22845           "subpd %%xmm3, %%xmm11\n"
22846           "movapd %%xmm4, %%xmm12\n"
22847           "movapd %%xmm4, %%xmm13\n"
22848           "addpd %%xmm5, %%xmm12\n"
22849           "subpd %%xmm5, %%xmm13\n"
22850           "movapd %%xmm6, %%xmm14\n"
22851           "movapd %%xmm6, %%xmm15\n"
22852           "addpd %%xmm7, %%xmm14\n"
22853           "subpd %%xmm7, %%xmm15\n"
22854           "movapd %%xmm8, %%xmm0\n"
22855           "movapd %%xmm8, %%xmm2\n"
22856           "addpd %%xmm10, %%xmm0\n"
22857           "subpd %%xmm10, %%xmm2\n"
22858           "movapd %%xmm9, %%xmm1\n"
22859           "movapd %%xmm9, %%xmm3\n"
22860           "addpd %%xmm11, %%xmm1\n"
22861           "subpd %%xmm11, %%xmm3\n"
22862           "movapd %%xmm12, %%xmm4\n"
22863           "movapd %%xmm12, %%xmm6\n"
22864           "addpd %%xmm14, %%xmm4\n"
22865           "subpd %%xmm14, %%xmm6\n"
22866           "movapd %%xmm13, %%xmm5\n"
22867           "movapd %%xmm13, %%xmm7\n"
22868           "addpd %%xmm15, %%xmm5\n"
22869           "subpd %%xmm15, %%xmm7\n"
22870           "movapd %%xmm0, %%xmm8\n"
22871           "movapd %%xmm0, %%xmm12\n"
22872           "addpd %%xmm4, %%xmm8\n"
22873           "subpd %%xmm4, %%xmm12\n"
22874           "movapd %%xmm1, %%xmm9\n"
22875           "movapd %%xmm1, %%xmm13\n"
22876           "addpd %%xmm5, %%xmm9\n"
22877           "subpd %%xmm5, %%xmm13\n"
22878           "movapd %%xmm2, %%xmm10\n"
22879           "movapd %%xmm2, %%xmm14\n"
22880           "addpd %%xmm6, %%xmm10\n"
22881           "subpd %%xmm6, %%xmm14\n"
22882           "movapd %%xmm3, %%xmm11\n"
22883           "movapd %%xmm3, %%xmm15\n"
22884           "addpd %%xmm7, %%xmm11\n"
22885           "subpd %%xmm7, %%xmm15\n"
22886           "movupd %%xmm8, (%0)\n"
22887           "movupd %%xmm9, (%1)\n"
22888           "movupd %%xmm10, (%2)\n"
22889           "movupd %%xmm11, (%3)\n"
22890           "movupd %%xmm12, (%4)\n"
22891           "movupd %%xmm13, (%5)\n"
22892           "movupd %%xmm14, (%6)\n"
22893           "movupd %%xmm15, (%7)\n"
22894           :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
22895         );
22896       }
22897     }
22898     return;
22899   }
22900   if (depth == 23) {
22901     helper_double_26_recursive(buf + 0, 20);
22902     helper_double_26_recursive(buf + 1048576, 20);
22903     helper_double_26_recursive(buf + 2097152, 20);
22904     helper_double_26_recursive(buf + 3145728, 20);
22905     helper_double_26_recursive(buf + 4194304, 20);
22906     helper_double_26_recursive(buf + 5242880, 20);
22907     helper_double_26_recursive(buf + 6291456, 20);
22908     helper_double_26_recursive(buf + 7340032, 20);
22909     for (int j = 0; j < 8388608; j += 8388608) {
22910       for (int k = 0; k < 1048576; k += 2) {
22911         __asm__ volatile (
22912           "movupd (%0), %%xmm0\n"
22913           "movupd (%1), %%xmm1\n"
22914           "movupd (%2), %%xmm2\n"
22915           "movupd (%3), %%xmm3\n"
22916           "movupd (%4), %%xmm4\n"
22917           "movupd (%5), %%xmm5\n"
22918           "movupd (%6), %%xmm6\n"
22919           "movupd (%7), %%xmm7\n"
22920           "movapd %%xmm0, %%xmm8\n"
22921           "movapd %%xmm0, %%xmm9\n"
22922           "addpd %%xmm1, %%xmm8\n"
22923           "subpd %%xmm1, %%xmm9\n"
22924           "movapd %%xmm2, %%xmm10\n"
22925           "movapd %%xmm2, %%xmm11\n"
22926           "addpd %%xmm3, %%xmm10\n"
22927           "subpd %%xmm3, %%xmm11\n"
22928           "movapd %%xmm4, %%xmm12\n"
22929           "movapd %%xmm4, %%xmm13\n"
22930           "addpd %%xmm5, %%xmm12\n"
22931           "subpd %%xmm5, %%xmm13\n"
22932           "movapd %%xmm6, %%xmm14\n"
22933           "movapd %%xmm6, %%xmm15\n"
22934           "addpd %%xmm7, %%xmm14\n"
22935           "subpd %%xmm7, %%xmm15\n"
22936           "movapd %%xmm8, %%xmm0\n"
22937           "movapd %%xmm8, %%xmm2\n"
22938           "addpd %%xmm10, %%xmm0\n"
22939           "subpd %%xmm10, %%xmm2\n"
22940           "movapd %%xmm9, %%xmm1\n"
22941           "movapd %%xmm9, %%xmm3\n"
22942           "addpd %%xmm11, %%xmm1\n"
22943           "subpd %%xmm11, %%xmm3\n"
22944           "movapd %%xmm12, %%xmm4\n"
22945           "movapd %%xmm12, %%xmm6\n"
22946           "addpd %%xmm14, %%xmm4\n"
22947           "subpd %%xmm14, %%xmm6\n"
22948           "movapd %%xmm13, %%xmm5\n"
22949           "movapd %%xmm13, %%xmm7\n"
22950           "addpd %%xmm15, %%xmm5\n"
22951           "subpd %%xmm15, %%xmm7\n"
22952           "movapd %%xmm0, %%xmm8\n"
22953           "movapd %%xmm0, %%xmm12\n"
22954           "addpd %%xmm4, %%xmm8\n"
22955           "subpd %%xmm4, %%xmm12\n"
22956           "movapd %%xmm1, %%xmm9\n"
22957           "movapd %%xmm1, %%xmm13\n"
22958           "addpd %%xmm5, %%xmm9\n"
22959           "subpd %%xmm5, %%xmm13\n"
22960           "movapd %%xmm2, %%xmm10\n"
22961           "movapd %%xmm2, %%xmm14\n"
22962           "addpd %%xmm6, %%xmm10\n"
22963           "subpd %%xmm6, %%xmm14\n"
22964           "movapd %%xmm3, %%xmm11\n"
22965           "movapd %%xmm3, %%xmm15\n"
22966           "addpd %%xmm7, %%xmm11\n"
22967           "subpd %%xmm7, %%xmm15\n"
22968           "movupd %%xmm8, (%0)\n"
22969           "movupd %%xmm9, (%1)\n"
22970           "movupd %%xmm10, (%2)\n"
22971           "movupd %%xmm11, (%3)\n"
22972           "movupd %%xmm12, (%4)\n"
22973           "movupd %%xmm13, (%5)\n"
22974           "movupd %%xmm14, (%6)\n"
22975           "movupd %%xmm15, (%7)\n"
22976           :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
22977         );
22978       }
22979     }
22980     return;
22981   }
22982   if (depth == 26) {
22983     helper_double_26_recursive(buf + 0, 23);
22984     helper_double_26_recursive(buf + 8388608, 23);
22985     helper_double_26_recursive(buf + 16777216, 23);
22986     helper_double_26_recursive(buf + 25165824, 23);
22987     helper_double_26_recursive(buf + 33554432, 23);
22988     helper_double_26_recursive(buf + 41943040, 23);
22989     helper_double_26_recursive(buf + 50331648, 23);
22990     helper_double_26_recursive(buf + 58720256, 23);
22991     for (int j = 0; j < 67108864; j += 67108864) {
22992       for (int k = 0; k < 8388608; k += 2) {
22993         __asm__ volatile (
22994           "movupd (%0), %%xmm0\n"
22995           "movupd (%1), %%xmm1\n"
22996           "movupd (%2), %%xmm2\n"
22997           "movupd (%3), %%xmm3\n"
22998           "movupd (%4), %%xmm4\n"
22999           "movupd (%5), %%xmm5\n"
23000           "movupd (%6), %%xmm6\n"
23001           "movupd (%7), %%xmm7\n"
23002           "movapd %%xmm0, %%xmm8\n"
23003           "movapd %%xmm0, %%xmm9\n"
23004           "addpd %%xmm1, %%xmm8\n"
23005           "subpd %%xmm1, %%xmm9\n"
23006           "movapd %%xmm2, %%xmm10\n"
23007           "movapd %%xmm2, %%xmm11\n"
23008           "addpd %%xmm3, %%xmm10\n"
23009           "subpd %%xmm3, %%xmm11\n"
23010           "movapd %%xmm4, %%xmm12\n"
23011           "movapd %%xmm4, %%xmm13\n"
23012           "addpd %%xmm5, %%xmm12\n"
23013           "subpd %%xmm5, %%xmm13\n"
23014           "movapd %%xmm6, %%xmm14\n"
23015           "movapd %%xmm6, %%xmm15\n"
23016           "addpd %%xmm7, %%xmm14\n"
23017           "subpd %%xmm7, %%xmm15\n"
23018           "movapd %%xmm8, %%xmm0\n"
23019           "movapd %%xmm8, %%xmm2\n"
23020           "addpd %%xmm10, %%xmm0\n"
23021           "subpd %%xmm10, %%xmm2\n"
23022           "movapd %%xmm9, %%xmm1\n"
23023           "movapd %%xmm9, %%xmm3\n"
23024           "addpd %%xmm11, %%xmm1\n"
23025           "subpd %%xmm11, %%xmm3\n"
23026           "movapd %%xmm12, %%xmm4\n"
23027           "movapd %%xmm12, %%xmm6\n"
23028           "addpd %%xmm14, %%xmm4\n"
23029           "subpd %%xmm14, %%xmm6\n"
23030           "movapd %%xmm13, %%xmm5\n"
23031           "movapd %%xmm13, %%xmm7\n"
23032           "addpd %%xmm15, %%xmm5\n"
23033           "subpd %%xmm15, %%xmm7\n"
23034           "movapd %%xmm0, %%xmm8\n"
23035           "movapd %%xmm0, %%xmm12\n"
23036           "addpd %%xmm4, %%xmm8\n"
23037           "subpd %%xmm4, %%xmm12\n"
23038           "movapd %%xmm1, %%xmm9\n"
23039           "movapd %%xmm1, %%xmm13\n"
23040           "addpd %%xmm5, %%xmm9\n"
23041           "subpd %%xmm5, %%xmm13\n"
23042           "movapd %%xmm2, %%xmm10\n"
23043           "movapd %%xmm2, %%xmm14\n"
23044           "addpd %%xmm6, %%xmm10\n"
23045           "subpd %%xmm6, %%xmm14\n"
23046           "movapd %%xmm3, %%xmm11\n"
23047           "movapd %%xmm3, %%xmm15\n"
23048           "addpd %%xmm7, %%xmm11\n"
23049           "subpd %%xmm7, %%xmm15\n"
23050           "movupd %%xmm8, (%0)\n"
23051           "movupd %%xmm9, (%1)\n"
23052           "movupd %%xmm10, (%2)\n"
23053           "movupd %%xmm11, (%3)\n"
23054           "movupd %%xmm12, (%4)\n"
23055           "movupd %%xmm13, (%5)\n"
23056           "movupd %%xmm14, (%6)\n"
23057           "movupd %%xmm15, (%7)\n"
23058           :: "r"(buf + j + k + 0), "r"(buf + j + k + 8388608), "r"(buf + j + k + 16777216), "r"(buf + j + k + 25165824), "r"(buf + j + k + 33554432), "r"(buf + j + k + 41943040), "r"(buf + j + k + 50331648), "r"(buf + j + k + 58720256) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
23059         );
23060       }
23061     }
23062     return;
23063   }
23064 }
void helper_double_26(double *buf);
/* Entry point for the size-2^26 double-precision Hadamard transform.
 * Delegates to the recursive helper with the full depth of 26; the
 * recursive helper dispatches on `depth` (base cases and merge passes
 * implemented in SSE2 inline asm) to transform the buffer in place.
 * `buf` must hold 2^26 = 67108864 doubles. */
void helper_double_26(double *buf) {
  helper_double_26_recursive(buf, 26);
}
23069 void helper_double_27_recursive(double *buf, int depth);
helper_double_27_recursive(double * buf,int depth)23070 void helper_double_27_recursive(double *buf, int depth) {
23071   if (depth == 6) {
23072     for (int j = 0; j < 64; j += 16) {
23073       for (int k = 0; k < 2; k += 2) {
23074         __asm__ volatile (
23075           "movupd (%0), %%xmm0\n"
23076           "movupd (%1), %%xmm1\n"
23077           "movupd (%2), %%xmm2\n"
23078           "movupd (%3), %%xmm3\n"
23079           "movupd (%4), %%xmm4\n"
23080           "movupd (%5), %%xmm5\n"
23081           "movupd (%6), %%xmm6\n"
23082           "movupd (%7), %%xmm7\n"
23083           "movapd %%xmm0, %%xmm8\n"
23084           "haddpd %%xmm8, %%xmm8\n"
23085           "movapd %%xmm0, %%xmm9\n"
23086           "hsubpd %%xmm9, %%xmm9\n"
23087           "blendpd $1, %%xmm8, %%xmm9\n"
23088           "movapd %%xmm9, %%xmm0\n"
23089           "movapd %%xmm1, %%xmm8\n"
23090           "haddpd %%xmm8, %%xmm8\n"
23091           "movapd %%xmm1, %%xmm9\n"
23092           "hsubpd %%xmm9, %%xmm9\n"
23093           "blendpd $1, %%xmm8, %%xmm9\n"
23094           "movapd %%xmm9, %%xmm1\n"
23095           "movapd %%xmm2, %%xmm8\n"
23096           "haddpd %%xmm8, %%xmm8\n"
23097           "movapd %%xmm2, %%xmm9\n"
23098           "hsubpd %%xmm9, %%xmm9\n"
23099           "blendpd $1, %%xmm8, %%xmm9\n"
23100           "movapd %%xmm9, %%xmm2\n"
23101           "movapd %%xmm3, %%xmm8\n"
23102           "haddpd %%xmm8, %%xmm8\n"
23103           "movapd %%xmm3, %%xmm9\n"
23104           "hsubpd %%xmm9, %%xmm9\n"
23105           "blendpd $1, %%xmm8, %%xmm9\n"
23106           "movapd %%xmm9, %%xmm3\n"
23107           "movapd %%xmm4, %%xmm8\n"
23108           "haddpd %%xmm8, %%xmm8\n"
23109           "movapd %%xmm4, %%xmm9\n"
23110           "hsubpd %%xmm9, %%xmm9\n"
23111           "blendpd $1, %%xmm8, %%xmm9\n"
23112           "movapd %%xmm9, %%xmm4\n"
23113           "movapd %%xmm5, %%xmm8\n"
23114           "haddpd %%xmm8, %%xmm8\n"
23115           "movapd %%xmm5, %%xmm9\n"
23116           "hsubpd %%xmm9, %%xmm9\n"
23117           "blendpd $1, %%xmm8, %%xmm9\n"
23118           "movapd %%xmm9, %%xmm5\n"
23119           "movapd %%xmm6, %%xmm8\n"
23120           "haddpd %%xmm8, %%xmm8\n"
23121           "movapd %%xmm6, %%xmm9\n"
23122           "hsubpd %%xmm9, %%xmm9\n"
23123           "blendpd $1, %%xmm8, %%xmm9\n"
23124           "movapd %%xmm9, %%xmm6\n"
23125           "movapd %%xmm7, %%xmm8\n"
23126           "haddpd %%xmm8, %%xmm8\n"
23127           "movapd %%xmm7, %%xmm9\n"
23128           "hsubpd %%xmm9, %%xmm9\n"
23129           "blendpd $1, %%xmm8, %%xmm9\n"
23130           "movapd %%xmm9, %%xmm7\n"
23131           "movapd %%xmm0, %%xmm8\n"
23132           "movapd %%xmm0, %%xmm9\n"
23133           "addpd %%xmm1, %%xmm8\n"
23134           "subpd %%xmm1, %%xmm9\n"
23135           "movapd %%xmm2, %%xmm10\n"
23136           "movapd %%xmm2, %%xmm11\n"
23137           "addpd %%xmm3, %%xmm10\n"
23138           "subpd %%xmm3, %%xmm11\n"
23139           "movapd %%xmm4, %%xmm12\n"
23140           "movapd %%xmm4, %%xmm13\n"
23141           "addpd %%xmm5, %%xmm12\n"
23142           "subpd %%xmm5, %%xmm13\n"
23143           "movapd %%xmm6, %%xmm14\n"
23144           "movapd %%xmm6, %%xmm15\n"
23145           "addpd %%xmm7, %%xmm14\n"
23146           "subpd %%xmm7, %%xmm15\n"
23147           "movapd %%xmm8, %%xmm0\n"
23148           "movapd %%xmm8, %%xmm2\n"
23149           "addpd %%xmm10, %%xmm0\n"
23150           "subpd %%xmm10, %%xmm2\n"
23151           "movapd %%xmm9, %%xmm1\n"
23152           "movapd %%xmm9, %%xmm3\n"
23153           "addpd %%xmm11, %%xmm1\n"
23154           "subpd %%xmm11, %%xmm3\n"
23155           "movapd %%xmm12, %%xmm4\n"
23156           "movapd %%xmm12, %%xmm6\n"
23157           "addpd %%xmm14, %%xmm4\n"
23158           "subpd %%xmm14, %%xmm6\n"
23159           "movapd %%xmm13, %%xmm5\n"
23160           "movapd %%xmm13, %%xmm7\n"
23161           "addpd %%xmm15, %%xmm5\n"
23162           "subpd %%xmm15, %%xmm7\n"
23163           "movapd %%xmm0, %%xmm8\n"
23164           "movapd %%xmm0, %%xmm12\n"
23165           "addpd %%xmm4, %%xmm8\n"
23166           "subpd %%xmm4, %%xmm12\n"
23167           "movapd %%xmm1, %%xmm9\n"
23168           "movapd %%xmm1, %%xmm13\n"
23169           "addpd %%xmm5, %%xmm9\n"
23170           "subpd %%xmm5, %%xmm13\n"
23171           "movapd %%xmm2, %%xmm10\n"
23172           "movapd %%xmm2, %%xmm14\n"
23173           "addpd %%xmm6, %%xmm10\n"
23174           "subpd %%xmm6, %%xmm14\n"
23175           "movapd %%xmm3, %%xmm11\n"
23176           "movapd %%xmm3, %%xmm15\n"
23177           "addpd %%xmm7, %%xmm11\n"
23178           "subpd %%xmm7, %%xmm15\n"
23179           "movupd %%xmm8, (%0)\n"
23180           "movupd %%xmm9, (%1)\n"
23181           "movupd %%xmm10, (%2)\n"
23182           "movupd %%xmm11, (%3)\n"
23183           "movupd %%xmm12, (%4)\n"
23184           "movupd %%xmm13, (%5)\n"
23185           "movupd %%xmm14, (%6)\n"
23186           "movupd %%xmm15, (%7)\n"
23187           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
23188         );
23189       }
23190     }
23191     for (int j = 0; j < 64; j += 64) {
23192       for (int k = 0; k < 16; k += 2) {
23193         __asm__ volatile (
23194           "movupd (%0), %%xmm0\n"
23195           "movupd (%1), %%xmm1\n"
23196           "movupd (%2), %%xmm2\n"
23197           "movupd (%3), %%xmm3\n"
23198           "movapd %%xmm0, %%xmm8\n"
23199           "movapd %%xmm0, %%xmm9\n"
23200           "addpd %%xmm1, %%xmm8\n"
23201           "subpd %%xmm1, %%xmm9\n"
23202           "movapd %%xmm2, %%xmm10\n"
23203           "movapd %%xmm2, %%xmm11\n"
23204           "addpd %%xmm3, %%xmm10\n"
23205           "subpd %%xmm3, %%xmm11\n"
23206           "movapd %%xmm8, %%xmm0\n"
23207           "movapd %%xmm8, %%xmm2\n"
23208           "addpd %%xmm10, %%xmm0\n"
23209           "subpd %%xmm10, %%xmm2\n"
23210           "movapd %%xmm9, %%xmm1\n"
23211           "movapd %%xmm9, %%xmm3\n"
23212           "addpd %%xmm11, %%xmm1\n"
23213           "subpd %%xmm11, %%xmm3\n"
23214           "movupd %%xmm0, (%0)\n"
23215           "movupd %%xmm1, (%1)\n"
23216           "movupd %%xmm2, (%2)\n"
23217           "movupd %%xmm3, (%3)\n"
23218           :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
23219         );
23220       }
23221     }
23222     return;
23223   }
23224   if (depth == 9) {
23225     helper_double_27_recursive(buf + 0, 6);
23226     helper_double_27_recursive(buf + 64, 6);
23227     helper_double_27_recursive(buf + 128, 6);
23228     helper_double_27_recursive(buf + 192, 6);
23229     helper_double_27_recursive(buf + 256, 6);
23230     helper_double_27_recursive(buf + 320, 6);
23231     helper_double_27_recursive(buf + 384, 6);
23232     helper_double_27_recursive(buf + 448, 6);
23233     for (int j = 0; j < 512; j += 512) {
23234       for (int k = 0; k < 64; k += 2) {
23235         __asm__ volatile (
23236           "movupd (%0), %%xmm0\n"
23237           "movupd (%1), %%xmm1\n"
23238           "movupd (%2), %%xmm2\n"
23239           "movupd (%3), %%xmm3\n"
23240           "movupd (%4), %%xmm4\n"
23241           "movupd (%5), %%xmm5\n"
23242           "movupd (%6), %%xmm6\n"
23243           "movupd (%7), %%xmm7\n"
23244           "movapd %%xmm0, %%xmm8\n"
23245           "movapd %%xmm0, %%xmm9\n"
23246           "addpd %%xmm1, %%xmm8\n"
23247           "subpd %%xmm1, %%xmm9\n"
23248           "movapd %%xmm2, %%xmm10\n"
23249           "movapd %%xmm2, %%xmm11\n"
23250           "addpd %%xmm3, %%xmm10\n"
23251           "subpd %%xmm3, %%xmm11\n"
23252           "movapd %%xmm4, %%xmm12\n"
23253           "movapd %%xmm4, %%xmm13\n"
23254           "addpd %%xmm5, %%xmm12\n"
23255           "subpd %%xmm5, %%xmm13\n"
23256           "movapd %%xmm6, %%xmm14\n"
23257           "movapd %%xmm6, %%xmm15\n"
23258           "addpd %%xmm7, %%xmm14\n"
23259           "subpd %%xmm7, %%xmm15\n"
23260           "movapd %%xmm8, %%xmm0\n"
23261           "movapd %%xmm8, %%xmm2\n"
23262           "addpd %%xmm10, %%xmm0\n"
23263           "subpd %%xmm10, %%xmm2\n"
23264           "movapd %%xmm9, %%xmm1\n"
23265           "movapd %%xmm9, %%xmm3\n"
23266           "addpd %%xmm11, %%xmm1\n"
23267           "subpd %%xmm11, %%xmm3\n"
23268           "movapd %%xmm12, %%xmm4\n"
23269           "movapd %%xmm12, %%xmm6\n"
23270           "addpd %%xmm14, %%xmm4\n"
23271           "subpd %%xmm14, %%xmm6\n"
23272           "movapd %%xmm13, %%xmm5\n"
23273           "movapd %%xmm13, %%xmm7\n"
23274           "addpd %%xmm15, %%xmm5\n"
23275           "subpd %%xmm15, %%xmm7\n"
23276           "movapd %%xmm0, %%xmm8\n"
23277           "movapd %%xmm0, %%xmm12\n"
23278           "addpd %%xmm4, %%xmm8\n"
23279           "subpd %%xmm4, %%xmm12\n"
23280           "movapd %%xmm1, %%xmm9\n"
23281           "movapd %%xmm1, %%xmm13\n"
23282           "addpd %%xmm5, %%xmm9\n"
23283           "subpd %%xmm5, %%xmm13\n"
23284           "movapd %%xmm2, %%xmm10\n"
23285           "movapd %%xmm2, %%xmm14\n"
23286           "addpd %%xmm6, %%xmm10\n"
23287           "subpd %%xmm6, %%xmm14\n"
23288           "movapd %%xmm3, %%xmm11\n"
23289           "movapd %%xmm3, %%xmm15\n"
23290           "addpd %%xmm7, %%xmm11\n"
23291           "subpd %%xmm7, %%xmm15\n"
23292           "movupd %%xmm8, (%0)\n"
23293           "movupd %%xmm9, (%1)\n"
23294           "movupd %%xmm10, (%2)\n"
23295           "movupd %%xmm11, (%3)\n"
23296           "movupd %%xmm12, (%4)\n"
23297           "movupd %%xmm13, (%5)\n"
23298           "movupd %%xmm14, (%6)\n"
23299           "movupd %%xmm15, (%7)\n"
23300           :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
23301         );
23302       }
23303     }
23304     return;
23305   }
23306   if (depth == 12) {
23307     helper_double_27_recursive(buf + 0, 9);
23308     helper_double_27_recursive(buf + 512, 9);
23309     helper_double_27_recursive(buf + 1024, 9);
23310     helper_double_27_recursive(buf + 1536, 9);
23311     helper_double_27_recursive(buf + 2048, 9);
23312     helper_double_27_recursive(buf + 2560, 9);
23313     helper_double_27_recursive(buf + 3072, 9);
23314     helper_double_27_recursive(buf + 3584, 9);
23315     for (int j = 0; j < 4096; j += 4096) {
23316       for (int k = 0; k < 512; k += 2) {
23317         __asm__ volatile (
23318           "movupd (%0), %%xmm0\n"
23319           "movupd (%1), %%xmm1\n"
23320           "movupd (%2), %%xmm2\n"
23321           "movupd (%3), %%xmm3\n"
23322           "movupd (%4), %%xmm4\n"
23323           "movupd (%5), %%xmm5\n"
23324           "movupd (%6), %%xmm6\n"
23325           "movupd (%7), %%xmm7\n"
23326           "movapd %%xmm0, %%xmm8\n"
23327           "movapd %%xmm0, %%xmm9\n"
23328           "addpd %%xmm1, %%xmm8\n"
23329           "subpd %%xmm1, %%xmm9\n"
23330           "movapd %%xmm2, %%xmm10\n"
23331           "movapd %%xmm2, %%xmm11\n"
23332           "addpd %%xmm3, %%xmm10\n"
23333           "subpd %%xmm3, %%xmm11\n"
23334           "movapd %%xmm4, %%xmm12\n"
23335           "movapd %%xmm4, %%xmm13\n"
23336           "addpd %%xmm5, %%xmm12\n"
23337           "subpd %%xmm5, %%xmm13\n"
23338           "movapd %%xmm6, %%xmm14\n"
23339           "movapd %%xmm6, %%xmm15\n"
23340           "addpd %%xmm7, %%xmm14\n"
23341           "subpd %%xmm7, %%xmm15\n"
23342           "movapd %%xmm8, %%xmm0\n"
23343           "movapd %%xmm8, %%xmm2\n"
23344           "addpd %%xmm10, %%xmm0\n"
23345           "subpd %%xmm10, %%xmm2\n"
23346           "movapd %%xmm9, %%xmm1\n"
23347           "movapd %%xmm9, %%xmm3\n"
23348           "addpd %%xmm11, %%xmm1\n"
23349           "subpd %%xmm11, %%xmm3\n"
23350           "movapd %%xmm12, %%xmm4\n"
23351           "movapd %%xmm12, %%xmm6\n"
23352           "addpd %%xmm14, %%xmm4\n"
23353           "subpd %%xmm14, %%xmm6\n"
23354           "movapd %%xmm13, %%xmm5\n"
23355           "movapd %%xmm13, %%xmm7\n"
23356           "addpd %%xmm15, %%xmm5\n"
23357           "subpd %%xmm15, %%xmm7\n"
23358           "movapd %%xmm0, %%xmm8\n"
23359           "movapd %%xmm0, %%xmm12\n"
23360           "addpd %%xmm4, %%xmm8\n"
23361           "subpd %%xmm4, %%xmm12\n"
23362           "movapd %%xmm1, %%xmm9\n"
23363           "movapd %%xmm1, %%xmm13\n"
23364           "addpd %%xmm5, %%xmm9\n"
23365           "subpd %%xmm5, %%xmm13\n"
23366           "movapd %%xmm2, %%xmm10\n"
23367           "movapd %%xmm2, %%xmm14\n"
23368           "addpd %%xmm6, %%xmm10\n"
23369           "subpd %%xmm6, %%xmm14\n"
23370           "movapd %%xmm3, %%xmm11\n"
23371           "movapd %%xmm3, %%xmm15\n"
23372           "addpd %%xmm7, %%xmm11\n"
23373           "subpd %%xmm7, %%xmm15\n"
23374           "movupd %%xmm8, (%0)\n"
23375           "movupd %%xmm9, (%1)\n"
23376           "movupd %%xmm10, (%2)\n"
23377           "movupd %%xmm11, (%3)\n"
23378           "movupd %%xmm12, (%4)\n"
23379           "movupd %%xmm13, (%5)\n"
23380           "movupd %%xmm14, (%6)\n"
23381           "movupd %%xmm15, (%7)\n"
23382           :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
23383         );
23384       }
23385     }
23386     return;
23387   }
23388   if (depth == 15) {
23389     helper_double_27_recursive(buf + 0, 12);
23390     helper_double_27_recursive(buf + 4096, 12);
23391     helper_double_27_recursive(buf + 8192, 12);
23392     helper_double_27_recursive(buf + 12288, 12);
23393     helper_double_27_recursive(buf + 16384, 12);
23394     helper_double_27_recursive(buf + 20480, 12);
23395     helper_double_27_recursive(buf + 24576, 12);
23396     helper_double_27_recursive(buf + 28672, 12);
23397     for (int j = 0; j < 32768; j += 32768) {
23398       for (int k = 0; k < 4096; k += 2) {
23399         __asm__ volatile (
23400           "movupd (%0), %%xmm0\n"
23401           "movupd (%1), %%xmm1\n"
23402           "movupd (%2), %%xmm2\n"
23403           "movupd (%3), %%xmm3\n"
23404           "movupd (%4), %%xmm4\n"
23405           "movupd (%5), %%xmm5\n"
23406           "movupd (%6), %%xmm6\n"
23407           "movupd (%7), %%xmm7\n"
23408           "movapd %%xmm0, %%xmm8\n"
23409           "movapd %%xmm0, %%xmm9\n"
23410           "addpd %%xmm1, %%xmm8\n"
23411           "subpd %%xmm1, %%xmm9\n"
23412           "movapd %%xmm2, %%xmm10\n"
23413           "movapd %%xmm2, %%xmm11\n"
23414           "addpd %%xmm3, %%xmm10\n"
23415           "subpd %%xmm3, %%xmm11\n"
23416           "movapd %%xmm4, %%xmm12\n"
23417           "movapd %%xmm4, %%xmm13\n"
23418           "addpd %%xmm5, %%xmm12\n"
23419           "subpd %%xmm5, %%xmm13\n"
23420           "movapd %%xmm6, %%xmm14\n"
23421           "movapd %%xmm6, %%xmm15\n"
23422           "addpd %%xmm7, %%xmm14\n"
23423           "subpd %%xmm7, %%xmm15\n"
23424           "movapd %%xmm8, %%xmm0\n"
23425           "movapd %%xmm8, %%xmm2\n"
23426           "addpd %%xmm10, %%xmm0\n"
23427           "subpd %%xmm10, %%xmm2\n"
23428           "movapd %%xmm9, %%xmm1\n"
23429           "movapd %%xmm9, %%xmm3\n"
23430           "addpd %%xmm11, %%xmm1\n"
23431           "subpd %%xmm11, %%xmm3\n"
23432           "movapd %%xmm12, %%xmm4\n"
23433           "movapd %%xmm12, %%xmm6\n"
23434           "addpd %%xmm14, %%xmm4\n"
23435           "subpd %%xmm14, %%xmm6\n"
23436           "movapd %%xmm13, %%xmm5\n"
23437           "movapd %%xmm13, %%xmm7\n"
23438           "addpd %%xmm15, %%xmm5\n"
23439           "subpd %%xmm15, %%xmm7\n"
23440           "movapd %%xmm0, %%xmm8\n"
23441           "movapd %%xmm0, %%xmm12\n"
23442           "addpd %%xmm4, %%xmm8\n"
23443           "subpd %%xmm4, %%xmm12\n"
23444           "movapd %%xmm1, %%xmm9\n"
23445           "movapd %%xmm1, %%xmm13\n"
23446           "addpd %%xmm5, %%xmm9\n"
23447           "subpd %%xmm5, %%xmm13\n"
23448           "movapd %%xmm2, %%xmm10\n"
23449           "movapd %%xmm2, %%xmm14\n"
23450           "addpd %%xmm6, %%xmm10\n"
23451           "subpd %%xmm6, %%xmm14\n"
23452           "movapd %%xmm3, %%xmm11\n"
23453           "movapd %%xmm3, %%xmm15\n"
23454           "addpd %%xmm7, %%xmm11\n"
23455           "subpd %%xmm7, %%xmm15\n"
23456           "movupd %%xmm8, (%0)\n"
23457           "movupd %%xmm9, (%1)\n"
23458           "movupd %%xmm10, (%2)\n"
23459           "movupd %%xmm11, (%3)\n"
23460           "movupd %%xmm12, (%4)\n"
23461           "movupd %%xmm13, (%5)\n"
23462           "movupd %%xmm14, (%6)\n"
23463           "movupd %%xmm15, (%7)\n"
23464           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
23465         );
23466       }
23467     }
23468     return;
23469   }
23470   if (depth == 18) {
23471     helper_double_27_recursive(buf + 0, 15);
23472     helper_double_27_recursive(buf + 32768, 15);
23473     helper_double_27_recursive(buf + 65536, 15);
23474     helper_double_27_recursive(buf + 98304, 15);
23475     helper_double_27_recursive(buf + 131072, 15);
23476     helper_double_27_recursive(buf + 163840, 15);
23477     helper_double_27_recursive(buf + 196608, 15);
23478     helper_double_27_recursive(buf + 229376, 15);
23479     for (int j = 0; j < 262144; j += 262144) {
23480       for (int k = 0; k < 32768; k += 2) {
23481         __asm__ volatile (
23482           "movupd (%0), %%xmm0\n"
23483           "movupd (%1), %%xmm1\n"
23484           "movupd (%2), %%xmm2\n"
23485           "movupd (%3), %%xmm3\n"
23486           "movupd (%4), %%xmm4\n"
23487           "movupd (%5), %%xmm5\n"
23488           "movupd (%6), %%xmm6\n"
23489           "movupd (%7), %%xmm7\n"
23490           "movapd %%xmm0, %%xmm8\n"
23491           "movapd %%xmm0, %%xmm9\n"
23492           "addpd %%xmm1, %%xmm8\n"
23493           "subpd %%xmm1, %%xmm9\n"
23494           "movapd %%xmm2, %%xmm10\n"
23495           "movapd %%xmm2, %%xmm11\n"
23496           "addpd %%xmm3, %%xmm10\n"
23497           "subpd %%xmm3, %%xmm11\n"
23498           "movapd %%xmm4, %%xmm12\n"
23499           "movapd %%xmm4, %%xmm13\n"
23500           "addpd %%xmm5, %%xmm12\n"
23501           "subpd %%xmm5, %%xmm13\n"
23502           "movapd %%xmm6, %%xmm14\n"
23503           "movapd %%xmm6, %%xmm15\n"
23504           "addpd %%xmm7, %%xmm14\n"
23505           "subpd %%xmm7, %%xmm15\n"
23506           "movapd %%xmm8, %%xmm0\n"
23507           "movapd %%xmm8, %%xmm2\n"
23508           "addpd %%xmm10, %%xmm0\n"
23509           "subpd %%xmm10, %%xmm2\n"
23510           "movapd %%xmm9, %%xmm1\n"
23511           "movapd %%xmm9, %%xmm3\n"
23512           "addpd %%xmm11, %%xmm1\n"
23513           "subpd %%xmm11, %%xmm3\n"
23514           "movapd %%xmm12, %%xmm4\n"
23515           "movapd %%xmm12, %%xmm6\n"
23516           "addpd %%xmm14, %%xmm4\n"
23517           "subpd %%xmm14, %%xmm6\n"
23518           "movapd %%xmm13, %%xmm5\n"
23519           "movapd %%xmm13, %%xmm7\n"
23520           "addpd %%xmm15, %%xmm5\n"
23521           "subpd %%xmm15, %%xmm7\n"
23522           "movapd %%xmm0, %%xmm8\n"
23523           "movapd %%xmm0, %%xmm12\n"
23524           "addpd %%xmm4, %%xmm8\n"
23525           "subpd %%xmm4, %%xmm12\n"
23526           "movapd %%xmm1, %%xmm9\n"
23527           "movapd %%xmm1, %%xmm13\n"
23528           "addpd %%xmm5, %%xmm9\n"
23529           "subpd %%xmm5, %%xmm13\n"
23530           "movapd %%xmm2, %%xmm10\n"
23531           "movapd %%xmm2, %%xmm14\n"
23532           "addpd %%xmm6, %%xmm10\n"
23533           "subpd %%xmm6, %%xmm14\n"
23534           "movapd %%xmm3, %%xmm11\n"
23535           "movapd %%xmm3, %%xmm15\n"
23536           "addpd %%xmm7, %%xmm11\n"
23537           "subpd %%xmm7, %%xmm15\n"
23538           "movupd %%xmm8, (%0)\n"
23539           "movupd %%xmm9, (%1)\n"
23540           "movupd %%xmm10, (%2)\n"
23541           "movupd %%xmm11, (%3)\n"
23542           "movupd %%xmm12, (%4)\n"
23543           "movupd %%xmm13, (%5)\n"
23544           "movupd %%xmm14, (%6)\n"
23545           "movupd %%xmm15, (%7)\n"
23546           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
23547         );
23548       }
23549     }
23550     return;
23551   }
23552   if (depth == 21) {
23553     helper_double_27_recursive(buf + 0, 18);
23554     helper_double_27_recursive(buf + 262144, 18);
23555     helper_double_27_recursive(buf + 524288, 18);
23556     helper_double_27_recursive(buf + 786432, 18);
23557     helper_double_27_recursive(buf + 1048576, 18);
23558     helper_double_27_recursive(buf + 1310720, 18);
23559     helper_double_27_recursive(buf + 1572864, 18);
23560     helper_double_27_recursive(buf + 1835008, 18);
23561     for (int j = 0; j < 2097152; j += 2097152) {
23562       for (int k = 0; k < 262144; k += 2) {
23563         __asm__ volatile (
23564           "movupd (%0), %%xmm0\n"
23565           "movupd (%1), %%xmm1\n"
23566           "movupd (%2), %%xmm2\n"
23567           "movupd (%3), %%xmm3\n"
23568           "movupd (%4), %%xmm4\n"
23569           "movupd (%5), %%xmm5\n"
23570           "movupd (%6), %%xmm6\n"
23571           "movupd (%7), %%xmm7\n"
23572           "movapd %%xmm0, %%xmm8\n"
23573           "movapd %%xmm0, %%xmm9\n"
23574           "addpd %%xmm1, %%xmm8\n"
23575           "subpd %%xmm1, %%xmm9\n"
23576           "movapd %%xmm2, %%xmm10\n"
23577           "movapd %%xmm2, %%xmm11\n"
23578           "addpd %%xmm3, %%xmm10\n"
23579           "subpd %%xmm3, %%xmm11\n"
23580           "movapd %%xmm4, %%xmm12\n"
23581           "movapd %%xmm4, %%xmm13\n"
23582           "addpd %%xmm5, %%xmm12\n"
23583           "subpd %%xmm5, %%xmm13\n"
23584           "movapd %%xmm6, %%xmm14\n"
23585           "movapd %%xmm6, %%xmm15\n"
23586           "addpd %%xmm7, %%xmm14\n"
23587           "subpd %%xmm7, %%xmm15\n"
23588           "movapd %%xmm8, %%xmm0\n"
23589           "movapd %%xmm8, %%xmm2\n"
23590           "addpd %%xmm10, %%xmm0\n"
23591           "subpd %%xmm10, %%xmm2\n"
23592           "movapd %%xmm9, %%xmm1\n"
23593           "movapd %%xmm9, %%xmm3\n"
23594           "addpd %%xmm11, %%xmm1\n"
23595           "subpd %%xmm11, %%xmm3\n"
23596           "movapd %%xmm12, %%xmm4\n"
23597           "movapd %%xmm12, %%xmm6\n"
23598           "addpd %%xmm14, %%xmm4\n"
23599           "subpd %%xmm14, %%xmm6\n"
23600           "movapd %%xmm13, %%xmm5\n"
23601           "movapd %%xmm13, %%xmm7\n"
23602           "addpd %%xmm15, %%xmm5\n"
23603           "subpd %%xmm15, %%xmm7\n"
23604           "movapd %%xmm0, %%xmm8\n"
23605           "movapd %%xmm0, %%xmm12\n"
23606           "addpd %%xmm4, %%xmm8\n"
23607           "subpd %%xmm4, %%xmm12\n"
23608           "movapd %%xmm1, %%xmm9\n"
23609           "movapd %%xmm1, %%xmm13\n"
23610           "addpd %%xmm5, %%xmm9\n"
23611           "subpd %%xmm5, %%xmm13\n"
23612           "movapd %%xmm2, %%xmm10\n"
23613           "movapd %%xmm2, %%xmm14\n"
23614           "addpd %%xmm6, %%xmm10\n"
23615           "subpd %%xmm6, %%xmm14\n"
23616           "movapd %%xmm3, %%xmm11\n"
23617           "movapd %%xmm3, %%xmm15\n"
23618           "addpd %%xmm7, %%xmm11\n"
23619           "subpd %%xmm7, %%xmm15\n"
23620           "movupd %%xmm8, (%0)\n"
23621           "movupd %%xmm9, (%1)\n"
23622           "movupd %%xmm10, (%2)\n"
23623           "movupd %%xmm11, (%3)\n"
23624           "movupd %%xmm12, (%4)\n"
23625           "movupd %%xmm13, (%5)\n"
23626           "movupd %%xmm14, (%6)\n"
23627           "movupd %%xmm15, (%7)\n"
23628           :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
23629         );
23630       }
23631     }
23632     return;
23633   }
23634   if (depth == 24) {
23635     helper_double_27_recursive(buf + 0, 21);
23636     helper_double_27_recursive(buf + 2097152, 21);
23637     helper_double_27_recursive(buf + 4194304, 21);
23638     helper_double_27_recursive(buf + 6291456, 21);
23639     helper_double_27_recursive(buf + 8388608, 21);
23640     helper_double_27_recursive(buf + 10485760, 21);
23641     helper_double_27_recursive(buf + 12582912, 21);
23642     helper_double_27_recursive(buf + 14680064, 21);
23643     for (int j = 0; j < 16777216; j += 16777216) {
23644       for (int k = 0; k < 2097152; k += 2) {
23645         __asm__ volatile (
23646           "movupd (%0), %%xmm0\n"
23647           "movupd (%1), %%xmm1\n"
23648           "movupd (%2), %%xmm2\n"
23649           "movupd (%3), %%xmm3\n"
23650           "movupd (%4), %%xmm4\n"
23651           "movupd (%5), %%xmm5\n"
23652           "movupd (%6), %%xmm6\n"
23653           "movupd (%7), %%xmm7\n"
23654           "movapd %%xmm0, %%xmm8\n"
23655           "movapd %%xmm0, %%xmm9\n"
23656           "addpd %%xmm1, %%xmm8\n"
23657           "subpd %%xmm1, %%xmm9\n"
23658           "movapd %%xmm2, %%xmm10\n"
23659           "movapd %%xmm2, %%xmm11\n"
23660           "addpd %%xmm3, %%xmm10\n"
23661           "subpd %%xmm3, %%xmm11\n"
23662           "movapd %%xmm4, %%xmm12\n"
23663           "movapd %%xmm4, %%xmm13\n"
23664           "addpd %%xmm5, %%xmm12\n"
23665           "subpd %%xmm5, %%xmm13\n"
23666           "movapd %%xmm6, %%xmm14\n"
23667           "movapd %%xmm6, %%xmm15\n"
23668           "addpd %%xmm7, %%xmm14\n"
23669           "subpd %%xmm7, %%xmm15\n"
23670           "movapd %%xmm8, %%xmm0\n"
23671           "movapd %%xmm8, %%xmm2\n"
23672           "addpd %%xmm10, %%xmm0\n"
23673           "subpd %%xmm10, %%xmm2\n"
23674           "movapd %%xmm9, %%xmm1\n"
23675           "movapd %%xmm9, %%xmm3\n"
23676           "addpd %%xmm11, %%xmm1\n"
23677           "subpd %%xmm11, %%xmm3\n"
23678           "movapd %%xmm12, %%xmm4\n"
23679           "movapd %%xmm12, %%xmm6\n"
23680           "addpd %%xmm14, %%xmm4\n"
23681           "subpd %%xmm14, %%xmm6\n"
23682           "movapd %%xmm13, %%xmm5\n"
23683           "movapd %%xmm13, %%xmm7\n"
23684           "addpd %%xmm15, %%xmm5\n"
23685           "subpd %%xmm15, %%xmm7\n"
23686           "movapd %%xmm0, %%xmm8\n"
23687           "movapd %%xmm0, %%xmm12\n"
23688           "addpd %%xmm4, %%xmm8\n"
23689           "subpd %%xmm4, %%xmm12\n"
23690           "movapd %%xmm1, %%xmm9\n"
23691           "movapd %%xmm1, %%xmm13\n"
23692           "addpd %%xmm5, %%xmm9\n"
23693           "subpd %%xmm5, %%xmm13\n"
23694           "movapd %%xmm2, %%xmm10\n"
23695           "movapd %%xmm2, %%xmm14\n"
23696           "addpd %%xmm6, %%xmm10\n"
23697           "subpd %%xmm6, %%xmm14\n"
23698           "movapd %%xmm3, %%xmm11\n"
23699           "movapd %%xmm3, %%xmm15\n"
23700           "addpd %%xmm7, %%xmm11\n"
23701           "subpd %%xmm7, %%xmm15\n"
23702           "movupd %%xmm8, (%0)\n"
23703           "movupd %%xmm9, (%1)\n"
23704           "movupd %%xmm10, (%2)\n"
23705           "movupd %%xmm11, (%3)\n"
23706           "movupd %%xmm12, (%4)\n"
23707           "movupd %%xmm13, (%5)\n"
23708           "movupd %%xmm14, (%6)\n"
23709           "movupd %%xmm15, (%7)\n"
23710           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
23711         );
23712       }
23713     }
23714     return;
23715   }
23716   if (depth == 27) {
23717     helper_double_27_recursive(buf + 0, 24);
23718     helper_double_27_recursive(buf + 16777216, 24);
23719     helper_double_27_recursive(buf + 33554432, 24);
23720     helper_double_27_recursive(buf + 50331648, 24);
23721     helper_double_27_recursive(buf + 67108864, 24);
23722     helper_double_27_recursive(buf + 83886080, 24);
23723     helper_double_27_recursive(buf + 100663296, 24);
23724     helper_double_27_recursive(buf + 117440512, 24);
23725     for (int j = 0; j < 134217728; j += 134217728) {
23726       for (int k = 0; k < 16777216; k += 2) {
23727         __asm__ volatile (
23728           "movupd (%0), %%xmm0\n"
23729           "movupd (%1), %%xmm1\n"
23730           "movupd (%2), %%xmm2\n"
23731           "movupd (%3), %%xmm3\n"
23732           "movupd (%4), %%xmm4\n"
23733           "movupd (%5), %%xmm5\n"
23734           "movupd (%6), %%xmm6\n"
23735           "movupd (%7), %%xmm7\n"
23736           "movapd %%xmm0, %%xmm8\n"
23737           "movapd %%xmm0, %%xmm9\n"
23738           "addpd %%xmm1, %%xmm8\n"
23739           "subpd %%xmm1, %%xmm9\n"
23740           "movapd %%xmm2, %%xmm10\n"
23741           "movapd %%xmm2, %%xmm11\n"
23742           "addpd %%xmm3, %%xmm10\n"
23743           "subpd %%xmm3, %%xmm11\n"
23744           "movapd %%xmm4, %%xmm12\n"
23745           "movapd %%xmm4, %%xmm13\n"
23746           "addpd %%xmm5, %%xmm12\n"
23747           "subpd %%xmm5, %%xmm13\n"
23748           "movapd %%xmm6, %%xmm14\n"
23749           "movapd %%xmm6, %%xmm15\n"
23750           "addpd %%xmm7, %%xmm14\n"
23751           "subpd %%xmm7, %%xmm15\n"
23752           "movapd %%xmm8, %%xmm0\n"
23753           "movapd %%xmm8, %%xmm2\n"
23754           "addpd %%xmm10, %%xmm0\n"
23755           "subpd %%xmm10, %%xmm2\n"
23756           "movapd %%xmm9, %%xmm1\n"
23757           "movapd %%xmm9, %%xmm3\n"
23758           "addpd %%xmm11, %%xmm1\n"
23759           "subpd %%xmm11, %%xmm3\n"
23760           "movapd %%xmm12, %%xmm4\n"
23761           "movapd %%xmm12, %%xmm6\n"
23762           "addpd %%xmm14, %%xmm4\n"
23763           "subpd %%xmm14, %%xmm6\n"
23764           "movapd %%xmm13, %%xmm5\n"
23765           "movapd %%xmm13, %%xmm7\n"
23766           "addpd %%xmm15, %%xmm5\n"
23767           "subpd %%xmm15, %%xmm7\n"
23768           "movapd %%xmm0, %%xmm8\n"
23769           "movapd %%xmm0, %%xmm12\n"
23770           "addpd %%xmm4, %%xmm8\n"
23771           "subpd %%xmm4, %%xmm12\n"
23772           "movapd %%xmm1, %%xmm9\n"
23773           "movapd %%xmm1, %%xmm13\n"
23774           "addpd %%xmm5, %%xmm9\n"
23775           "subpd %%xmm5, %%xmm13\n"
23776           "movapd %%xmm2, %%xmm10\n"
23777           "movapd %%xmm2, %%xmm14\n"
23778           "addpd %%xmm6, %%xmm10\n"
23779           "subpd %%xmm6, %%xmm14\n"
23780           "movapd %%xmm3, %%xmm11\n"
23781           "movapd %%xmm3, %%xmm15\n"
23782           "addpd %%xmm7, %%xmm11\n"
23783           "subpd %%xmm7, %%xmm15\n"
23784           "movupd %%xmm8, (%0)\n"
23785           "movupd %%xmm9, (%1)\n"
23786           "movupd %%xmm10, (%2)\n"
23787           "movupd %%xmm11, (%3)\n"
23788           "movupd %%xmm12, (%4)\n"
23789           "movupd %%xmm13, (%5)\n"
23790           "movupd %%xmm14, (%6)\n"
23791           "movupd %%xmm15, (%7)\n"
23792           :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
23793         );
23794       }
23795     }
23796     return;
23797   }
23798 }
void helper_double_27(double *buf);
/* Entry point for a size-2^27 Hadamard transform on doubles.
 * `buf` must hold 134217728 (2^27) elements; the work is done in place
 * by the recursive driver, starting at the full depth of 27. */
void helper_double_27(double *buf) {
  const int log_n = 27;  /* transform size is 2^27 */
  helper_double_27_recursive(buf, log_n);
}
23803 void helper_double_28_recursive(double *buf, int depth);
helper_double_28_recursive(double * buf,int depth)23804 void helper_double_28_recursive(double *buf, int depth) {
23805   if (depth == 14) {
23806     for (int j = 0; j < 16384; j += 16) {
23807       for (int k = 0; k < 2; k += 2) {
23808         __asm__ volatile (
23809           "movupd (%0), %%xmm0\n"
23810           "movupd (%1), %%xmm1\n"
23811           "movupd (%2), %%xmm2\n"
23812           "movupd (%3), %%xmm3\n"
23813           "movupd (%4), %%xmm4\n"
23814           "movupd (%5), %%xmm5\n"
23815           "movupd (%6), %%xmm6\n"
23816           "movupd (%7), %%xmm7\n"
23817           "movapd %%xmm0, %%xmm8\n"
23818           "haddpd %%xmm8, %%xmm8\n"
23819           "movapd %%xmm0, %%xmm9\n"
23820           "hsubpd %%xmm9, %%xmm9\n"
23821           "blendpd $1, %%xmm8, %%xmm9\n"
23822           "movapd %%xmm9, %%xmm0\n"
23823           "movapd %%xmm1, %%xmm8\n"
23824           "haddpd %%xmm8, %%xmm8\n"
23825           "movapd %%xmm1, %%xmm9\n"
23826           "hsubpd %%xmm9, %%xmm9\n"
23827           "blendpd $1, %%xmm8, %%xmm9\n"
23828           "movapd %%xmm9, %%xmm1\n"
23829           "movapd %%xmm2, %%xmm8\n"
23830           "haddpd %%xmm8, %%xmm8\n"
23831           "movapd %%xmm2, %%xmm9\n"
23832           "hsubpd %%xmm9, %%xmm9\n"
23833           "blendpd $1, %%xmm8, %%xmm9\n"
23834           "movapd %%xmm9, %%xmm2\n"
23835           "movapd %%xmm3, %%xmm8\n"
23836           "haddpd %%xmm8, %%xmm8\n"
23837           "movapd %%xmm3, %%xmm9\n"
23838           "hsubpd %%xmm9, %%xmm9\n"
23839           "blendpd $1, %%xmm8, %%xmm9\n"
23840           "movapd %%xmm9, %%xmm3\n"
23841           "movapd %%xmm4, %%xmm8\n"
23842           "haddpd %%xmm8, %%xmm8\n"
23843           "movapd %%xmm4, %%xmm9\n"
23844           "hsubpd %%xmm9, %%xmm9\n"
23845           "blendpd $1, %%xmm8, %%xmm9\n"
23846           "movapd %%xmm9, %%xmm4\n"
23847           "movapd %%xmm5, %%xmm8\n"
23848           "haddpd %%xmm8, %%xmm8\n"
23849           "movapd %%xmm5, %%xmm9\n"
23850           "hsubpd %%xmm9, %%xmm9\n"
23851           "blendpd $1, %%xmm8, %%xmm9\n"
23852           "movapd %%xmm9, %%xmm5\n"
23853           "movapd %%xmm6, %%xmm8\n"
23854           "haddpd %%xmm8, %%xmm8\n"
23855           "movapd %%xmm6, %%xmm9\n"
23856           "hsubpd %%xmm9, %%xmm9\n"
23857           "blendpd $1, %%xmm8, %%xmm9\n"
23858           "movapd %%xmm9, %%xmm6\n"
23859           "movapd %%xmm7, %%xmm8\n"
23860           "haddpd %%xmm8, %%xmm8\n"
23861           "movapd %%xmm7, %%xmm9\n"
23862           "hsubpd %%xmm9, %%xmm9\n"
23863           "blendpd $1, %%xmm8, %%xmm9\n"
23864           "movapd %%xmm9, %%xmm7\n"
23865           "movapd %%xmm0, %%xmm8\n"
23866           "movapd %%xmm0, %%xmm9\n"
23867           "addpd %%xmm1, %%xmm8\n"
23868           "subpd %%xmm1, %%xmm9\n"
23869           "movapd %%xmm2, %%xmm10\n"
23870           "movapd %%xmm2, %%xmm11\n"
23871           "addpd %%xmm3, %%xmm10\n"
23872           "subpd %%xmm3, %%xmm11\n"
23873           "movapd %%xmm4, %%xmm12\n"
23874           "movapd %%xmm4, %%xmm13\n"
23875           "addpd %%xmm5, %%xmm12\n"
23876           "subpd %%xmm5, %%xmm13\n"
23877           "movapd %%xmm6, %%xmm14\n"
23878           "movapd %%xmm6, %%xmm15\n"
23879           "addpd %%xmm7, %%xmm14\n"
23880           "subpd %%xmm7, %%xmm15\n"
23881           "movapd %%xmm8, %%xmm0\n"
23882           "movapd %%xmm8, %%xmm2\n"
23883           "addpd %%xmm10, %%xmm0\n"
23884           "subpd %%xmm10, %%xmm2\n"
23885           "movapd %%xmm9, %%xmm1\n"
23886           "movapd %%xmm9, %%xmm3\n"
23887           "addpd %%xmm11, %%xmm1\n"
23888           "subpd %%xmm11, %%xmm3\n"
23889           "movapd %%xmm12, %%xmm4\n"
23890           "movapd %%xmm12, %%xmm6\n"
23891           "addpd %%xmm14, %%xmm4\n"
23892           "subpd %%xmm14, %%xmm6\n"
23893           "movapd %%xmm13, %%xmm5\n"
23894           "movapd %%xmm13, %%xmm7\n"
23895           "addpd %%xmm15, %%xmm5\n"
23896           "subpd %%xmm15, %%xmm7\n"
23897           "movapd %%xmm0, %%xmm8\n"
23898           "movapd %%xmm0, %%xmm12\n"
23899           "addpd %%xmm4, %%xmm8\n"
23900           "subpd %%xmm4, %%xmm12\n"
23901           "movapd %%xmm1, %%xmm9\n"
23902           "movapd %%xmm1, %%xmm13\n"
23903           "addpd %%xmm5, %%xmm9\n"
23904           "subpd %%xmm5, %%xmm13\n"
23905           "movapd %%xmm2, %%xmm10\n"
23906           "movapd %%xmm2, %%xmm14\n"
23907           "addpd %%xmm6, %%xmm10\n"
23908           "subpd %%xmm6, %%xmm14\n"
23909           "movapd %%xmm3, %%xmm11\n"
23910           "movapd %%xmm3, %%xmm15\n"
23911           "addpd %%xmm7, %%xmm11\n"
23912           "subpd %%xmm7, %%xmm15\n"
23913           "movupd %%xmm8, (%0)\n"
23914           "movupd %%xmm9, (%1)\n"
23915           "movupd %%xmm10, (%2)\n"
23916           "movupd %%xmm11, (%3)\n"
23917           "movupd %%xmm12, (%4)\n"
23918           "movupd %%xmm13, (%5)\n"
23919           "movupd %%xmm14, (%6)\n"
23920           "movupd %%xmm15, (%7)\n"
23921           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
23922         );
23923       }
23924     }
23925     for (int j = 0; j < 16384; j += 128) {
23926       for (int k = 0; k < 16; k += 2) {
23927         __asm__ volatile (
23928           "movupd (%0), %%xmm0\n"
23929           "movupd (%1), %%xmm1\n"
23930           "movupd (%2), %%xmm2\n"
23931           "movupd (%3), %%xmm3\n"
23932           "movupd (%4), %%xmm4\n"
23933           "movupd (%5), %%xmm5\n"
23934           "movupd (%6), %%xmm6\n"
23935           "movupd (%7), %%xmm7\n"
23936           "movapd %%xmm0, %%xmm8\n"
23937           "movapd %%xmm0, %%xmm9\n"
23938           "addpd %%xmm1, %%xmm8\n"
23939           "subpd %%xmm1, %%xmm9\n"
23940           "movapd %%xmm2, %%xmm10\n"
23941           "movapd %%xmm2, %%xmm11\n"
23942           "addpd %%xmm3, %%xmm10\n"
23943           "subpd %%xmm3, %%xmm11\n"
23944           "movapd %%xmm4, %%xmm12\n"
23945           "movapd %%xmm4, %%xmm13\n"
23946           "addpd %%xmm5, %%xmm12\n"
23947           "subpd %%xmm5, %%xmm13\n"
23948           "movapd %%xmm6, %%xmm14\n"
23949           "movapd %%xmm6, %%xmm15\n"
23950           "addpd %%xmm7, %%xmm14\n"
23951           "subpd %%xmm7, %%xmm15\n"
23952           "movapd %%xmm8, %%xmm0\n"
23953           "movapd %%xmm8, %%xmm2\n"
23954           "addpd %%xmm10, %%xmm0\n"
23955           "subpd %%xmm10, %%xmm2\n"
23956           "movapd %%xmm9, %%xmm1\n"
23957           "movapd %%xmm9, %%xmm3\n"
23958           "addpd %%xmm11, %%xmm1\n"
23959           "subpd %%xmm11, %%xmm3\n"
23960           "movapd %%xmm12, %%xmm4\n"
23961           "movapd %%xmm12, %%xmm6\n"
23962           "addpd %%xmm14, %%xmm4\n"
23963           "subpd %%xmm14, %%xmm6\n"
23964           "movapd %%xmm13, %%xmm5\n"
23965           "movapd %%xmm13, %%xmm7\n"
23966           "addpd %%xmm15, %%xmm5\n"
23967           "subpd %%xmm15, %%xmm7\n"
23968           "movapd %%xmm0, %%xmm8\n"
23969           "movapd %%xmm0, %%xmm12\n"
23970           "addpd %%xmm4, %%xmm8\n"
23971           "subpd %%xmm4, %%xmm12\n"
23972           "movapd %%xmm1, %%xmm9\n"
23973           "movapd %%xmm1, %%xmm13\n"
23974           "addpd %%xmm5, %%xmm9\n"
23975           "subpd %%xmm5, %%xmm13\n"
23976           "movapd %%xmm2, %%xmm10\n"
23977           "movapd %%xmm2, %%xmm14\n"
23978           "addpd %%xmm6, %%xmm10\n"
23979           "subpd %%xmm6, %%xmm14\n"
23980           "movapd %%xmm3, %%xmm11\n"
23981           "movapd %%xmm3, %%xmm15\n"
23982           "addpd %%xmm7, %%xmm11\n"
23983           "subpd %%xmm7, %%xmm15\n"
23984           "movupd %%xmm8, (%0)\n"
23985           "movupd %%xmm9, (%1)\n"
23986           "movupd %%xmm10, (%2)\n"
23987           "movupd %%xmm11, (%3)\n"
23988           "movupd %%xmm12, (%4)\n"
23989           "movupd %%xmm13, (%5)\n"
23990           "movupd %%xmm14, (%6)\n"
23991           "movupd %%xmm15, (%7)\n"
23992           :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
23993         );
23994       }
23995     }
23996     for (int j = 0; j < 16384; j += 1024) {
23997       for (int k = 0; k < 128; k += 2) {
23998         __asm__ volatile (
23999           "movupd (%0), %%xmm0\n"
24000           "movupd (%1), %%xmm1\n"
24001           "movupd (%2), %%xmm2\n"
24002           "movupd (%3), %%xmm3\n"
24003           "movupd (%4), %%xmm4\n"
24004           "movupd (%5), %%xmm5\n"
24005           "movupd (%6), %%xmm6\n"
24006           "movupd (%7), %%xmm7\n"
24007           "movapd %%xmm0, %%xmm8\n"
24008           "movapd %%xmm0, %%xmm9\n"
24009           "addpd %%xmm1, %%xmm8\n"
24010           "subpd %%xmm1, %%xmm9\n"
24011           "movapd %%xmm2, %%xmm10\n"
24012           "movapd %%xmm2, %%xmm11\n"
24013           "addpd %%xmm3, %%xmm10\n"
24014           "subpd %%xmm3, %%xmm11\n"
24015           "movapd %%xmm4, %%xmm12\n"
24016           "movapd %%xmm4, %%xmm13\n"
24017           "addpd %%xmm5, %%xmm12\n"
24018           "subpd %%xmm5, %%xmm13\n"
24019           "movapd %%xmm6, %%xmm14\n"
24020           "movapd %%xmm6, %%xmm15\n"
24021           "addpd %%xmm7, %%xmm14\n"
24022           "subpd %%xmm7, %%xmm15\n"
24023           "movapd %%xmm8, %%xmm0\n"
24024           "movapd %%xmm8, %%xmm2\n"
24025           "addpd %%xmm10, %%xmm0\n"
24026           "subpd %%xmm10, %%xmm2\n"
24027           "movapd %%xmm9, %%xmm1\n"
24028           "movapd %%xmm9, %%xmm3\n"
24029           "addpd %%xmm11, %%xmm1\n"
24030           "subpd %%xmm11, %%xmm3\n"
24031           "movapd %%xmm12, %%xmm4\n"
24032           "movapd %%xmm12, %%xmm6\n"
24033           "addpd %%xmm14, %%xmm4\n"
24034           "subpd %%xmm14, %%xmm6\n"
24035           "movapd %%xmm13, %%xmm5\n"
24036           "movapd %%xmm13, %%xmm7\n"
24037           "addpd %%xmm15, %%xmm5\n"
24038           "subpd %%xmm15, %%xmm7\n"
24039           "movapd %%xmm0, %%xmm8\n"
24040           "movapd %%xmm0, %%xmm12\n"
24041           "addpd %%xmm4, %%xmm8\n"
24042           "subpd %%xmm4, %%xmm12\n"
24043           "movapd %%xmm1, %%xmm9\n"
24044           "movapd %%xmm1, %%xmm13\n"
24045           "addpd %%xmm5, %%xmm9\n"
24046           "subpd %%xmm5, %%xmm13\n"
24047           "movapd %%xmm2, %%xmm10\n"
24048           "movapd %%xmm2, %%xmm14\n"
24049           "addpd %%xmm6, %%xmm10\n"
24050           "subpd %%xmm6, %%xmm14\n"
24051           "movapd %%xmm3, %%xmm11\n"
24052           "movapd %%xmm3, %%xmm15\n"
24053           "addpd %%xmm7, %%xmm11\n"
24054           "subpd %%xmm7, %%xmm15\n"
24055           "movupd %%xmm8, (%0)\n"
24056           "movupd %%xmm9, (%1)\n"
24057           "movupd %%xmm10, (%2)\n"
24058           "movupd %%xmm11, (%3)\n"
24059           "movupd %%xmm12, (%4)\n"
24060           "movupd %%xmm13, (%5)\n"
24061           "movupd %%xmm14, (%6)\n"
24062           "movupd %%xmm15, (%7)\n"
24063           :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
24064         );
24065       }
24066     }
24067     for (int j = 0; j < 16384; j += 8192) {
24068       for (int k = 0; k < 1024; k += 2) {
24069         __asm__ volatile (
24070           "movupd (%0), %%xmm0\n"
24071           "movupd (%1), %%xmm1\n"
24072           "movupd (%2), %%xmm2\n"
24073           "movupd (%3), %%xmm3\n"
24074           "movupd (%4), %%xmm4\n"
24075           "movupd (%5), %%xmm5\n"
24076           "movupd (%6), %%xmm6\n"
24077           "movupd (%7), %%xmm7\n"
24078           "movapd %%xmm0, %%xmm8\n"
24079           "movapd %%xmm0, %%xmm9\n"
24080           "addpd %%xmm1, %%xmm8\n"
24081           "subpd %%xmm1, %%xmm9\n"
24082           "movapd %%xmm2, %%xmm10\n"
24083           "movapd %%xmm2, %%xmm11\n"
24084           "addpd %%xmm3, %%xmm10\n"
24085           "subpd %%xmm3, %%xmm11\n"
24086           "movapd %%xmm4, %%xmm12\n"
24087           "movapd %%xmm4, %%xmm13\n"
24088           "addpd %%xmm5, %%xmm12\n"
24089           "subpd %%xmm5, %%xmm13\n"
24090           "movapd %%xmm6, %%xmm14\n"
24091           "movapd %%xmm6, %%xmm15\n"
24092           "addpd %%xmm7, %%xmm14\n"
24093           "subpd %%xmm7, %%xmm15\n"
24094           "movapd %%xmm8, %%xmm0\n"
24095           "movapd %%xmm8, %%xmm2\n"
24096           "addpd %%xmm10, %%xmm0\n"
24097           "subpd %%xmm10, %%xmm2\n"
24098           "movapd %%xmm9, %%xmm1\n"
24099           "movapd %%xmm9, %%xmm3\n"
24100           "addpd %%xmm11, %%xmm1\n"
24101           "subpd %%xmm11, %%xmm3\n"
24102           "movapd %%xmm12, %%xmm4\n"
24103           "movapd %%xmm12, %%xmm6\n"
24104           "addpd %%xmm14, %%xmm4\n"
24105           "subpd %%xmm14, %%xmm6\n"
24106           "movapd %%xmm13, %%xmm5\n"
24107           "movapd %%xmm13, %%xmm7\n"
24108           "addpd %%xmm15, %%xmm5\n"
24109           "subpd %%xmm15, %%xmm7\n"
24110           "movapd %%xmm0, %%xmm8\n"
24111           "movapd %%xmm0, %%xmm12\n"
24112           "addpd %%xmm4, %%xmm8\n"
24113           "subpd %%xmm4, %%xmm12\n"
24114           "movapd %%xmm1, %%xmm9\n"
24115           "movapd %%xmm1, %%xmm13\n"
24116           "addpd %%xmm5, %%xmm9\n"
24117           "subpd %%xmm5, %%xmm13\n"
24118           "movapd %%xmm2, %%xmm10\n"
24119           "movapd %%xmm2, %%xmm14\n"
24120           "addpd %%xmm6, %%xmm10\n"
24121           "subpd %%xmm6, %%xmm14\n"
24122           "movapd %%xmm3, %%xmm11\n"
24123           "movapd %%xmm3, %%xmm15\n"
24124           "addpd %%xmm7, %%xmm11\n"
24125           "subpd %%xmm7, %%xmm15\n"
24126           "movupd %%xmm8, (%0)\n"
24127           "movupd %%xmm9, (%1)\n"
24128           "movupd %%xmm10, (%2)\n"
24129           "movupd %%xmm11, (%3)\n"
24130           "movupd %%xmm12, (%4)\n"
24131           "movupd %%xmm13, (%5)\n"
24132           "movupd %%xmm14, (%6)\n"
24133           "movupd %%xmm15, (%7)\n"
24134           :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
24135         );
24136       }
24137     }
24138     for (int j = 0; j < 16384; j += 16384) {
24139       for (int k = 0; k < 8192; k += 2) {
24140         __asm__ volatile (
24141           "movupd (%0), %%xmm0\n"
24142           "movupd (%1), %%xmm1\n"
24143           "movapd %%xmm0, %%xmm8\n"
24144           "movapd %%xmm0, %%xmm9\n"
24145           "addpd %%xmm1, %%xmm8\n"
24146           "subpd %%xmm1, %%xmm9\n"
24147           "movupd %%xmm8, (%0)\n"
24148           "movupd %%xmm9, (%1)\n"
24149           :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
24150         );
24151       }
24152     }
24153     return;
24154   }
24155   if (depth == 17) {
24156     helper_double_28_recursive(buf + 0, 14);
24157     helper_double_28_recursive(buf + 16384, 14);
24158     helper_double_28_recursive(buf + 32768, 14);
24159     helper_double_28_recursive(buf + 49152, 14);
24160     helper_double_28_recursive(buf + 65536, 14);
24161     helper_double_28_recursive(buf + 81920, 14);
24162     helper_double_28_recursive(buf + 98304, 14);
24163     helper_double_28_recursive(buf + 114688, 14);
24164     for (int j = 0; j < 131072; j += 131072) {
24165       for (int k = 0; k < 16384; k += 2) {
24166         __asm__ volatile (
24167           "movupd (%0), %%xmm0\n"
24168           "movupd (%1), %%xmm1\n"
24169           "movupd (%2), %%xmm2\n"
24170           "movupd (%3), %%xmm3\n"
24171           "movupd (%4), %%xmm4\n"
24172           "movupd (%5), %%xmm5\n"
24173           "movupd (%6), %%xmm6\n"
24174           "movupd (%7), %%xmm7\n"
24175           "movapd %%xmm0, %%xmm8\n"
24176           "movapd %%xmm0, %%xmm9\n"
24177           "addpd %%xmm1, %%xmm8\n"
24178           "subpd %%xmm1, %%xmm9\n"
24179           "movapd %%xmm2, %%xmm10\n"
24180           "movapd %%xmm2, %%xmm11\n"
24181           "addpd %%xmm3, %%xmm10\n"
24182           "subpd %%xmm3, %%xmm11\n"
24183           "movapd %%xmm4, %%xmm12\n"
24184           "movapd %%xmm4, %%xmm13\n"
24185           "addpd %%xmm5, %%xmm12\n"
24186           "subpd %%xmm5, %%xmm13\n"
24187           "movapd %%xmm6, %%xmm14\n"
24188           "movapd %%xmm6, %%xmm15\n"
24189           "addpd %%xmm7, %%xmm14\n"
24190           "subpd %%xmm7, %%xmm15\n"
24191           "movapd %%xmm8, %%xmm0\n"
24192           "movapd %%xmm8, %%xmm2\n"
24193           "addpd %%xmm10, %%xmm0\n"
24194           "subpd %%xmm10, %%xmm2\n"
24195           "movapd %%xmm9, %%xmm1\n"
24196           "movapd %%xmm9, %%xmm3\n"
24197           "addpd %%xmm11, %%xmm1\n"
24198           "subpd %%xmm11, %%xmm3\n"
24199           "movapd %%xmm12, %%xmm4\n"
24200           "movapd %%xmm12, %%xmm6\n"
24201           "addpd %%xmm14, %%xmm4\n"
24202           "subpd %%xmm14, %%xmm6\n"
24203           "movapd %%xmm13, %%xmm5\n"
24204           "movapd %%xmm13, %%xmm7\n"
24205           "addpd %%xmm15, %%xmm5\n"
24206           "subpd %%xmm15, %%xmm7\n"
24207           "movapd %%xmm0, %%xmm8\n"
24208           "movapd %%xmm0, %%xmm12\n"
24209           "addpd %%xmm4, %%xmm8\n"
24210           "subpd %%xmm4, %%xmm12\n"
24211           "movapd %%xmm1, %%xmm9\n"
24212           "movapd %%xmm1, %%xmm13\n"
24213           "addpd %%xmm5, %%xmm9\n"
24214           "subpd %%xmm5, %%xmm13\n"
24215           "movapd %%xmm2, %%xmm10\n"
24216           "movapd %%xmm2, %%xmm14\n"
24217           "addpd %%xmm6, %%xmm10\n"
24218           "subpd %%xmm6, %%xmm14\n"
24219           "movapd %%xmm3, %%xmm11\n"
24220           "movapd %%xmm3, %%xmm15\n"
24221           "addpd %%xmm7, %%xmm11\n"
24222           "subpd %%xmm7, %%xmm15\n"
24223           "movupd %%xmm8, (%0)\n"
24224           "movupd %%xmm9, (%1)\n"
24225           "movupd %%xmm10, (%2)\n"
24226           "movupd %%xmm11, (%3)\n"
24227           "movupd %%xmm12, (%4)\n"
24228           "movupd %%xmm13, (%5)\n"
24229           "movupd %%xmm14, (%6)\n"
24230           "movupd %%xmm15, (%7)\n"
24231           :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
24232         );
24233       }
24234     }
24235     return;
24236   }
24237   if (depth == 20) {
24238     helper_double_28_recursive(buf + 0, 17);
24239     helper_double_28_recursive(buf + 131072, 17);
24240     helper_double_28_recursive(buf + 262144, 17);
24241     helper_double_28_recursive(buf + 393216, 17);
24242     helper_double_28_recursive(buf + 524288, 17);
24243     helper_double_28_recursive(buf + 655360, 17);
24244     helper_double_28_recursive(buf + 786432, 17);
24245     helper_double_28_recursive(buf + 917504, 17);
24246     for (int j = 0; j < 1048576; j += 1048576) {
24247       for (int k = 0; k < 131072; k += 2) {
24248         __asm__ volatile (
24249           "movupd (%0), %%xmm0\n"
24250           "movupd (%1), %%xmm1\n"
24251           "movupd (%2), %%xmm2\n"
24252           "movupd (%3), %%xmm3\n"
24253           "movupd (%4), %%xmm4\n"
24254           "movupd (%5), %%xmm5\n"
24255           "movupd (%6), %%xmm6\n"
24256           "movupd (%7), %%xmm7\n"
24257           "movapd %%xmm0, %%xmm8\n"
24258           "movapd %%xmm0, %%xmm9\n"
24259           "addpd %%xmm1, %%xmm8\n"
24260           "subpd %%xmm1, %%xmm9\n"
24261           "movapd %%xmm2, %%xmm10\n"
24262           "movapd %%xmm2, %%xmm11\n"
24263           "addpd %%xmm3, %%xmm10\n"
24264           "subpd %%xmm3, %%xmm11\n"
24265           "movapd %%xmm4, %%xmm12\n"
24266           "movapd %%xmm4, %%xmm13\n"
24267           "addpd %%xmm5, %%xmm12\n"
24268           "subpd %%xmm5, %%xmm13\n"
24269           "movapd %%xmm6, %%xmm14\n"
24270           "movapd %%xmm6, %%xmm15\n"
24271           "addpd %%xmm7, %%xmm14\n"
24272           "subpd %%xmm7, %%xmm15\n"
24273           "movapd %%xmm8, %%xmm0\n"
24274           "movapd %%xmm8, %%xmm2\n"
24275           "addpd %%xmm10, %%xmm0\n"
24276           "subpd %%xmm10, %%xmm2\n"
24277           "movapd %%xmm9, %%xmm1\n"
24278           "movapd %%xmm9, %%xmm3\n"
24279           "addpd %%xmm11, %%xmm1\n"
24280           "subpd %%xmm11, %%xmm3\n"
24281           "movapd %%xmm12, %%xmm4\n"
24282           "movapd %%xmm12, %%xmm6\n"
24283           "addpd %%xmm14, %%xmm4\n"
24284           "subpd %%xmm14, %%xmm6\n"
24285           "movapd %%xmm13, %%xmm5\n"
24286           "movapd %%xmm13, %%xmm7\n"
24287           "addpd %%xmm15, %%xmm5\n"
24288           "subpd %%xmm15, %%xmm7\n"
24289           "movapd %%xmm0, %%xmm8\n"
24290           "movapd %%xmm0, %%xmm12\n"
24291           "addpd %%xmm4, %%xmm8\n"
24292           "subpd %%xmm4, %%xmm12\n"
24293           "movapd %%xmm1, %%xmm9\n"
24294           "movapd %%xmm1, %%xmm13\n"
24295           "addpd %%xmm5, %%xmm9\n"
24296           "subpd %%xmm5, %%xmm13\n"
24297           "movapd %%xmm2, %%xmm10\n"
24298           "movapd %%xmm2, %%xmm14\n"
24299           "addpd %%xmm6, %%xmm10\n"
24300           "subpd %%xmm6, %%xmm14\n"
24301           "movapd %%xmm3, %%xmm11\n"
24302           "movapd %%xmm3, %%xmm15\n"
24303           "addpd %%xmm7, %%xmm11\n"
24304           "subpd %%xmm7, %%xmm15\n"
24305           "movupd %%xmm8, (%0)\n"
24306           "movupd %%xmm9, (%1)\n"
24307           "movupd %%xmm10, (%2)\n"
24308           "movupd %%xmm11, (%3)\n"
24309           "movupd %%xmm12, (%4)\n"
24310           "movupd %%xmm13, (%5)\n"
24311           "movupd %%xmm14, (%6)\n"
24312           "movupd %%xmm15, (%7)\n"
24313           :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
24314         );
24315       }
24316     }
24317     return;
24318   }
24319   if (depth == 23) {
24320     helper_double_28_recursive(buf + 0, 20);
24321     helper_double_28_recursive(buf + 1048576, 20);
24322     helper_double_28_recursive(buf + 2097152, 20);
24323     helper_double_28_recursive(buf + 3145728, 20);
24324     helper_double_28_recursive(buf + 4194304, 20);
24325     helper_double_28_recursive(buf + 5242880, 20);
24326     helper_double_28_recursive(buf + 6291456, 20);
24327     helper_double_28_recursive(buf + 7340032, 20);
24328     for (int j = 0; j < 8388608; j += 8388608) {
24329       for (int k = 0; k < 1048576; k += 2) {
24330         __asm__ volatile (
24331           "movupd (%0), %%xmm0\n"
24332           "movupd (%1), %%xmm1\n"
24333           "movupd (%2), %%xmm2\n"
24334           "movupd (%3), %%xmm3\n"
24335           "movupd (%4), %%xmm4\n"
24336           "movupd (%5), %%xmm5\n"
24337           "movupd (%6), %%xmm6\n"
24338           "movupd (%7), %%xmm7\n"
24339           "movapd %%xmm0, %%xmm8\n"
24340           "movapd %%xmm0, %%xmm9\n"
24341           "addpd %%xmm1, %%xmm8\n"
24342           "subpd %%xmm1, %%xmm9\n"
24343           "movapd %%xmm2, %%xmm10\n"
24344           "movapd %%xmm2, %%xmm11\n"
24345           "addpd %%xmm3, %%xmm10\n"
24346           "subpd %%xmm3, %%xmm11\n"
24347           "movapd %%xmm4, %%xmm12\n"
24348           "movapd %%xmm4, %%xmm13\n"
24349           "addpd %%xmm5, %%xmm12\n"
24350           "subpd %%xmm5, %%xmm13\n"
24351           "movapd %%xmm6, %%xmm14\n"
24352           "movapd %%xmm6, %%xmm15\n"
24353           "addpd %%xmm7, %%xmm14\n"
24354           "subpd %%xmm7, %%xmm15\n"
24355           "movapd %%xmm8, %%xmm0\n"
24356           "movapd %%xmm8, %%xmm2\n"
24357           "addpd %%xmm10, %%xmm0\n"
24358           "subpd %%xmm10, %%xmm2\n"
24359           "movapd %%xmm9, %%xmm1\n"
24360           "movapd %%xmm9, %%xmm3\n"
24361           "addpd %%xmm11, %%xmm1\n"
24362           "subpd %%xmm11, %%xmm3\n"
24363           "movapd %%xmm12, %%xmm4\n"
24364           "movapd %%xmm12, %%xmm6\n"
24365           "addpd %%xmm14, %%xmm4\n"
24366           "subpd %%xmm14, %%xmm6\n"
24367           "movapd %%xmm13, %%xmm5\n"
24368           "movapd %%xmm13, %%xmm7\n"
24369           "addpd %%xmm15, %%xmm5\n"
24370           "subpd %%xmm15, %%xmm7\n"
24371           "movapd %%xmm0, %%xmm8\n"
24372           "movapd %%xmm0, %%xmm12\n"
24373           "addpd %%xmm4, %%xmm8\n"
24374           "subpd %%xmm4, %%xmm12\n"
24375           "movapd %%xmm1, %%xmm9\n"
24376           "movapd %%xmm1, %%xmm13\n"
24377           "addpd %%xmm5, %%xmm9\n"
24378           "subpd %%xmm5, %%xmm13\n"
24379           "movapd %%xmm2, %%xmm10\n"
24380           "movapd %%xmm2, %%xmm14\n"
24381           "addpd %%xmm6, %%xmm10\n"
24382           "subpd %%xmm6, %%xmm14\n"
24383           "movapd %%xmm3, %%xmm11\n"
24384           "movapd %%xmm3, %%xmm15\n"
24385           "addpd %%xmm7, %%xmm11\n"
24386           "subpd %%xmm7, %%xmm15\n"
24387           "movupd %%xmm8, (%0)\n"
24388           "movupd %%xmm9, (%1)\n"
24389           "movupd %%xmm10, (%2)\n"
24390           "movupd %%xmm11, (%3)\n"
24391           "movupd %%xmm12, (%4)\n"
24392           "movupd %%xmm13, (%5)\n"
24393           "movupd %%xmm14, (%6)\n"
24394           "movupd %%xmm15, (%7)\n"
24395           :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
24396         );
24397       }
24398     }
24399     return;
24400   }
24401   if (depth == 26) {
24402     helper_double_28_recursive(buf + 0, 23);
24403     helper_double_28_recursive(buf + 8388608, 23);
24404     helper_double_28_recursive(buf + 16777216, 23);
24405     helper_double_28_recursive(buf + 25165824, 23);
24406     helper_double_28_recursive(buf + 33554432, 23);
24407     helper_double_28_recursive(buf + 41943040, 23);
24408     helper_double_28_recursive(buf + 50331648, 23);
24409     helper_double_28_recursive(buf + 58720256, 23);
24410     for (int j = 0; j < 67108864; j += 67108864) {
24411       for (int k = 0; k < 8388608; k += 2) {
24412         __asm__ volatile (
24413           "movupd (%0), %%xmm0\n"
24414           "movupd (%1), %%xmm1\n"
24415           "movupd (%2), %%xmm2\n"
24416           "movupd (%3), %%xmm3\n"
24417           "movupd (%4), %%xmm4\n"
24418           "movupd (%5), %%xmm5\n"
24419           "movupd (%6), %%xmm6\n"
24420           "movupd (%7), %%xmm7\n"
24421           "movapd %%xmm0, %%xmm8\n"
24422           "movapd %%xmm0, %%xmm9\n"
24423           "addpd %%xmm1, %%xmm8\n"
24424           "subpd %%xmm1, %%xmm9\n"
24425           "movapd %%xmm2, %%xmm10\n"
24426           "movapd %%xmm2, %%xmm11\n"
24427           "addpd %%xmm3, %%xmm10\n"
24428           "subpd %%xmm3, %%xmm11\n"
24429           "movapd %%xmm4, %%xmm12\n"
24430           "movapd %%xmm4, %%xmm13\n"
24431           "addpd %%xmm5, %%xmm12\n"
24432           "subpd %%xmm5, %%xmm13\n"
24433           "movapd %%xmm6, %%xmm14\n"
24434           "movapd %%xmm6, %%xmm15\n"
24435           "addpd %%xmm7, %%xmm14\n"
24436           "subpd %%xmm7, %%xmm15\n"
24437           "movapd %%xmm8, %%xmm0\n"
24438           "movapd %%xmm8, %%xmm2\n"
24439           "addpd %%xmm10, %%xmm0\n"
24440           "subpd %%xmm10, %%xmm2\n"
24441           "movapd %%xmm9, %%xmm1\n"
24442           "movapd %%xmm9, %%xmm3\n"
24443           "addpd %%xmm11, %%xmm1\n"
24444           "subpd %%xmm11, %%xmm3\n"
24445           "movapd %%xmm12, %%xmm4\n"
24446           "movapd %%xmm12, %%xmm6\n"
24447           "addpd %%xmm14, %%xmm4\n"
24448           "subpd %%xmm14, %%xmm6\n"
24449           "movapd %%xmm13, %%xmm5\n"
24450           "movapd %%xmm13, %%xmm7\n"
24451           "addpd %%xmm15, %%xmm5\n"
24452           "subpd %%xmm15, %%xmm7\n"
24453           "movapd %%xmm0, %%xmm8\n"
24454           "movapd %%xmm0, %%xmm12\n"
24455           "addpd %%xmm4, %%xmm8\n"
24456           "subpd %%xmm4, %%xmm12\n"
24457           "movapd %%xmm1, %%xmm9\n"
24458           "movapd %%xmm1, %%xmm13\n"
24459           "addpd %%xmm5, %%xmm9\n"
24460           "subpd %%xmm5, %%xmm13\n"
24461           "movapd %%xmm2, %%xmm10\n"
24462           "movapd %%xmm2, %%xmm14\n"
24463           "addpd %%xmm6, %%xmm10\n"
24464           "subpd %%xmm6, %%xmm14\n"
24465           "movapd %%xmm3, %%xmm11\n"
24466           "movapd %%xmm3, %%xmm15\n"
24467           "addpd %%xmm7, %%xmm11\n"
24468           "subpd %%xmm7, %%xmm15\n"
24469           "movupd %%xmm8, (%0)\n"
24470           "movupd %%xmm9, (%1)\n"
24471           "movupd %%xmm10, (%2)\n"
24472           "movupd %%xmm11, (%3)\n"
24473           "movupd %%xmm12, (%4)\n"
24474           "movupd %%xmm13, (%5)\n"
24475           "movupd %%xmm14, (%6)\n"
24476           "movupd %%xmm15, (%7)\n"
24477           :: "r"(buf + j + k + 0), "r"(buf + j + k + 8388608), "r"(buf + j + k + 16777216), "r"(buf + j + k + 25165824), "r"(buf + j + k + 33554432), "r"(buf + j + k + 41943040), "r"(buf + j + k + 50331648), "r"(buf + j + k + 58720256) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
24478         );
24479       }
24480     }
24481     return;
24482   }
24483   if (depth == 28) {
24484     helper_double_28_recursive(buf + 0, 26);
24485     helper_double_28_recursive(buf + 67108864, 26);
24486     helper_double_28_recursive(buf + 134217728, 26);
24487     helper_double_28_recursive(buf + 201326592, 26);
24488     for (int j = 0; j < 268435456; j += 268435456) {
24489       for (int k = 0; k < 67108864; k += 2) {
24490         __asm__ volatile (
24491           "movupd (%0), %%xmm0\n"
24492           "movupd (%1), %%xmm1\n"
24493           "movupd (%2), %%xmm2\n"
24494           "movupd (%3), %%xmm3\n"
24495           "movapd %%xmm0, %%xmm8\n"
24496           "movapd %%xmm0, %%xmm9\n"
24497           "addpd %%xmm1, %%xmm8\n"
24498           "subpd %%xmm1, %%xmm9\n"
24499           "movapd %%xmm2, %%xmm10\n"
24500           "movapd %%xmm2, %%xmm11\n"
24501           "addpd %%xmm3, %%xmm10\n"
24502           "subpd %%xmm3, %%xmm11\n"
24503           "movapd %%xmm8, %%xmm0\n"
24504           "movapd %%xmm8, %%xmm2\n"
24505           "addpd %%xmm10, %%xmm0\n"
24506           "subpd %%xmm10, %%xmm2\n"
24507           "movapd %%xmm9, %%xmm1\n"
24508           "movapd %%xmm9, %%xmm3\n"
24509           "addpd %%xmm11, %%xmm1\n"
24510           "subpd %%xmm11, %%xmm3\n"
24511           "movupd %%xmm0, (%0)\n"
24512           "movupd %%xmm1, (%1)\n"
24513           "movupd %%xmm2, (%2)\n"
24514           "movupd %%xmm3, (%3)\n"
24515           :: "r"(buf + j + k + 0), "r"(buf + j + k + 67108864), "r"(buf + j + k + 134217728), "r"(buf + j + k + 201326592) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
24516         );
24517       }
24518     }
24519     return;
24520   }
24521 }
void helper_double_28(double *buf);
/* In-place Fast Hadamard Transform over 2^28 doubles.
 * Thin entry point: delegates to the recursive butterfly decomposition,
 * starting at the full depth of 28. buf must hold 268435456 doubles;
 * results overwrite the input. */
void helper_double_28(double *buf) {
  enum { FHT_DOUBLE_28_LOG2_N = 28 };
  helper_double_28_recursive(buf, FHT_DOUBLE_28_LOG2_N);
}
24526 void helper_double_29_recursive(double *buf, int depth);
helper_double_29_recursive(double * buf,int depth)24527 void helper_double_29_recursive(double *buf, int depth) {
24528   if (depth == 9) {
24529     for (int j = 0; j < 512; j += 16) {
24530       for (int k = 0; k < 2; k += 2) {
24531         __asm__ volatile (
24532           "movupd (%0), %%xmm0\n"
24533           "movupd (%1), %%xmm1\n"
24534           "movupd (%2), %%xmm2\n"
24535           "movupd (%3), %%xmm3\n"
24536           "movupd (%4), %%xmm4\n"
24537           "movupd (%5), %%xmm5\n"
24538           "movupd (%6), %%xmm6\n"
24539           "movupd (%7), %%xmm7\n"
24540           "movapd %%xmm0, %%xmm8\n"
24541           "haddpd %%xmm8, %%xmm8\n"
24542           "movapd %%xmm0, %%xmm9\n"
24543           "hsubpd %%xmm9, %%xmm9\n"
24544           "blendpd $1, %%xmm8, %%xmm9\n"
24545           "movapd %%xmm9, %%xmm0\n"
24546           "movapd %%xmm1, %%xmm8\n"
24547           "haddpd %%xmm8, %%xmm8\n"
24548           "movapd %%xmm1, %%xmm9\n"
24549           "hsubpd %%xmm9, %%xmm9\n"
24550           "blendpd $1, %%xmm8, %%xmm9\n"
24551           "movapd %%xmm9, %%xmm1\n"
24552           "movapd %%xmm2, %%xmm8\n"
24553           "haddpd %%xmm8, %%xmm8\n"
24554           "movapd %%xmm2, %%xmm9\n"
24555           "hsubpd %%xmm9, %%xmm9\n"
24556           "blendpd $1, %%xmm8, %%xmm9\n"
24557           "movapd %%xmm9, %%xmm2\n"
24558           "movapd %%xmm3, %%xmm8\n"
24559           "haddpd %%xmm8, %%xmm8\n"
24560           "movapd %%xmm3, %%xmm9\n"
24561           "hsubpd %%xmm9, %%xmm9\n"
24562           "blendpd $1, %%xmm8, %%xmm9\n"
24563           "movapd %%xmm9, %%xmm3\n"
24564           "movapd %%xmm4, %%xmm8\n"
24565           "haddpd %%xmm8, %%xmm8\n"
24566           "movapd %%xmm4, %%xmm9\n"
24567           "hsubpd %%xmm9, %%xmm9\n"
24568           "blendpd $1, %%xmm8, %%xmm9\n"
24569           "movapd %%xmm9, %%xmm4\n"
24570           "movapd %%xmm5, %%xmm8\n"
24571           "haddpd %%xmm8, %%xmm8\n"
24572           "movapd %%xmm5, %%xmm9\n"
24573           "hsubpd %%xmm9, %%xmm9\n"
24574           "blendpd $1, %%xmm8, %%xmm9\n"
24575           "movapd %%xmm9, %%xmm5\n"
24576           "movapd %%xmm6, %%xmm8\n"
24577           "haddpd %%xmm8, %%xmm8\n"
24578           "movapd %%xmm6, %%xmm9\n"
24579           "hsubpd %%xmm9, %%xmm9\n"
24580           "blendpd $1, %%xmm8, %%xmm9\n"
24581           "movapd %%xmm9, %%xmm6\n"
24582           "movapd %%xmm7, %%xmm8\n"
24583           "haddpd %%xmm8, %%xmm8\n"
24584           "movapd %%xmm7, %%xmm9\n"
24585           "hsubpd %%xmm9, %%xmm9\n"
24586           "blendpd $1, %%xmm8, %%xmm9\n"
24587           "movapd %%xmm9, %%xmm7\n"
24588           "movapd %%xmm0, %%xmm8\n"
24589           "movapd %%xmm0, %%xmm9\n"
24590           "addpd %%xmm1, %%xmm8\n"
24591           "subpd %%xmm1, %%xmm9\n"
24592           "movapd %%xmm2, %%xmm10\n"
24593           "movapd %%xmm2, %%xmm11\n"
24594           "addpd %%xmm3, %%xmm10\n"
24595           "subpd %%xmm3, %%xmm11\n"
24596           "movapd %%xmm4, %%xmm12\n"
24597           "movapd %%xmm4, %%xmm13\n"
24598           "addpd %%xmm5, %%xmm12\n"
24599           "subpd %%xmm5, %%xmm13\n"
24600           "movapd %%xmm6, %%xmm14\n"
24601           "movapd %%xmm6, %%xmm15\n"
24602           "addpd %%xmm7, %%xmm14\n"
24603           "subpd %%xmm7, %%xmm15\n"
24604           "movapd %%xmm8, %%xmm0\n"
24605           "movapd %%xmm8, %%xmm2\n"
24606           "addpd %%xmm10, %%xmm0\n"
24607           "subpd %%xmm10, %%xmm2\n"
24608           "movapd %%xmm9, %%xmm1\n"
24609           "movapd %%xmm9, %%xmm3\n"
24610           "addpd %%xmm11, %%xmm1\n"
24611           "subpd %%xmm11, %%xmm3\n"
24612           "movapd %%xmm12, %%xmm4\n"
24613           "movapd %%xmm12, %%xmm6\n"
24614           "addpd %%xmm14, %%xmm4\n"
24615           "subpd %%xmm14, %%xmm6\n"
24616           "movapd %%xmm13, %%xmm5\n"
24617           "movapd %%xmm13, %%xmm7\n"
24618           "addpd %%xmm15, %%xmm5\n"
24619           "subpd %%xmm15, %%xmm7\n"
24620           "movapd %%xmm0, %%xmm8\n"
24621           "movapd %%xmm0, %%xmm12\n"
24622           "addpd %%xmm4, %%xmm8\n"
24623           "subpd %%xmm4, %%xmm12\n"
24624           "movapd %%xmm1, %%xmm9\n"
24625           "movapd %%xmm1, %%xmm13\n"
24626           "addpd %%xmm5, %%xmm9\n"
24627           "subpd %%xmm5, %%xmm13\n"
24628           "movapd %%xmm2, %%xmm10\n"
24629           "movapd %%xmm2, %%xmm14\n"
24630           "addpd %%xmm6, %%xmm10\n"
24631           "subpd %%xmm6, %%xmm14\n"
24632           "movapd %%xmm3, %%xmm11\n"
24633           "movapd %%xmm3, %%xmm15\n"
24634           "addpd %%xmm7, %%xmm11\n"
24635           "subpd %%xmm7, %%xmm15\n"
24636           "movupd %%xmm8, (%0)\n"
24637           "movupd %%xmm9, (%1)\n"
24638           "movupd %%xmm10, (%2)\n"
24639           "movupd %%xmm11, (%3)\n"
24640           "movupd %%xmm12, (%4)\n"
24641           "movupd %%xmm13, (%5)\n"
24642           "movupd %%xmm14, (%6)\n"
24643           "movupd %%xmm15, (%7)\n"
24644           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
24645         );
24646       }
24647     }
24648     for (int j = 0; j < 512; j += 128) {
24649       for (int k = 0; k < 16; k += 2) {
24650         __asm__ volatile (
24651           "movupd (%0), %%xmm0\n"
24652           "movupd (%1), %%xmm1\n"
24653           "movupd (%2), %%xmm2\n"
24654           "movupd (%3), %%xmm3\n"
24655           "movupd (%4), %%xmm4\n"
24656           "movupd (%5), %%xmm5\n"
24657           "movupd (%6), %%xmm6\n"
24658           "movupd (%7), %%xmm7\n"
24659           "movapd %%xmm0, %%xmm8\n"
24660           "movapd %%xmm0, %%xmm9\n"
24661           "addpd %%xmm1, %%xmm8\n"
24662           "subpd %%xmm1, %%xmm9\n"
24663           "movapd %%xmm2, %%xmm10\n"
24664           "movapd %%xmm2, %%xmm11\n"
24665           "addpd %%xmm3, %%xmm10\n"
24666           "subpd %%xmm3, %%xmm11\n"
24667           "movapd %%xmm4, %%xmm12\n"
24668           "movapd %%xmm4, %%xmm13\n"
24669           "addpd %%xmm5, %%xmm12\n"
24670           "subpd %%xmm5, %%xmm13\n"
24671           "movapd %%xmm6, %%xmm14\n"
24672           "movapd %%xmm6, %%xmm15\n"
24673           "addpd %%xmm7, %%xmm14\n"
24674           "subpd %%xmm7, %%xmm15\n"
24675           "movapd %%xmm8, %%xmm0\n"
24676           "movapd %%xmm8, %%xmm2\n"
24677           "addpd %%xmm10, %%xmm0\n"
24678           "subpd %%xmm10, %%xmm2\n"
24679           "movapd %%xmm9, %%xmm1\n"
24680           "movapd %%xmm9, %%xmm3\n"
24681           "addpd %%xmm11, %%xmm1\n"
24682           "subpd %%xmm11, %%xmm3\n"
24683           "movapd %%xmm12, %%xmm4\n"
24684           "movapd %%xmm12, %%xmm6\n"
24685           "addpd %%xmm14, %%xmm4\n"
24686           "subpd %%xmm14, %%xmm6\n"
24687           "movapd %%xmm13, %%xmm5\n"
24688           "movapd %%xmm13, %%xmm7\n"
24689           "addpd %%xmm15, %%xmm5\n"
24690           "subpd %%xmm15, %%xmm7\n"
24691           "movapd %%xmm0, %%xmm8\n"
24692           "movapd %%xmm0, %%xmm12\n"
24693           "addpd %%xmm4, %%xmm8\n"
24694           "subpd %%xmm4, %%xmm12\n"
24695           "movapd %%xmm1, %%xmm9\n"
24696           "movapd %%xmm1, %%xmm13\n"
24697           "addpd %%xmm5, %%xmm9\n"
24698           "subpd %%xmm5, %%xmm13\n"
24699           "movapd %%xmm2, %%xmm10\n"
24700           "movapd %%xmm2, %%xmm14\n"
24701           "addpd %%xmm6, %%xmm10\n"
24702           "subpd %%xmm6, %%xmm14\n"
24703           "movapd %%xmm3, %%xmm11\n"
24704           "movapd %%xmm3, %%xmm15\n"
24705           "addpd %%xmm7, %%xmm11\n"
24706           "subpd %%xmm7, %%xmm15\n"
24707           "movupd %%xmm8, (%0)\n"
24708           "movupd %%xmm9, (%1)\n"
24709           "movupd %%xmm10, (%2)\n"
24710           "movupd %%xmm11, (%3)\n"
24711           "movupd %%xmm12, (%4)\n"
24712           "movupd %%xmm13, (%5)\n"
24713           "movupd %%xmm14, (%6)\n"
24714           "movupd %%xmm15, (%7)\n"
24715           :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
24716         );
24717       }
24718     }
24719     for (int j = 0; j < 512; j += 512) {
24720       for (int k = 0; k < 128; k += 2) {
24721         __asm__ volatile (
24722           "movupd (%0), %%xmm0\n"
24723           "movupd (%1), %%xmm1\n"
24724           "movupd (%2), %%xmm2\n"
24725           "movupd (%3), %%xmm3\n"
24726           "movapd %%xmm0, %%xmm8\n"
24727           "movapd %%xmm0, %%xmm9\n"
24728           "addpd %%xmm1, %%xmm8\n"
24729           "subpd %%xmm1, %%xmm9\n"
24730           "movapd %%xmm2, %%xmm10\n"
24731           "movapd %%xmm2, %%xmm11\n"
24732           "addpd %%xmm3, %%xmm10\n"
24733           "subpd %%xmm3, %%xmm11\n"
24734           "movapd %%xmm8, %%xmm0\n"
24735           "movapd %%xmm8, %%xmm2\n"
24736           "addpd %%xmm10, %%xmm0\n"
24737           "subpd %%xmm10, %%xmm2\n"
24738           "movapd %%xmm9, %%xmm1\n"
24739           "movapd %%xmm9, %%xmm3\n"
24740           "addpd %%xmm11, %%xmm1\n"
24741           "subpd %%xmm11, %%xmm3\n"
24742           "movupd %%xmm0, (%0)\n"
24743           "movupd %%xmm1, (%1)\n"
24744           "movupd %%xmm2, (%2)\n"
24745           "movupd %%xmm3, (%3)\n"
24746           :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
24747         );
24748       }
24749     }
24750     return;
24751   }
24752   if (depth == 12) {
24753     helper_double_29_recursive(buf + 0, 9);
24754     helper_double_29_recursive(buf + 512, 9);
24755     helper_double_29_recursive(buf + 1024, 9);
24756     helper_double_29_recursive(buf + 1536, 9);
24757     helper_double_29_recursive(buf + 2048, 9);
24758     helper_double_29_recursive(buf + 2560, 9);
24759     helper_double_29_recursive(buf + 3072, 9);
24760     helper_double_29_recursive(buf + 3584, 9);
24761     for (int j = 0; j < 4096; j += 4096) {
24762       for (int k = 0; k < 512; k += 2) {
24763         __asm__ volatile (
24764           "movupd (%0), %%xmm0\n"
24765           "movupd (%1), %%xmm1\n"
24766           "movupd (%2), %%xmm2\n"
24767           "movupd (%3), %%xmm3\n"
24768           "movupd (%4), %%xmm4\n"
24769           "movupd (%5), %%xmm5\n"
24770           "movupd (%6), %%xmm6\n"
24771           "movupd (%7), %%xmm7\n"
24772           "movapd %%xmm0, %%xmm8\n"
24773           "movapd %%xmm0, %%xmm9\n"
24774           "addpd %%xmm1, %%xmm8\n"
24775           "subpd %%xmm1, %%xmm9\n"
24776           "movapd %%xmm2, %%xmm10\n"
24777           "movapd %%xmm2, %%xmm11\n"
24778           "addpd %%xmm3, %%xmm10\n"
24779           "subpd %%xmm3, %%xmm11\n"
24780           "movapd %%xmm4, %%xmm12\n"
24781           "movapd %%xmm4, %%xmm13\n"
24782           "addpd %%xmm5, %%xmm12\n"
24783           "subpd %%xmm5, %%xmm13\n"
24784           "movapd %%xmm6, %%xmm14\n"
24785           "movapd %%xmm6, %%xmm15\n"
24786           "addpd %%xmm7, %%xmm14\n"
24787           "subpd %%xmm7, %%xmm15\n"
24788           "movapd %%xmm8, %%xmm0\n"
24789           "movapd %%xmm8, %%xmm2\n"
24790           "addpd %%xmm10, %%xmm0\n"
24791           "subpd %%xmm10, %%xmm2\n"
24792           "movapd %%xmm9, %%xmm1\n"
24793           "movapd %%xmm9, %%xmm3\n"
24794           "addpd %%xmm11, %%xmm1\n"
24795           "subpd %%xmm11, %%xmm3\n"
24796           "movapd %%xmm12, %%xmm4\n"
24797           "movapd %%xmm12, %%xmm6\n"
24798           "addpd %%xmm14, %%xmm4\n"
24799           "subpd %%xmm14, %%xmm6\n"
24800           "movapd %%xmm13, %%xmm5\n"
24801           "movapd %%xmm13, %%xmm7\n"
24802           "addpd %%xmm15, %%xmm5\n"
24803           "subpd %%xmm15, %%xmm7\n"
24804           "movapd %%xmm0, %%xmm8\n"
24805           "movapd %%xmm0, %%xmm12\n"
24806           "addpd %%xmm4, %%xmm8\n"
24807           "subpd %%xmm4, %%xmm12\n"
24808           "movapd %%xmm1, %%xmm9\n"
24809           "movapd %%xmm1, %%xmm13\n"
24810           "addpd %%xmm5, %%xmm9\n"
24811           "subpd %%xmm5, %%xmm13\n"
24812           "movapd %%xmm2, %%xmm10\n"
24813           "movapd %%xmm2, %%xmm14\n"
24814           "addpd %%xmm6, %%xmm10\n"
24815           "subpd %%xmm6, %%xmm14\n"
24816           "movapd %%xmm3, %%xmm11\n"
24817           "movapd %%xmm3, %%xmm15\n"
24818           "addpd %%xmm7, %%xmm11\n"
24819           "subpd %%xmm7, %%xmm15\n"
24820           "movupd %%xmm8, (%0)\n"
24821           "movupd %%xmm9, (%1)\n"
24822           "movupd %%xmm10, (%2)\n"
24823           "movupd %%xmm11, (%3)\n"
24824           "movupd %%xmm12, (%4)\n"
24825           "movupd %%xmm13, (%5)\n"
24826           "movupd %%xmm14, (%6)\n"
24827           "movupd %%xmm15, (%7)\n"
24828           :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
24829         );
24830       }
24831     }
24832     return;
24833   }
24834   if (depth == 15) {
24835     helper_double_29_recursive(buf + 0, 12);
24836     helper_double_29_recursive(buf + 4096, 12);
24837     helper_double_29_recursive(buf + 8192, 12);
24838     helper_double_29_recursive(buf + 12288, 12);
24839     helper_double_29_recursive(buf + 16384, 12);
24840     helper_double_29_recursive(buf + 20480, 12);
24841     helper_double_29_recursive(buf + 24576, 12);
24842     helper_double_29_recursive(buf + 28672, 12);
24843     for (int j = 0; j < 32768; j += 32768) {
24844       for (int k = 0; k < 4096; k += 2) {
24845         __asm__ volatile (
24846           "movupd (%0), %%xmm0\n"
24847           "movupd (%1), %%xmm1\n"
24848           "movupd (%2), %%xmm2\n"
24849           "movupd (%3), %%xmm3\n"
24850           "movupd (%4), %%xmm4\n"
24851           "movupd (%5), %%xmm5\n"
24852           "movupd (%6), %%xmm6\n"
24853           "movupd (%7), %%xmm7\n"
24854           "movapd %%xmm0, %%xmm8\n"
24855           "movapd %%xmm0, %%xmm9\n"
24856           "addpd %%xmm1, %%xmm8\n"
24857           "subpd %%xmm1, %%xmm9\n"
24858           "movapd %%xmm2, %%xmm10\n"
24859           "movapd %%xmm2, %%xmm11\n"
24860           "addpd %%xmm3, %%xmm10\n"
24861           "subpd %%xmm3, %%xmm11\n"
24862           "movapd %%xmm4, %%xmm12\n"
24863           "movapd %%xmm4, %%xmm13\n"
24864           "addpd %%xmm5, %%xmm12\n"
24865           "subpd %%xmm5, %%xmm13\n"
24866           "movapd %%xmm6, %%xmm14\n"
24867           "movapd %%xmm6, %%xmm15\n"
24868           "addpd %%xmm7, %%xmm14\n"
24869           "subpd %%xmm7, %%xmm15\n"
24870           "movapd %%xmm8, %%xmm0\n"
24871           "movapd %%xmm8, %%xmm2\n"
24872           "addpd %%xmm10, %%xmm0\n"
24873           "subpd %%xmm10, %%xmm2\n"
24874           "movapd %%xmm9, %%xmm1\n"
24875           "movapd %%xmm9, %%xmm3\n"
24876           "addpd %%xmm11, %%xmm1\n"
24877           "subpd %%xmm11, %%xmm3\n"
24878           "movapd %%xmm12, %%xmm4\n"
24879           "movapd %%xmm12, %%xmm6\n"
24880           "addpd %%xmm14, %%xmm4\n"
24881           "subpd %%xmm14, %%xmm6\n"
24882           "movapd %%xmm13, %%xmm5\n"
24883           "movapd %%xmm13, %%xmm7\n"
24884           "addpd %%xmm15, %%xmm5\n"
24885           "subpd %%xmm15, %%xmm7\n"
24886           "movapd %%xmm0, %%xmm8\n"
24887           "movapd %%xmm0, %%xmm12\n"
24888           "addpd %%xmm4, %%xmm8\n"
24889           "subpd %%xmm4, %%xmm12\n"
24890           "movapd %%xmm1, %%xmm9\n"
24891           "movapd %%xmm1, %%xmm13\n"
24892           "addpd %%xmm5, %%xmm9\n"
24893           "subpd %%xmm5, %%xmm13\n"
24894           "movapd %%xmm2, %%xmm10\n"
24895           "movapd %%xmm2, %%xmm14\n"
24896           "addpd %%xmm6, %%xmm10\n"
24897           "subpd %%xmm6, %%xmm14\n"
24898           "movapd %%xmm3, %%xmm11\n"
24899           "movapd %%xmm3, %%xmm15\n"
24900           "addpd %%xmm7, %%xmm11\n"
24901           "subpd %%xmm7, %%xmm15\n"
24902           "movupd %%xmm8, (%0)\n"
24903           "movupd %%xmm9, (%1)\n"
24904           "movupd %%xmm10, (%2)\n"
24905           "movupd %%xmm11, (%3)\n"
24906           "movupd %%xmm12, (%4)\n"
24907           "movupd %%xmm13, (%5)\n"
24908           "movupd %%xmm14, (%6)\n"
24909           "movupd %%xmm15, (%7)\n"
24910           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
24911         );
24912       }
24913     }
24914     return;
24915   }
24916   if (depth == 18) {
24917     helper_double_29_recursive(buf + 0, 15);
24918     helper_double_29_recursive(buf + 32768, 15);
24919     helper_double_29_recursive(buf + 65536, 15);
24920     helper_double_29_recursive(buf + 98304, 15);
24921     helper_double_29_recursive(buf + 131072, 15);
24922     helper_double_29_recursive(buf + 163840, 15);
24923     helper_double_29_recursive(buf + 196608, 15);
24924     helper_double_29_recursive(buf + 229376, 15);
24925     for (int j = 0; j < 262144; j += 262144) {
24926       for (int k = 0; k < 32768; k += 2) {
24927         __asm__ volatile (
24928           "movupd (%0), %%xmm0\n"
24929           "movupd (%1), %%xmm1\n"
24930           "movupd (%2), %%xmm2\n"
24931           "movupd (%3), %%xmm3\n"
24932           "movupd (%4), %%xmm4\n"
24933           "movupd (%5), %%xmm5\n"
24934           "movupd (%6), %%xmm6\n"
24935           "movupd (%7), %%xmm7\n"
24936           "movapd %%xmm0, %%xmm8\n"
24937           "movapd %%xmm0, %%xmm9\n"
24938           "addpd %%xmm1, %%xmm8\n"
24939           "subpd %%xmm1, %%xmm9\n"
24940           "movapd %%xmm2, %%xmm10\n"
24941           "movapd %%xmm2, %%xmm11\n"
24942           "addpd %%xmm3, %%xmm10\n"
24943           "subpd %%xmm3, %%xmm11\n"
24944           "movapd %%xmm4, %%xmm12\n"
24945           "movapd %%xmm4, %%xmm13\n"
24946           "addpd %%xmm5, %%xmm12\n"
24947           "subpd %%xmm5, %%xmm13\n"
24948           "movapd %%xmm6, %%xmm14\n"
24949           "movapd %%xmm6, %%xmm15\n"
24950           "addpd %%xmm7, %%xmm14\n"
24951           "subpd %%xmm7, %%xmm15\n"
24952           "movapd %%xmm8, %%xmm0\n"
24953           "movapd %%xmm8, %%xmm2\n"
24954           "addpd %%xmm10, %%xmm0\n"
24955           "subpd %%xmm10, %%xmm2\n"
24956           "movapd %%xmm9, %%xmm1\n"
24957           "movapd %%xmm9, %%xmm3\n"
24958           "addpd %%xmm11, %%xmm1\n"
24959           "subpd %%xmm11, %%xmm3\n"
24960           "movapd %%xmm12, %%xmm4\n"
24961           "movapd %%xmm12, %%xmm6\n"
24962           "addpd %%xmm14, %%xmm4\n"
24963           "subpd %%xmm14, %%xmm6\n"
24964           "movapd %%xmm13, %%xmm5\n"
24965           "movapd %%xmm13, %%xmm7\n"
24966           "addpd %%xmm15, %%xmm5\n"
24967           "subpd %%xmm15, %%xmm7\n"
24968           "movapd %%xmm0, %%xmm8\n"
24969           "movapd %%xmm0, %%xmm12\n"
24970           "addpd %%xmm4, %%xmm8\n"
24971           "subpd %%xmm4, %%xmm12\n"
24972           "movapd %%xmm1, %%xmm9\n"
24973           "movapd %%xmm1, %%xmm13\n"
24974           "addpd %%xmm5, %%xmm9\n"
24975           "subpd %%xmm5, %%xmm13\n"
24976           "movapd %%xmm2, %%xmm10\n"
24977           "movapd %%xmm2, %%xmm14\n"
24978           "addpd %%xmm6, %%xmm10\n"
24979           "subpd %%xmm6, %%xmm14\n"
24980           "movapd %%xmm3, %%xmm11\n"
24981           "movapd %%xmm3, %%xmm15\n"
24982           "addpd %%xmm7, %%xmm11\n"
24983           "subpd %%xmm7, %%xmm15\n"
24984           "movupd %%xmm8, (%0)\n"
24985           "movupd %%xmm9, (%1)\n"
24986           "movupd %%xmm10, (%2)\n"
24987           "movupd %%xmm11, (%3)\n"
24988           "movupd %%xmm12, (%4)\n"
24989           "movupd %%xmm13, (%5)\n"
24990           "movupd %%xmm14, (%6)\n"
24991           "movupd %%xmm15, (%7)\n"
24992           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
24993         );
24994       }
24995     }
24996     return;
24997   }
24998   if (depth == 21) {
24999     helper_double_29_recursive(buf + 0, 18);
25000     helper_double_29_recursive(buf + 262144, 18);
25001     helper_double_29_recursive(buf + 524288, 18);
25002     helper_double_29_recursive(buf + 786432, 18);
25003     helper_double_29_recursive(buf + 1048576, 18);
25004     helper_double_29_recursive(buf + 1310720, 18);
25005     helper_double_29_recursive(buf + 1572864, 18);
25006     helper_double_29_recursive(buf + 1835008, 18);
25007     for (int j = 0; j < 2097152; j += 2097152) {
25008       for (int k = 0; k < 262144; k += 2) {
25009         __asm__ volatile (
25010           "movupd (%0), %%xmm0\n"
25011           "movupd (%1), %%xmm1\n"
25012           "movupd (%2), %%xmm2\n"
25013           "movupd (%3), %%xmm3\n"
25014           "movupd (%4), %%xmm4\n"
25015           "movupd (%5), %%xmm5\n"
25016           "movupd (%6), %%xmm6\n"
25017           "movupd (%7), %%xmm7\n"
25018           "movapd %%xmm0, %%xmm8\n"
25019           "movapd %%xmm0, %%xmm9\n"
25020           "addpd %%xmm1, %%xmm8\n"
25021           "subpd %%xmm1, %%xmm9\n"
25022           "movapd %%xmm2, %%xmm10\n"
25023           "movapd %%xmm2, %%xmm11\n"
25024           "addpd %%xmm3, %%xmm10\n"
25025           "subpd %%xmm3, %%xmm11\n"
25026           "movapd %%xmm4, %%xmm12\n"
25027           "movapd %%xmm4, %%xmm13\n"
25028           "addpd %%xmm5, %%xmm12\n"
25029           "subpd %%xmm5, %%xmm13\n"
25030           "movapd %%xmm6, %%xmm14\n"
25031           "movapd %%xmm6, %%xmm15\n"
25032           "addpd %%xmm7, %%xmm14\n"
25033           "subpd %%xmm7, %%xmm15\n"
25034           "movapd %%xmm8, %%xmm0\n"
25035           "movapd %%xmm8, %%xmm2\n"
25036           "addpd %%xmm10, %%xmm0\n"
25037           "subpd %%xmm10, %%xmm2\n"
25038           "movapd %%xmm9, %%xmm1\n"
25039           "movapd %%xmm9, %%xmm3\n"
25040           "addpd %%xmm11, %%xmm1\n"
25041           "subpd %%xmm11, %%xmm3\n"
25042           "movapd %%xmm12, %%xmm4\n"
25043           "movapd %%xmm12, %%xmm6\n"
25044           "addpd %%xmm14, %%xmm4\n"
25045           "subpd %%xmm14, %%xmm6\n"
25046           "movapd %%xmm13, %%xmm5\n"
25047           "movapd %%xmm13, %%xmm7\n"
25048           "addpd %%xmm15, %%xmm5\n"
25049           "subpd %%xmm15, %%xmm7\n"
25050           "movapd %%xmm0, %%xmm8\n"
25051           "movapd %%xmm0, %%xmm12\n"
25052           "addpd %%xmm4, %%xmm8\n"
25053           "subpd %%xmm4, %%xmm12\n"
25054           "movapd %%xmm1, %%xmm9\n"
25055           "movapd %%xmm1, %%xmm13\n"
25056           "addpd %%xmm5, %%xmm9\n"
25057           "subpd %%xmm5, %%xmm13\n"
25058           "movapd %%xmm2, %%xmm10\n"
25059           "movapd %%xmm2, %%xmm14\n"
25060           "addpd %%xmm6, %%xmm10\n"
25061           "subpd %%xmm6, %%xmm14\n"
25062           "movapd %%xmm3, %%xmm11\n"
25063           "movapd %%xmm3, %%xmm15\n"
25064           "addpd %%xmm7, %%xmm11\n"
25065           "subpd %%xmm7, %%xmm15\n"
25066           "movupd %%xmm8, (%0)\n"
25067           "movupd %%xmm9, (%1)\n"
25068           "movupd %%xmm10, (%2)\n"
25069           "movupd %%xmm11, (%3)\n"
25070           "movupd %%xmm12, (%4)\n"
25071           "movupd %%xmm13, (%5)\n"
25072           "movupd %%xmm14, (%6)\n"
25073           "movupd %%xmm15, (%7)\n"
25074           :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
25075         );
25076       }
25077     }
25078     return;
25079   }
25080   if (depth == 24) {
25081     helper_double_29_recursive(buf + 0, 21);
25082     helper_double_29_recursive(buf + 2097152, 21);
25083     helper_double_29_recursive(buf + 4194304, 21);
25084     helper_double_29_recursive(buf + 6291456, 21);
25085     helper_double_29_recursive(buf + 8388608, 21);
25086     helper_double_29_recursive(buf + 10485760, 21);
25087     helper_double_29_recursive(buf + 12582912, 21);
25088     helper_double_29_recursive(buf + 14680064, 21);
25089     for (int j = 0; j < 16777216; j += 16777216) {
25090       for (int k = 0; k < 2097152; k += 2) {
25091         __asm__ volatile (
25092           "movupd (%0), %%xmm0\n"
25093           "movupd (%1), %%xmm1\n"
25094           "movupd (%2), %%xmm2\n"
25095           "movupd (%3), %%xmm3\n"
25096           "movupd (%4), %%xmm4\n"
25097           "movupd (%5), %%xmm5\n"
25098           "movupd (%6), %%xmm6\n"
25099           "movupd (%7), %%xmm7\n"
25100           "movapd %%xmm0, %%xmm8\n"
25101           "movapd %%xmm0, %%xmm9\n"
25102           "addpd %%xmm1, %%xmm8\n"
25103           "subpd %%xmm1, %%xmm9\n"
25104           "movapd %%xmm2, %%xmm10\n"
25105           "movapd %%xmm2, %%xmm11\n"
25106           "addpd %%xmm3, %%xmm10\n"
25107           "subpd %%xmm3, %%xmm11\n"
25108           "movapd %%xmm4, %%xmm12\n"
25109           "movapd %%xmm4, %%xmm13\n"
25110           "addpd %%xmm5, %%xmm12\n"
25111           "subpd %%xmm5, %%xmm13\n"
25112           "movapd %%xmm6, %%xmm14\n"
25113           "movapd %%xmm6, %%xmm15\n"
25114           "addpd %%xmm7, %%xmm14\n"
25115           "subpd %%xmm7, %%xmm15\n"
25116           "movapd %%xmm8, %%xmm0\n"
25117           "movapd %%xmm8, %%xmm2\n"
25118           "addpd %%xmm10, %%xmm0\n"
25119           "subpd %%xmm10, %%xmm2\n"
25120           "movapd %%xmm9, %%xmm1\n"
25121           "movapd %%xmm9, %%xmm3\n"
25122           "addpd %%xmm11, %%xmm1\n"
25123           "subpd %%xmm11, %%xmm3\n"
25124           "movapd %%xmm12, %%xmm4\n"
25125           "movapd %%xmm12, %%xmm6\n"
25126           "addpd %%xmm14, %%xmm4\n"
25127           "subpd %%xmm14, %%xmm6\n"
25128           "movapd %%xmm13, %%xmm5\n"
25129           "movapd %%xmm13, %%xmm7\n"
25130           "addpd %%xmm15, %%xmm5\n"
25131           "subpd %%xmm15, %%xmm7\n"
25132           "movapd %%xmm0, %%xmm8\n"
25133           "movapd %%xmm0, %%xmm12\n"
25134           "addpd %%xmm4, %%xmm8\n"
25135           "subpd %%xmm4, %%xmm12\n"
25136           "movapd %%xmm1, %%xmm9\n"
25137           "movapd %%xmm1, %%xmm13\n"
25138           "addpd %%xmm5, %%xmm9\n"
25139           "subpd %%xmm5, %%xmm13\n"
25140           "movapd %%xmm2, %%xmm10\n"
25141           "movapd %%xmm2, %%xmm14\n"
25142           "addpd %%xmm6, %%xmm10\n"
25143           "subpd %%xmm6, %%xmm14\n"
25144           "movapd %%xmm3, %%xmm11\n"
25145           "movapd %%xmm3, %%xmm15\n"
25146           "addpd %%xmm7, %%xmm11\n"
25147           "subpd %%xmm7, %%xmm15\n"
25148           "movupd %%xmm8, (%0)\n"
25149           "movupd %%xmm9, (%1)\n"
25150           "movupd %%xmm10, (%2)\n"
25151           "movupd %%xmm11, (%3)\n"
25152           "movupd %%xmm12, (%4)\n"
25153           "movupd %%xmm13, (%5)\n"
25154           "movupd %%xmm14, (%6)\n"
25155           "movupd %%xmm15, (%7)\n"
25156           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
25157         );
25158       }
25159     }
25160     return;
25161   }
25162   if (depth == 27) {
25163     helper_double_29_recursive(buf + 0, 24);
25164     helper_double_29_recursive(buf + 16777216, 24);
25165     helper_double_29_recursive(buf + 33554432, 24);
25166     helper_double_29_recursive(buf + 50331648, 24);
25167     helper_double_29_recursive(buf + 67108864, 24);
25168     helper_double_29_recursive(buf + 83886080, 24);
25169     helper_double_29_recursive(buf + 100663296, 24);
25170     helper_double_29_recursive(buf + 117440512, 24);
25171     for (int j = 0; j < 134217728; j += 134217728) {
25172       for (int k = 0; k < 16777216; k += 2) {
25173         __asm__ volatile (
25174           "movupd (%0), %%xmm0\n"
25175           "movupd (%1), %%xmm1\n"
25176           "movupd (%2), %%xmm2\n"
25177           "movupd (%3), %%xmm3\n"
25178           "movupd (%4), %%xmm4\n"
25179           "movupd (%5), %%xmm5\n"
25180           "movupd (%6), %%xmm6\n"
25181           "movupd (%7), %%xmm7\n"
25182           "movapd %%xmm0, %%xmm8\n"
25183           "movapd %%xmm0, %%xmm9\n"
25184           "addpd %%xmm1, %%xmm8\n"
25185           "subpd %%xmm1, %%xmm9\n"
25186           "movapd %%xmm2, %%xmm10\n"
25187           "movapd %%xmm2, %%xmm11\n"
25188           "addpd %%xmm3, %%xmm10\n"
25189           "subpd %%xmm3, %%xmm11\n"
25190           "movapd %%xmm4, %%xmm12\n"
25191           "movapd %%xmm4, %%xmm13\n"
25192           "addpd %%xmm5, %%xmm12\n"
25193           "subpd %%xmm5, %%xmm13\n"
25194           "movapd %%xmm6, %%xmm14\n"
25195           "movapd %%xmm6, %%xmm15\n"
25196           "addpd %%xmm7, %%xmm14\n"
25197           "subpd %%xmm7, %%xmm15\n"
25198           "movapd %%xmm8, %%xmm0\n"
25199           "movapd %%xmm8, %%xmm2\n"
25200           "addpd %%xmm10, %%xmm0\n"
25201           "subpd %%xmm10, %%xmm2\n"
25202           "movapd %%xmm9, %%xmm1\n"
25203           "movapd %%xmm9, %%xmm3\n"
25204           "addpd %%xmm11, %%xmm1\n"
25205           "subpd %%xmm11, %%xmm3\n"
25206           "movapd %%xmm12, %%xmm4\n"
25207           "movapd %%xmm12, %%xmm6\n"
25208           "addpd %%xmm14, %%xmm4\n"
25209           "subpd %%xmm14, %%xmm6\n"
25210           "movapd %%xmm13, %%xmm5\n"
25211           "movapd %%xmm13, %%xmm7\n"
25212           "addpd %%xmm15, %%xmm5\n"
25213           "subpd %%xmm15, %%xmm7\n"
25214           "movapd %%xmm0, %%xmm8\n"
25215           "movapd %%xmm0, %%xmm12\n"
25216           "addpd %%xmm4, %%xmm8\n"
25217           "subpd %%xmm4, %%xmm12\n"
25218           "movapd %%xmm1, %%xmm9\n"
25219           "movapd %%xmm1, %%xmm13\n"
25220           "addpd %%xmm5, %%xmm9\n"
25221           "subpd %%xmm5, %%xmm13\n"
25222           "movapd %%xmm2, %%xmm10\n"
25223           "movapd %%xmm2, %%xmm14\n"
25224           "addpd %%xmm6, %%xmm10\n"
25225           "subpd %%xmm6, %%xmm14\n"
25226           "movapd %%xmm3, %%xmm11\n"
25227           "movapd %%xmm3, %%xmm15\n"
25228           "addpd %%xmm7, %%xmm11\n"
25229           "subpd %%xmm7, %%xmm15\n"
25230           "movupd %%xmm8, (%0)\n"
25231           "movupd %%xmm9, (%1)\n"
25232           "movupd %%xmm10, (%2)\n"
25233           "movupd %%xmm11, (%3)\n"
25234           "movupd %%xmm12, (%4)\n"
25235           "movupd %%xmm13, (%5)\n"
25236           "movupd %%xmm14, (%6)\n"
25237           "movupd %%xmm15, (%7)\n"
25238           :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
25239         );
25240       }
25241     }
25242     return;
25243   }
25244   if (depth == 29) {
25245     helper_double_29_recursive(buf + 0, 27);
25246     helper_double_29_recursive(buf + 134217728, 27);
25247     helper_double_29_recursive(buf + 268435456, 27);
25248     helper_double_29_recursive(buf + 402653184, 27);
25249     for (int j = 0; j < 536870912; j += 536870912) {
25250       for (int k = 0; k < 134217728; k += 2) {
25251         __asm__ volatile (
25252           "movupd (%0), %%xmm0\n"
25253           "movupd (%1), %%xmm1\n"
25254           "movupd (%2), %%xmm2\n"
25255           "movupd (%3), %%xmm3\n"
25256           "movapd %%xmm0, %%xmm8\n"
25257           "movapd %%xmm0, %%xmm9\n"
25258           "addpd %%xmm1, %%xmm8\n"
25259           "subpd %%xmm1, %%xmm9\n"
25260           "movapd %%xmm2, %%xmm10\n"
25261           "movapd %%xmm2, %%xmm11\n"
25262           "addpd %%xmm3, %%xmm10\n"
25263           "subpd %%xmm3, %%xmm11\n"
25264           "movapd %%xmm8, %%xmm0\n"
25265           "movapd %%xmm8, %%xmm2\n"
25266           "addpd %%xmm10, %%xmm0\n"
25267           "subpd %%xmm10, %%xmm2\n"
25268           "movapd %%xmm9, %%xmm1\n"
25269           "movapd %%xmm9, %%xmm3\n"
25270           "addpd %%xmm11, %%xmm1\n"
25271           "subpd %%xmm11, %%xmm3\n"
25272           "movupd %%xmm0, (%0)\n"
25273           "movupd %%xmm1, (%1)\n"
25274           "movupd %%xmm2, (%2)\n"
25275           "movupd %%xmm3, (%3)\n"
25276           :: "r"(buf + j + k + 0), "r"(buf + j + k + 134217728), "r"(buf + j + k + 268435456), "r"(buf + j + k + 402653184) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
25277         );
25278       }
25279     }
25280     return;
25281   }
25282 }
void helper_double_29(double *buf);
/*
 * Public entry point for the size-2^29 double-precision Hadamard transform.
 * Delegates to the depth-dispatched recursive kernel, starting at the top
 * level; buf must hold 2^29 doubles.
 */
void helper_double_29(double *buf) {
  const int top_depth = 29;  /* log2 of the transform length */
  helper_double_29_recursive(buf, top_depth);
}
25287 void helper_double_30_recursive(double *buf, int depth);
helper_double_30_recursive(double * buf,int depth)25288 void helper_double_30_recursive(double *buf, int depth) {
25289   if (depth == 3) {
25290     for (int j = 0; j < 8; j += 8) {
25291       for (int k = 0; k < 2; k += 2) {
25292         __asm__ volatile (
25293           "movupd (%0), %%xmm0\n"
25294           "movupd (%1), %%xmm1\n"
25295           "movupd (%2), %%xmm2\n"
25296           "movupd (%3), %%xmm3\n"
25297           "movapd %%xmm0, %%xmm8\n"
25298           "haddpd %%xmm8, %%xmm8\n"
25299           "movapd %%xmm0, %%xmm9\n"
25300           "hsubpd %%xmm9, %%xmm9\n"
25301           "blendpd $1, %%xmm8, %%xmm9\n"
25302           "movapd %%xmm9, %%xmm0\n"
25303           "movapd %%xmm1, %%xmm8\n"
25304           "haddpd %%xmm8, %%xmm8\n"
25305           "movapd %%xmm1, %%xmm9\n"
25306           "hsubpd %%xmm9, %%xmm9\n"
25307           "blendpd $1, %%xmm8, %%xmm9\n"
25308           "movapd %%xmm9, %%xmm1\n"
25309           "movapd %%xmm2, %%xmm8\n"
25310           "haddpd %%xmm8, %%xmm8\n"
25311           "movapd %%xmm2, %%xmm9\n"
25312           "hsubpd %%xmm9, %%xmm9\n"
25313           "blendpd $1, %%xmm8, %%xmm9\n"
25314           "movapd %%xmm9, %%xmm2\n"
25315           "movapd %%xmm3, %%xmm8\n"
25316           "haddpd %%xmm8, %%xmm8\n"
25317           "movapd %%xmm3, %%xmm9\n"
25318           "hsubpd %%xmm9, %%xmm9\n"
25319           "blendpd $1, %%xmm8, %%xmm9\n"
25320           "movapd %%xmm9, %%xmm3\n"
25321           "movapd %%xmm0, %%xmm8\n"
25322           "movapd %%xmm0, %%xmm9\n"
25323           "addpd %%xmm1, %%xmm8\n"
25324           "subpd %%xmm1, %%xmm9\n"
25325           "movapd %%xmm2, %%xmm10\n"
25326           "movapd %%xmm2, %%xmm11\n"
25327           "addpd %%xmm3, %%xmm10\n"
25328           "subpd %%xmm3, %%xmm11\n"
25329           "movapd %%xmm8, %%xmm0\n"
25330           "movapd %%xmm8, %%xmm2\n"
25331           "addpd %%xmm10, %%xmm0\n"
25332           "subpd %%xmm10, %%xmm2\n"
25333           "movapd %%xmm9, %%xmm1\n"
25334           "movapd %%xmm9, %%xmm3\n"
25335           "addpd %%xmm11, %%xmm1\n"
25336           "subpd %%xmm11, %%xmm3\n"
25337           "movupd %%xmm0, (%0)\n"
25338           "movupd %%xmm1, (%1)\n"
25339           "movupd %%xmm2, (%2)\n"
25340           "movupd %%xmm3, (%3)\n"
25341           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
25342         );
25343       }
25344     }
25345     return;
25346   }
25347   if (depth == 6) {
25348     helper_double_30_recursive(buf + 0, 3);
25349     helper_double_30_recursive(buf + 8, 3);
25350     helper_double_30_recursive(buf + 16, 3);
25351     helper_double_30_recursive(buf + 24, 3);
25352     helper_double_30_recursive(buf + 32, 3);
25353     helper_double_30_recursive(buf + 40, 3);
25354     helper_double_30_recursive(buf + 48, 3);
25355     helper_double_30_recursive(buf + 56, 3);
25356     for (int j = 0; j < 64; j += 64) {
25357       for (int k = 0; k < 8; k += 2) {
25358         __asm__ volatile (
25359           "movupd (%0), %%xmm0\n"
25360           "movupd (%1), %%xmm1\n"
25361           "movupd (%2), %%xmm2\n"
25362           "movupd (%3), %%xmm3\n"
25363           "movupd (%4), %%xmm4\n"
25364           "movupd (%5), %%xmm5\n"
25365           "movupd (%6), %%xmm6\n"
25366           "movupd (%7), %%xmm7\n"
25367           "movapd %%xmm0, %%xmm8\n"
25368           "movapd %%xmm0, %%xmm9\n"
25369           "addpd %%xmm1, %%xmm8\n"
25370           "subpd %%xmm1, %%xmm9\n"
25371           "movapd %%xmm2, %%xmm10\n"
25372           "movapd %%xmm2, %%xmm11\n"
25373           "addpd %%xmm3, %%xmm10\n"
25374           "subpd %%xmm3, %%xmm11\n"
25375           "movapd %%xmm4, %%xmm12\n"
25376           "movapd %%xmm4, %%xmm13\n"
25377           "addpd %%xmm5, %%xmm12\n"
25378           "subpd %%xmm5, %%xmm13\n"
25379           "movapd %%xmm6, %%xmm14\n"
25380           "movapd %%xmm6, %%xmm15\n"
25381           "addpd %%xmm7, %%xmm14\n"
25382           "subpd %%xmm7, %%xmm15\n"
25383           "movapd %%xmm8, %%xmm0\n"
25384           "movapd %%xmm8, %%xmm2\n"
25385           "addpd %%xmm10, %%xmm0\n"
25386           "subpd %%xmm10, %%xmm2\n"
25387           "movapd %%xmm9, %%xmm1\n"
25388           "movapd %%xmm9, %%xmm3\n"
25389           "addpd %%xmm11, %%xmm1\n"
25390           "subpd %%xmm11, %%xmm3\n"
25391           "movapd %%xmm12, %%xmm4\n"
25392           "movapd %%xmm12, %%xmm6\n"
25393           "addpd %%xmm14, %%xmm4\n"
25394           "subpd %%xmm14, %%xmm6\n"
25395           "movapd %%xmm13, %%xmm5\n"
25396           "movapd %%xmm13, %%xmm7\n"
25397           "addpd %%xmm15, %%xmm5\n"
25398           "subpd %%xmm15, %%xmm7\n"
25399           "movapd %%xmm0, %%xmm8\n"
25400           "movapd %%xmm0, %%xmm12\n"
25401           "addpd %%xmm4, %%xmm8\n"
25402           "subpd %%xmm4, %%xmm12\n"
25403           "movapd %%xmm1, %%xmm9\n"
25404           "movapd %%xmm1, %%xmm13\n"
25405           "addpd %%xmm5, %%xmm9\n"
25406           "subpd %%xmm5, %%xmm13\n"
25407           "movapd %%xmm2, %%xmm10\n"
25408           "movapd %%xmm2, %%xmm14\n"
25409           "addpd %%xmm6, %%xmm10\n"
25410           "subpd %%xmm6, %%xmm14\n"
25411           "movapd %%xmm3, %%xmm11\n"
25412           "movapd %%xmm3, %%xmm15\n"
25413           "addpd %%xmm7, %%xmm11\n"
25414           "subpd %%xmm7, %%xmm15\n"
25415           "movupd %%xmm8, (%0)\n"
25416           "movupd %%xmm9, (%1)\n"
25417           "movupd %%xmm10, (%2)\n"
25418           "movupd %%xmm11, (%3)\n"
25419           "movupd %%xmm12, (%4)\n"
25420           "movupd %%xmm13, (%5)\n"
25421           "movupd %%xmm14, (%6)\n"
25422           "movupd %%xmm15, (%7)\n"
25423           :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
25424         );
25425       }
25426     }
25427     return;
25428   }
25429   if (depth == 9) {
25430     helper_double_30_recursive(buf + 0, 6);
25431     helper_double_30_recursive(buf + 64, 6);
25432     helper_double_30_recursive(buf + 128, 6);
25433     helper_double_30_recursive(buf + 192, 6);
25434     helper_double_30_recursive(buf + 256, 6);
25435     helper_double_30_recursive(buf + 320, 6);
25436     helper_double_30_recursive(buf + 384, 6);
25437     helper_double_30_recursive(buf + 448, 6);
25438     for (int j = 0; j < 512; j += 512) {
25439       for (int k = 0; k < 64; k += 2) {
25440         __asm__ volatile (
25441           "movupd (%0), %%xmm0\n"
25442           "movupd (%1), %%xmm1\n"
25443           "movupd (%2), %%xmm2\n"
25444           "movupd (%3), %%xmm3\n"
25445           "movupd (%4), %%xmm4\n"
25446           "movupd (%5), %%xmm5\n"
25447           "movupd (%6), %%xmm6\n"
25448           "movupd (%7), %%xmm7\n"
25449           "movapd %%xmm0, %%xmm8\n"
25450           "movapd %%xmm0, %%xmm9\n"
25451           "addpd %%xmm1, %%xmm8\n"
25452           "subpd %%xmm1, %%xmm9\n"
25453           "movapd %%xmm2, %%xmm10\n"
25454           "movapd %%xmm2, %%xmm11\n"
25455           "addpd %%xmm3, %%xmm10\n"
25456           "subpd %%xmm3, %%xmm11\n"
25457           "movapd %%xmm4, %%xmm12\n"
25458           "movapd %%xmm4, %%xmm13\n"
25459           "addpd %%xmm5, %%xmm12\n"
25460           "subpd %%xmm5, %%xmm13\n"
25461           "movapd %%xmm6, %%xmm14\n"
25462           "movapd %%xmm6, %%xmm15\n"
25463           "addpd %%xmm7, %%xmm14\n"
25464           "subpd %%xmm7, %%xmm15\n"
25465           "movapd %%xmm8, %%xmm0\n"
25466           "movapd %%xmm8, %%xmm2\n"
25467           "addpd %%xmm10, %%xmm0\n"
25468           "subpd %%xmm10, %%xmm2\n"
25469           "movapd %%xmm9, %%xmm1\n"
25470           "movapd %%xmm9, %%xmm3\n"
25471           "addpd %%xmm11, %%xmm1\n"
25472           "subpd %%xmm11, %%xmm3\n"
25473           "movapd %%xmm12, %%xmm4\n"
25474           "movapd %%xmm12, %%xmm6\n"
25475           "addpd %%xmm14, %%xmm4\n"
25476           "subpd %%xmm14, %%xmm6\n"
25477           "movapd %%xmm13, %%xmm5\n"
25478           "movapd %%xmm13, %%xmm7\n"
25479           "addpd %%xmm15, %%xmm5\n"
25480           "subpd %%xmm15, %%xmm7\n"
25481           "movapd %%xmm0, %%xmm8\n"
25482           "movapd %%xmm0, %%xmm12\n"
25483           "addpd %%xmm4, %%xmm8\n"
25484           "subpd %%xmm4, %%xmm12\n"
25485           "movapd %%xmm1, %%xmm9\n"
25486           "movapd %%xmm1, %%xmm13\n"
25487           "addpd %%xmm5, %%xmm9\n"
25488           "subpd %%xmm5, %%xmm13\n"
25489           "movapd %%xmm2, %%xmm10\n"
25490           "movapd %%xmm2, %%xmm14\n"
25491           "addpd %%xmm6, %%xmm10\n"
25492           "subpd %%xmm6, %%xmm14\n"
25493           "movapd %%xmm3, %%xmm11\n"
25494           "movapd %%xmm3, %%xmm15\n"
25495           "addpd %%xmm7, %%xmm11\n"
25496           "subpd %%xmm7, %%xmm15\n"
25497           "movupd %%xmm8, (%0)\n"
25498           "movupd %%xmm9, (%1)\n"
25499           "movupd %%xmm10, (%2)\n"
25500           "movupd %%xmm11, (%3)\n"
25501           "movupd %%xmm12, (%4)\n"
25502           "movupd %%xmm13, (%5)\n"
25503           "movupd %%xmm14, (%6)\n"
25504           "movupd %%xmm15, (%7)\n"
25505           :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
25506         );
25507       }
25508     }
25509     return;
25510   }
25511   if (depth == 12) {
25512     helper_double_30_recursive(buf + 0, 9);
25513     helper_double_30_recursive(buf + 512, 9);
25514     helper_double_30_recursive(buf + 1024, 9);
25515     helper_double_30_recursive(buf + 1536, 9);
25516     helper_double_30_recursive(buf + 2048, 9);
25517     helper_double_30_recursive(buf + 2560, 9);
25518     helper_double_30_recursive(buf + 3072, 9);
25519     helper_double_30_recursive(buf + 3584, 9);
25520     for (int j = 0; j < 4096; j += 4096) {
25521       for (int k = 0; k < 512; k += 2) {
25522         __asm__ volatile (
25523           "movupd (%0), %%xmm0\n"
25524           "movupd (%1), %%xmm1\n"
25525           "movupd (%2), %%xmm2\n"
25526           "movupd (%3), %%xmm3\n"
25527           "movupd (%4), %%xmm4\n"
25528           "movupd (%5), %%xmm5\n"
25529           "movupd (%6), %%xmm6\n"
25530           "movupd (%7), %%xmm7\n"
25531           "movapd %%xmm0, %%xmm8\n"
25532           "movapd %%xmm0, %%xmm9\n"
25533           "addpd %%xmm1, %%xmm8\n"
25534           "subpd %%xmm1, %%xmm9\n"
25535           "movapd %%xmm2, %%xmm10\n"
25536           "movapd %%xmm2, %%xmm11\n"
25537           "addpd %%xmm3, %%xmm10\n"
25538           "subpd %%xmm3, %%xmm11\n"
25539           "movapd %%xmm4, %%xmm12\n"
25540           "movapd %%xmm4, %%xmm13\n"
25541           "addpd %%xmm5, %%xmm12\n"
25542           "subpd %%xmm5, %%xmm13\n"
25543           "movapd %%xmm6, %%xmm14\n"
25544           "movapd %%xmm6, %%xmm15\n"
25545           "addpd %%xmm7, %%xmm14\n"
25546           "subpd %%xmm7, %%xmm15\n"
25547           "movapd %%xmm8, %%xmm0\n"
25548           "movapd %%xmm8, %%xmm2\n"
25549           "addpd %%xmm10, %%xmm0\n"
25550           "subpd %%xmm10, %%xmm2\n"
25551           "movapd %%xmm9, %%xmm1\n"
25552           "movapd %%xmm9, %%xmm3\n"
25553           "addpd %%xmm11, %%xmm1\n"
25554           "subpd %%xmm11, %%xmm3\n"
25555           "movapd %%xmm12, %%xmm4\n"
25556           "movapd %%xmm12, %%xmm6\n"
25557           "addpd %%xmm14, %%xmm4\n"
25558           "subpd %%xmm14, %%xmm6\n"
25559           "movapd %%xmm13, %%xmm5\n"
25560           "movapd %%xmm13, %%xmm7\n"
25561           "addpd %%xmm15, %%xmm5\n"
25562           "subpd %%xmm15, %%xmm7\n"
25563           "movapd %%xmm0, %%xmm8\n"
25564           "movapd %%xmm0, %%xmm12\n"
25565           "addpd %%xmm4, %%xmm8\n"
25566           "subpd %%xmm4, %%xmm12\n"
25567           "movapd %%xmm1, %%xmm9\n"
25568           "movapd %%xmm1, %%xmm13\n"
25569           "addpd %%xmm5, %%xmm9\n"
25570           "subpd %%xmm5, %%xmm13\n"
25571           "movapd %%xmm2, %%xmm10\n"
25572           "movapd %%xmm2, %%xmm14\n"
25573           "addpd %%xmm6, %%xmm10\n"
25574           "subpd %%xmm6, %%xmm14\n"
25575           "movapd %%xmm3, %%xmm11\n"
25576           "movapd %%xmm3, %%xmm15\n"
25577           "addpd %%xmm7, %%xmm11\n"
25578           "subpd %%xmm7, %%xmm15\n"
25579           "movupd %%xmm8, (%0)\n"
25580           "movupd %%xmm9, (%1)\n"
25581           "movupd %%xmm10, (%2)\n"
25582           "movupd %%xmm11, (%3)\n"
25583           "movupd %%xmm12, (%4)\n"
25584           "movupd %%xmm13, (%5)\n"
25585           "movupd %%xmm14, (%6)\n"
25586           "movupd %%xmm15, (%7)\n"
25587           :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
25588         );
25589       }
25590     }
25591     return;
25592   }
25593   if (depth == 15) {
25594     helper_double_30_recursive(buf + 0, 12);
25595     helper_double_30_recursive(buf + 4096, 12);
25596     helper_double_30_recursive(buf + 8192, 12);
25597     helper_double_30_recursive(buf + 12288, 12);
25598     helper_double_30_recursive(buf + 16384, 12);
25599     helper_double_30_recursive(buf + 20480, 12);
25600     helper_double_30_recursive(buf + 24576, 12);
25601     helper_double_30_recursive(buf + 28672, 12);
25602     for (int j = 0; j < 32768; j += 32768) {
25603       for (int k = 0; k < 4096; k += 2) {
25604         __asm__ volatile (
25605           "movupd (%0), %%xmm0\n"
25606           "movupd (%1), %%xmm1\n"
25607           "movupd (%2), %%xmm2\n"
25608           "movupd (%3), %%xmm3\n"
25609           "movupd (%4), %%xmm4\n"
25610           "movupd (%5), %%xmm5\n"
25611           "movupd (%6), %%xmm6\n"
25612           "movupd (%7), %%xmm7\n"
25613           "movapd %%xmm0, %%xmm8\n"
25614           "movapd %%xmm0, %%xmm9\n"
25615           "addpd %%xmm1, %%xmm8\n"
25616           "subpd %%xmm1, %%xmm9\n"
25617           "movapd %%xmm2, %%xmm10\n"
25618           "movapd %%xmm2, %%xmm11\n"
25619           "addpd %%xmm3, %%xmm10\n"
25620           "subpd %%xmm3, %%xmm11\n"
25621           "movapd %%xmm4, %%xmm12\n"
25622           "movapd %%xmm4, %%xmm13\n"
25623           "addpd %%xmm5, %%xmm12\n"
25624           "subpd %%xmm5, %%xmm13\n"
25625           "movapd %%xmm6, %%xmm14\n"
25626           "movapd %%xmm6, %%xmm15\n"
25627           "addpd %%xmm7, %%xmm14\n"
25628           "subpd %%xmm7, %%xmm15\n"
25629           "movapd %%xmm8, %%xmm0\n"
25630           "movapd %%xmm8, %%xmm2\n"
25631           "addpd %%xmm10, %%xmm0\n"
25632           "subpd %%xmm10, %%xmm2\n"
25633           "movapd %%xmm9, %%xmm1\n"
25634           "movapd %%xmm9, %%xmm3\n"
25635           "addpd %%xmm11, %%xmm1\n"
25636           "subpd %%xmm11, %%xmm3\n"
25637           "movapd %%xmm12, %%xmm4\n"
25638           "movapd %%xmm12, %%xmm6\n"
25639           "addpd %%xmm14, %%xmm4\n"
25640           "subpd %%xmm14, %%xmm6\n"
25641           "movapd %%xmm13, %%xmm5\n"
25642           "movapd %%xmm13, %%xmm7\n"
25643           "addpd %%xmm15, %%xmm5\n"
25644           "subpd %%xmm15, %%xmm7\n"
25645           "movapd %%xmm0, %%xmm8\n"
25646           "movapd %%xmm0, %%xmm12\n"
25647           "addpd %%xmm4, %%xmm8\n"
25648           "subpd %%xmm4, %%xmm12\n"
25649           "movapd %%xmm1, %%xmm9\n"
25650           "movapd %%xmm1, %%xmm13\n"
25651           "addpd %%xmm5, %%xmm9\n"
25652           "subpd %%xmm5, %%xmm13\n"
25653           "movapd %%xmm2, %%xmm10\n"
25654           "movapd %%xmm2, %%xmm14\n"
25655           "addpd %%xmm6, %%xmm10\n"
25656           "subpd %%xmm6, %%xmm14\n"
25657           "movapd %%xmm3, %%xmm11\n"
25658           "movapd %%xmm3, %%xmm15\n"
25659           "addpd %%xmm7, %%xmm11\n"
25660           "subpd %%xmm7, %%xmm15\n"
25661           "movupd %%xmm8, (%0)\n"
25662           "movupd %%xmm9, (%1)\n"
25663           "movupd %%xmm10, (%2)\n"
25664           "movupd %%xmm11, (%3)\n"
25665           "movupd %%xmm12, (%4)\n"
25666           "movupd %%xmm13, (%5)\n"
25667           "movupd %%xmm14, (%6)\n"
25668           "movupd %%xmm15, (%7)\n"
25669           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
25670         );
25671       }
25672     }
25673     return;
25674   }
25675   if (depth == 18) {
25676     helper_double_30_recursive(buf + 0, 15);
25677     helper_double_30_recursive(buf + 32768, 15);
25678     helper_double_30_recursive(buf + 65536, 15);
25679     helper_double_30_recursive(buf + 98304, 15);
25680     helper_double_30_recursive(buf + 131072, 15);
25681     helper_double_30_recursive(buf + 163840, 15);
25682     helper_double_30_recursive(buf + 196608, 15);
25683     helper_double_30_recursive(buf + 229376, 15);
25684     for (int j = 0; j < 262144; j += 262144) {
25685       for (int k = 0; k < 32768; k += 2) {
25686         __asm__ volatile (
25687           "movupd (%0), %%xmm0\n"
25688           "movupd (%1), %%xmm1\n"
25689           "movupd (%2), %%xmm2\n"
25690           "movupd (%3), %%xmm3\n"
25691           "movupd (%4), %%xmm4\n"
25692           "movupd (%5), %%xmm5\n"
25693           "movupd (%6), %%xmm6\n"
25694           "movupd (%7), %%xmm7\n"
25695           "movapd %%xmm0, %%xmm8\n"
25696           "movapd %%xmm0, %%xmm9\n"
25697           "addpd %%xmm1, %%xmm8\n"
25698           "subpd %%xmm1, %%xmm9\n"
25699           "movapd %%xmm2, %%xmm10\n"
25700           "movapd %%xmm2, %%xmm11\n"
25701           "addpd %%xmm3, %%xmm10\n"
25702           "subpd %%xmm3, %%xmm11\n"
25703           "movapd %%xmm4, %%xmm12\n"
25704           "movapd %%xmm4, %%xmm13\n"
25705           "addpd %%xmm5, %%xmm12\n"
25706           "subpd %%xmm5, %%xmm13\n"
25707           "movapd %%xmm6, %%xmm14\n"
25708           "movapd %%xmm6, %%xmm15\n"
25709           "addpd %%xmm7, %%xmm14\n"
25710           "subpd %%xmm7, %%xmm15\n"
25711           "movapd %%xmm8, %%xmm0\n"
25712           "movapd %%xmm8, %%xmm2\n"
25713           "addpd %%xmm10, %%xmm0\n"
25714           "subpd %%xmm10, %%xmm2\n"
25715           "movapd %%xmm9, %%xmm1\n"
25716           "movapd %%xmm9, %%xmm3\n"
25717           "addpd %%xmm11, %%xmm1\n"
25718           "subpd %%xmm11, %%xmm3\n"
25719           "movapd %%xmm12, %%xmm4\n"
25720           "movapd %%xmm12, %%xmm6\n"
25721           "addpd %%xmm14, %%xmm4\n"
25722           "subpd %%xmm14, %%xmm6\n"
25723           "movapd %%xmm13, %%xmm5\n"
25724           "movapd %%xmm13, %%xmm7\n"
25725           "addpd %%xmm15, %%xmm5\n"
25726           "subpd %%xmm15, %%xmm7\n"
25727           "movapd %%xmm0, %%xmm8\n"
25728           "movapd %%xmm0, %%xmm12\n"
25729           "addpd %%xmm4, %%xmm8\n"
25730           "subpd %%xmm4, %%xmm12\n"
25731           "movapd %%xmm1, %%xmm9\n"
25732           "movapd %%xmm1, %%xmm13\n"
25733           "addpd %%xmm5, %%xmm9\n"
25734           "subpd %%xmm5, %%xmm13\n"
25735           "movapd %%xmm2, %%xmm10\n"
25736           "movapd %%xmm2, %%xmm14\n"
25737           "addpd %%xmm6, %%xmm10\n"
25738           "subpd %%xmm6, %%xmm14\n"
25739           "movapd %%xmm3, %%xmm11\n"
25740           "movapd %%xmm3, %%xmm15\n"
25741           "addpd %%xmm7, %%xmm11\n"
25742           "subpd %%xmm7, %%xmm15\n"
25743           "movupd %%xmm8, (%0)\n"
25744           "movupd %%xmm9, (%1)\n"
25745           "movupd %%xmm10, (%2)\n"
25746           "movupd %%xmm11, (%3)\n"
25747           "movupd %%xmm12, (%4)\n"
25748           "movupd %%xmm13, (%5)\n"
25749           "movupd %%xmm14, (%6)\n"
25750           "movupd %%xmm15, (%7)\n"
25751           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
25752         );
25753       }
25754     }
25755     return;
25756   }
25757   if (depth == 21) {
25758     helper_double_30_recursive(buf + 0, 18);
25759     helper_double_30_recursive(buf + 262144, 18);
25760     helper_double_30_recursive(buf + 524288, 18);
25761     helper_double_30_recursive(buf + 786432, 18);
25762     helper_double_30_recursive(buf + 1048576, 18);
25763     helper_double_30_recursive(buf + 1310720, 18);
25764     helper_double_30_recursive(buf + 1572864, 18);
25765     helper_double_30_recursive(buf + 1835008, 18);
25766     for (int j = 0; j < 2097152; j += 2097152) {
25767       for (int k = 0; k < 262144; k += 2) {
25768         __asm__ volatile (
25769           "movupd (%0), %%xmm0\n"
25770           "movupd (%1), %%xmm1\n"
25771           "movupd (%2), %%xmm2\n"
25772           "movupd (%3), %%xmm3\n"
25773           "movupd (%4), %%xmm4\n"
25774           "movupd (%5), %%xmm5\n"
25775           "movupd (%6), %%xmm6\n"
25776           "movupd (%7), %%xmm7\n"
25777           "movapd %%xmm0, %%xmm8\n"
25778           "movapd %%xmm0, %%xmm9\n"
25779           "addpd %%xmm1, %%xmm8\n"
25780           "subpd %%xmm1, %%xmm9\n"
25781           "movapd %%xmm2, %%xmm10\n"
25782           "movapd %%xmm2, %%xmm11\n"
25783           "addpd %%xmm3, %%xmm10\n"
25784           "subpd %%xmm3, %%xmm11\n"
25785           "movapd %%xmm4, %%xmm12\n"
25786           "movapd %%xmm4, %%xmm13\n"
25787           "addpd %%xmm5, %%xmm12\n"
25788           "subpd %%xmm5, %%xmm13\n"
25789           "movapd %%xmm6, %%xmm14\n"
25790           "movapd %%xmm6, %%xmm15\n"
25791           "addpd %%xmm7, %%xmm14\n"
25792           "subpd %%xmm7, %%xmm15\n"
25793           "movapd %%xmm8, %%xmm0\n"
25794           "movapd %%xmm8, %%xmm2\n"
25795           "addpd %%xmm10, %%xmm0\n"
25796           "subpd %%xmm10, %%xmm2\n"
25797           "movapd %%xmm9, %%xmm1\n"
25798           "movapd %%xmm9, %%xmm3\n"
25799           "addpd %%xmm11, %%xmm1\n"
25800           "subpd %%xmm11, %%xmm3\n"
25801           "movapd %%xmm12, %%xmm4\n"
25802           "movapd %%xmm12, %%xmm6\n"
25803           "addpd %%xmm14, %%xmm4\n"
25804           "subpd %%xmm14, %%xmm6\n"
25805           "movapd %%xmm13, %%xmm5\n"
25806           "movapd %%xmm13, %%xmm7\n"
25807           "addpd %%xmm15, %%xmm5\n"
25808           "subpd %%xmm15, %%xmm7\n"
25809           "movapd %%xmm0, %%xmm8\n"
25810           "movapd %%xmm0, %%xmm12\n"
25811           "addpd %%xmm4, %%xmm8\n"
25812           "subpd %%xmm4, %%xmm12\n"
25813           "movapd %%xmm1, %%xmm9\n"
25814           "movapd %%xmm1, %%xmm13\n"
25815           "addpd %%xmm5, %%xmm9\n"
25816           "subpd %%xmm5, %%xmm13\n"
25817           "movapd %%xmm2, %%xmm10\n"
25818           "movapd %%xmm2, %%xmm14\n"
25819           "addpd %%xmm6, %%xmm10\n"
25820           "subpd %%xmm6, %%xmm14\n"
25821           "movapd %%xmm3, %%xmm11\n"
25822           "movapd %%xmm3, %%xmm15\n"
25823           "addpd %%xmm7, %%xmm11\n"
25824           "subpd %%xmm7, %%xmm15\n"
25825           "movupd %%xmm8, (%0)\n"
25826           "movupd %%xmm9, (%1)\n"
25827           "movupd %%xmm10, (%2)\n"
25828           "movupd %%xmm11, (%3)\n"
25829           "movupd %%xmm12, (%4)\n"
25830           "movupd %%xmm13, (%5)\n"
25831           "movupd %%xmm14, (%6)\n"
25832           "movupd %%xmm15, (%7)\n"
25833           :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
25834         );
25835       }
25836     }
25837     return;
25838   }
25839   if (depth == 24) {
25840     helper_double_30_recursive(buf + 0, 21);
25841     helper_double_30_recursive(buf + 2097152, 21);
25842     helper_double_30_recursive(buf + 4194304, 21);
25843     helper_double_30_recursive(buf + 6291456, 21);
25844     helper_double_30_recursive(buf + 8388608, 21);
25845     helper_double_30_recursive(buf + 10485760, 21);
25846     helper_double_30_recursive(buf + 12582912, 21);
25847     helper_double_30_recursive(buf + 14680064, 21);
25848     for (int j = 0; j < 16777216; j += 16777216) {
25849       for (int k = 0; k < 2097152; k += 2) {
25850         __asm__ volatile (
25851           "movupd (%0), %%xmm0\n"
25852           "movupd (%1), %%xmm1\n"
25853           "movupd (%2), %%xmm2\n"
25854           "movupd (%3), %%xmm3\n"
25855           "movupd (%4), %%xmm4\n"
25856           "movupd (%5), %%xmm5\n"
25857           "movupd (%6), %%xmm6\n"
25858           "movupd (%7), %%xmm7\n"
25859           "movapd %%xmm0, %%xmm8\n"
25860           "movapd %%xmm0, %%xmm9\n"
25861           "addpd %%xmm1, %%xmm8\n"
25862           "subpd %%xmm1, %%xmm9\n"
25863           "movapd %%xmm2, %%xmm10\n"
25864           "movapd %%xmm2, %%xmm11\n"
25865           "addpd %%xmm3, %%xmm10\n"
25866           "subpd %%xmm3, %%xmm11\n"
25867           "movapd %%xmm4, %%xmm12\n"
25868           "movapd %%xmm4, %%xmm13\n"
25869           "addpd %%xmm5, %%xmm12\n"
25870           "subpd %%xmm5, %%xmm13\n"
25871           "movapd %%xmm6, %%xmm14\n"
25872           "movapd %%xmm6, %%xmm15\n"
25873           "addpd %%xmm7, %%xmm14\n"
25874           "subpd %%xmm7, %%xmm15\n"
25875           "movapd %%xmm8, %%xmm0\n"
25876           "movapd %%xmm8, %%xmm2\n"
25877           "addpd %%xmm10, %%xmm0\n"
25878           "subpd %%xmm10, %%xmm2\n"
25879           "movapd %%xmm9, %%xmm1\n"
25880           "movapd %%xmm9, %%xmm3\n"
25881           "addpd %%xmm11, %%xmm1\n"
25882           "subpd %%xmm11, %%xmm3\n"
25883           "movapd %%xmm12, %%xmm4\n"
25884           "movapd %%xmm12, %%xmm6\n"
25885           "addpd %%xmm14, %%xmm4\n"
25886           "subpd %%xmm14, %%xmm6\n"
25887           "movapd %%xmm13, %%xmm5\n"
25888           "movapd %%xmm13, %%xmm7\n"
25889           "addpd %%xmm15, %%xmm5\n"
25890           "subpd %%xmm15, %%xmm7\n"
25891           "movapd %%xmm0, %%xmm8\n"
25892           "movapd %%xmm0, %%xmm12\n"
25893           "addpd %%xmm4, %%xmm8\n"
25894           "subpd %%xmm4, %%xmm12\n"
25895           "movapd %%xmm1, %%xmm9\n"
25896           "movapd %%xmm1, %%xmm13\n"
25897           "addpd %%xmm5, %%xmm9\n"
25898           "subpd %%xmm5, %%xmm13\n"
25899           "movapd %%xmm2, %%xmm10\n"
25900           "movapd %%xmm2, %%xmm14\n"
25901           "addpd %%xmm6, %%xmm10\n"
25902           "subpd %%xmm6, %%xmm14\n"
25903           "movapd %%xmm3, %%xmm11\n"
25904           "movapd %%xmm3, %%xmm15\n"
25905           "addpd %%xmm7, %%xmm11\n"
25906           "subpd %%xmm7, %%xmm15\n"
25907           "movupd %%xmm8, (%0)\n"
25908           "movupd %%xmm9, (%1)\n"
25909           "movupd %%xmm10, (%2)\n"
25910           "movupd %%xmm11, (%3)\n"
25911           "movupd %%xmm12, (%4)\n"
25912           "movupd %%xmm13, (%5)\n"
25913           "movupd %%xmm14, (%6)\n"
25914           "movupd %%xmm15, (%7)\n"
25915           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
25916         );
25917       }
25918     }
25919     return;
25920   }
25921   if (depth == 27) {
25922     helper_double_30_recursive(buf + 0, 24);
25923     helper_double_30_recursive(buf + 16777216, 24);
25924     helper_double_30_recursive(buf + 33554432, 24);
25925     helper_double_30_recursive(buf + 50331648, 24);
25926     helper_double_30_recursive(buf + 67108864, 24);
25927     helper_double_30_recursive(buf + 83886080, 24);
25928     helper_double_30_recursive(buf + 100663296, 24);
25929     helper_double_30_recursive(buf + 117440512, 24);
25930     for (int j = 0; j < 134217728; j += 134217728) {
25931       for (int k = 0; k < 16777216; k += 2) {
25932         __asm__ volatile (
25933           "movupd (%0), %%xmm0\n"
25934           "movupd (%1), %%xmm1\n"
25935           "movupd (%2), %%xmm2\n"
25936           "movupd (%3), %%xmm3\n"
25937           "movupd (%4), %%xmm4\n"
25938           "movupd (%5), %%xmm5\n"
25939           "movupd (%6), %%xmm6\n"
25940           "movupd (%7), %%xmm7\n"
25941           "movapd %%xmm0, %%xmm8\n"
25942           "movapd %%xmm0, %%xmm9\n"
25943           "addpd %%xmm1, %%xmm8\n"
25944           "subpd %%xmm1, %%xmm9\n"
25945           "movapd %%xmm2, %%xmm10\n"
25946           "movapd %%xmm2, %%xmm11\n"
25947           "addpd %%xmm3, %%xmm10\n"
25948           "subpd %%xmm3, %%xmm11\n"
25949           "movapd %%xmm4, %%xmm12\n"
25950           "movapd %%xmm4, %%xmm13\n"
25951           "addpd %%xmm5, %%xmm12\n"
25952           "subpd %%xmm5, %%xmm13\n"
25953           "movapd %%xmm6, %%xmm14\n"
25954           "movapd %%xmm6, %%xmm15\n"
25955           "addpd %%xmm7, %%xmm14\n"
25956           "subpd %%xmm7, %%xmm15\n"
25957           "movapd %%xmm8, %%xmm0\n"
25958           "movapd %%xmm8, %%xmm2\n"
25959           "addpd %%xmm10, %%xmm0\n"
25960           "subpd %%xmm10, %%xmm2\n"
25961           "movapd %%xmm9, %%xmm1\n"
25962           "movapd %%xmm9, %%xmm3\n"
25963           "addpd %%xmm11, %%xmm1\n"
25964           "subpd %%xmm11, %%xmm3\n"
25965           "movapd %%xmm12, %%xmm4\n"
25966           "movapd %%xmm12, %%xmm6\n"
25967           "addpd %%xmm14, %%xmm4\n"
25968           "subpd %%xmm14, %%xmm6\n"
25969           "movapd %%xmm13, %%xmm5\n"
25970           "movapd %%xmm13, %%xmm7\n"
25971           "addpd %%xmm15, %%xmm5\n"
25972           "subpd %%xmm15, %%xmm7\n"
25973           "movapd %%xmm0, %%xmm8\n"
25974           "movapd %%xmm0, %%xmm12\n"
25975           "addpd %%xmm4, %%xmm8\n"
25976           "subpd %%xmm4, %%xmm12\n"
25977           "movapd %%xmm1, %%xmm9\n"
25978           "movapd %%xmm1, %%xmm13\n"
25979           "addpd %%xmm5, %%xmm9\n"
25980           "subpd %%xmm5, %%xmm13\n"
25981           "movapd %%xmm2, %%xmm10\n"
25982           "movapd %%xmm2, %%xmm14\n"
25983           "addpd %%xmm6, %%xmm10\n"
25984           "subpd %%xmm6, %%xmm14\n"
25985           "movapd %%xmm3, %%xmm11\n"
25986           "movapd %%xmm3, %%xmm15\n"
25987           "addpd %%xmm7, %%xmm11\n"
25988           "subpd %%xmm7, %%xmm15\n"
25989           "movupd %%xmm8, (%0)\n"
25990           "movupd %%xmm9, (%1)\n"
25991           "movupd %%xmm10, (%2)\n"
25992           "movupd %%xmm11, (%3)\n"
25993           "movupd %%xmm12, (%4)\n"
25994           "movupd %%xmm13, (%5)\n"
25995           "movupd %%xmm14, (%6)\n"
25996           "movupd %%xmm15, (%7)\n"
25997           :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
25998         );
25999       }
26000     }
26001     return;
26002   }
26003   if (depth == 30) {
26004     helper_double_30_recursive(buf + 0, 27);
26005     helper_double_30_recursive(buf + 134217728, 27);
26006     helper_double_30_recursive(buf + 268435456, 27);
26007     helper_double_30_recursive(buf + 402653184, 27);
26008     helper_double_30_recursive(buf + 536870912, 27);
26009     helper_double_30_recursive(buf + 671088640, 27);
26010     helper_double_30_recursive(buf + 805306368, 27);
26011     helper_double_30_recursive(buf + 939524096, 27);
26012     for (int j = 0; j < 1073741824; j += 1073741824) {
26013       for (int k = 0; k < 134217728; k += 2) {
26014         __asm__ volatile (
26015           "movupd (%0), %%xmm0\n"
26016           "movupd (%1), %%xmm1\n"
26017           "movupd (%2), %%xmm2\n"
26018           "movupd (%3), %%xmm3\n"
26019           "movupd (%4), %%xmm4\n"
26020           "movupd (%5), %%xmm5\n"
26021           "movupd (%6), %%xmm6\n"
26022           "movupd (%7), %%xmm7\n"
26023           "movapd %%xmm0, %%xmm8\n"
26024           "movapd %%xmm0, %%xmm9\n"
26025           "addpd %%xmm1, %%xmm8\n"
26026           "subpd %%xmm1, %%xmm9\n"
26027           "movapd %%xmm2, %%xmm10\n"
26028           "movapd %%xmm2, %%xmm11\n"
26029           "addpd %%xmm3, %%xmm10\n"
26030           "subpd %%xmm3, %%xmm11\n"
26031           "movapd %%xmm4, %%xmm12\n"
26032           "movapd %%xmm4, %%xmm13\n"
26033           "addpd %%xmm5, %%xmm12\n"
26034           "subpd %%xmm5, %%xmm13\n"
26035           "movapd %%xmm6, %%xmm14\n"
26036           "movapd %%xmm6, %%xmm15\n"
26037           "addpd %%xmm7, %%xmm14\n"
26038           "subpd %%xmm7, %%xmm15\n"
26039           "movapd %%xmm8, %%xmm0\n"
26040           "movapd %%xmm8, %%xmm2\n"
26041           "addpd %%xmm10, %%xmm0\n"
26042           "subpd %%xmm10, %%xmm2\n"
26043           "movapd %%xmm9, %%xmm1\n"
26044           "movapd %%xmm9, %%xmm3\n"
26045           "addpd %%xmm11, %%xmm1\n"
26046           "subpd %%xmm11, %%xmm3\n"
26047           "movapd %%xmm12, %%xmm4\n"
26048           "movapd %%xmm12, %%xmm6\n"
26049           "addpd %%xmm14, %%xmm4\n"
26050           "subpd %%xmm14, %%xmm6\n"
26051           "movapd %%xmm13, %%xmm5\n"
26052           "movapd %%xmm13, %%xmm7\n"
26053           "addpd %%xmm15, %%xmm5\n"
26054           "subpd %%xmm15, %%xmm7\n"
26055           "movapd %%xmm0, %%xmm8\n"
26056           "movapd %%xmm0, %%xmm12\n"
26057           "addpd %%xmm4, %%xmm8\n"
26058           "subpd %%xmm4, %%xmm12\n"
26059           "movapd %%xmm1, %%xmm9\n"
26060           "movapd %%xmm1, %%xmm13\n"
26061           "addpd %%xmm5, %%xmm9\n"
26062           "subpd %%xmm5, %%xmm13\n"
26063           "movapd %%xmm2, %%xmm10\n"
26064           "movapd %%xmm2, %%xmm14\n"
26065           "addpd %%xmm6, %%xmm10\n"
26066           "subpd %%xmm6, %%xmm14\n"
26067           "movapd %%xmm3, %%xmm11\n"
26068           "movapd %%xmm3, %%xmm15\n"
26069           "addpd %%xmm7, %%xmm11\n"
26070           "subpd %%xmm7, %%xmm15\n"
26071           "movupd %%xmm8, (%0)\n"
26072           "movupd %%xmm9, (%1)\n"
26073           "movupd %%xmm10, (%2)\n"
26074           "movupd %%xmm11, (%3)\n"
26075           "movupd %%xmm12, (%4)\n"
26076           "movupd %%xmm13, (%5)\n"
26077           "movupd %%xmm14, (%6)\n"
26078           "movupd %%xmm15, (%7)\n"
26079           :: "r"(buf + j + k + 0), "r"(buf + j + k + 134217728), "r"(buf + j + k + 268435456), "r"(buf + j + k + 402653184), "r"(buf + j + k + 536870912), "r"(buf + j + k + 671088640), "r"(buf + j + k + 805306368), "r"(buf + j + k + 939524096) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
26080         );
26081       }
26082     }
26083     return;
26084   }
26085 }
void helper_double_30(double *buf);
/* In-place Hadamard transform of a buffer of 2^30 doubles.
 * Thin entry-point wrapper: delegates to the recursive worker, passing the
 * full problem size (log_n = 30) as the starting recursion depth. */
void helper_double_30(double *buf) {
  helper_double_30_recursive(buf, 30);
}
fht_double(double * buf,int log_n)26090 int fht_double(double *buf, int log_n) {
26091   if (log_n == 0) {
26092     return 0;
26093   }
26094   if (log_n == 1) {
26095     helper_double_1(buf);
26096     return 0;
26097   }
26098   if (log_n == 2) {
26099     helper_double_2(buf);
26100     return 0;
26101   }
26102   if (log_n == 3) {
26103     helper_double_3(buf);
26104     return 0;
26105   }
26106   if (log_n == 4) {
26107     helper_double_4(buf);
26108     return 0;
26109   }
26110   if (log_n == 5) {
26111     helper_double_5(buf);
26112     return 0;
26113   }
26114   if (log_n == 6) {
26115     helper_double_6(buf);
26116     return 0;
26117   }
26118   if (log_n == 7) {
26119     helper_double_7(buf);
26120     return 0;
26121   }
26122   if (log_n == 8) {
26123     helper_double_8(buf);
26124     return 0;
26125   }
26126   if (log_n == 9) {
26127     helper_double_9(buf);
26128     return 0;
26129   }
26130   if (log_n == 10) {
26131     helper_double_10(buf);
26132     return 0;
26133   }
26134   if (log_n == 11) {
26135     helper_double_11(buf);
26136     return 0;
26137   }
26138   if (log_n == 12) {
26139     helper_double_12(buf);
26140     return 0;
26141   }
26142   if (log_n == 13) {
26143     helper_double_13(buf);
26144     return 0;
26145   }
26146   if (log_n == 14) {
26147     helper_double_14(buf);
26148     return 0;
26149   }
26150   if (log_n == 15) {
26151     helper_double_15(buf);
26152     return 0;
26153   }
26154   if (log_n == 16) {
26155     helper_double_16(buf);
26156     return 0;
26157   }
26158   if (log_n == 17) {
26159     helper_double_17(buf);
26160     return 0;
26161   }
26162   if (log_n == 18) {
26163     helper_double_18(buf);
26164     return 0;
26165   }
26166   if (log_n == 19) {
26167     helper_double_19(buf);
26168     return 0;
26169   }
26170   if (log_n == 20) {
26171     helper_double_20(buf);
26172     return 0;
26173   }
26174   if (log_n == 21) {
26175     helper_double_21(buf);
26176     return 0;
26177   }
26178   if (log_n == 22) {
26179     helper_double_22(buf);
26180     return 0;
26181   }
26182   if (log_n == 23) {
26183     helper_double_23(buf);
26184     return 0;
26185   }
26186   if (log_n == 24) {
26187     helper_double_24(buf);
26188     return 0;
26189   }
26190   if (log_n == 25) {
26191     helper_double_25(buf);
26192     return 0;
26193   }
26194   if (log_n == 26) {
26195     helper_double_26(buf);
26196     return 0;
26197   }
26198   if (log_n == 27) {
26199     helper_double_27(buf);
26200     return 0;
26201   }
26202   if (log_n == 28) {
26203     helper_double_28(buf);
26204     return 0;
26205   }
26206   if (log_n == 29) {
26207     helper_double_29(buf);
26208     return 0;
26209   }
26210   if (log_n == 30) {
26211     helper_double_30(buf);
26212     return 0;
26213   }
26214   return 1;
26215 }
26216